In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# for distance and h-clustering
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform

#Kmeans clustering
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics 
from sklearn.decomposition import PCA

# sklearn does have some functionality too, but mostly a wrapper to scipy
from sklearn.metrics import pairwise_distances 
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# One place to point the notebook at the project's dataset folder; the reads
# below resolve against it instead of repeating the absolute path.
from pathlib import Path

DATA_DIR = Path('/Users/maralinetorres/Documents/GitHub/Predicting-Environmental-and-Social-Actions/Datasets')

# NOTE(review): `df` is reassigned by the sector merge in the next cell before
# it is read, and `df_ei` is not referenced in the visible part of the
# notebook -- confirm whether these two loads are still needed.
df = pd.read_csv('df_merged.csv')
df_ei = pd.read_csv(DATA_DIR / 'Environmental_impact_cleaned.csv')
stocks = pd.read_csv(DATA_DIR / 'pilot_stocks.csv')

In [None]:
# Sector lookup keyed by Ticker (presumably one row per ticker -- TODO confirm).
sectors = pd.read_csv("/Users/maralinetorres/Documents/GitHub/Predicting-Environmental-and-Social-Actions/Datasets/52_tickers_sectors.csv")
# Flag rows with no reported Scope 1 emissions (1 = missing) before any imputation.
stocks['Missing_GHG'] = np.where(stocks['GHG Scope 1'].isna(), 1, 0)
# Inner join: tickers that are absent from the sector file are dropped here.
df = pd.merge(stocks, sectors, how='inner',on='Ticker')
df.drop(columns='Name', inplace=True)
# `stocks` is overwritten with the sector-enriched frame, so this cell is not
# idempotent: re-running it merges `sectors` into the already-merged frame.
stocks = df.copy()
# Industry indicator: 1 for the Utilities sector, 0 for every other sector.
stocks['Utility'] = np.where(stocks.Sector == 'Utilities',1,0)
stocks.info()

## Data Cleaning - version 1

In [None]:
# Version-1 working frame: `stocks` minus the log-size, sector and missing-flag
# columns. `drop` returns a new frame, so `stocks` itself is left untouched.
excluded_columns = ['Logarithm_Total_Assets','Logarithm_Total_Sales','Sector','Missing_GHG']
stocks_df = stocks.drop(columns=excluded_columns, errors='ignore')
# PSX and FANG show infinite returns; store them as missing values instead.
stocks_df['Annual_Stock_Return'] = stocks_df['Annual_Stock_Return'].replace([np.inf, -np.inf], np.nan)

In [None]:
stocks_df.describe().T

In [None]:
## Impute null values with that year's industry average
# Only columns that actually contain nulls need imputing.
null_columns = stocks_df.columns[stocks_df.isnull().any()].tolist()

# A "group" is one (Year, Utility) combination, i.e. one industry in one year.
group_keys = ['Year', 'Utility']

for column in null_columns:
    # A mean is undefined for text columns and pointless for the grouping keys.
    if column in group_keys or not pd.api.types.is_numeric_dtype(stocks_df[column]):
        continue
    group_mean = stocks_df.groupby(group_keys)[column].transform('mean')
    # fillna only touches the missing cells; observed values are kept as-is
    # (assigning the transformed column directly would replace every value
    # with its group mean).
    stocks_df[column] = stocks_df[column].fillna(group_mean)

In [None]:
stocks_df.isna().sum()

In [None]:
# Annual_Stock_Return and Change_in_TEC are left out of the clustering features.
stocks_df = stocks_df.drop(columns=['Annual_Stock_Return','Change_in_TEC'], errors='ignore')
stocks_df.head()

In [None]:
# Boolean mask of missing cells, then the rows holding at least one of them.
is_NaN = stocks_df.isna()
row_has_NaN = is_NaN.any(axis='columns')
rows_with_NaN = stocks_df.loc[row_has_NaN]
# Show only the columns responsible for the gaps, plus Year for context.
null_columns1 = [*rows_with_NaN.columns[rows_with_NaN.isna().any()], 'Year']
rows_with_NaN[null_columns1]

In [None]:
# Count the missing cells in each incomplete row. `assign` builds a new frame,
# so nothing is written into a slice of `stocks_df`.
rows_with_NaN = rows_with_NaN.assign(null_values=rows_with_NaN.isnull().sum(axis=1))

# Bar height is seaborn's default estimate (mean) of missing cells per row.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='Year', y='null_values', data=rows_with_NaN, ax=ax)
ax.set(title='Missing values per incomplete row, by year',
       xlabel='Year', ylabel='Missing values per row')
plt.show()

In [None]:
# 2005 and 2006 are missing data in multiple columns, so keep 2007 onwards only.
from_2007 = stocks_df['Year'] >= 2007
stocks_df = stocks_df[from_2007]
stocks_df.info()

## Hierarchical Clustering - version 1 and utility industry

In [None]:
# 1) Standardize the data
# Explicit copy: label columns are added to `df` in later cells, and writing
# into a filtered view of `stocks_df` raises SettingWithCopyWarning.
df = stocks_df[stocks_df.Utility == 1].copy()
stock_number = df.select_dtypes('number')
sc = StandardScaler()
# The scaled frame gets a fresh 0..n-1 index; its row order matches `df`.
stock_scaled = pd.DataFrame(sc.fit_transform(stock_number), columns=stock_number.columns)
stock_scaled

In [None]:
#2) Clustering using euclidean and cosine for distance matrix

dc1 = pdist(stock_scaled.values) #euclidean
dc2 = pdist(stock_scaled.values, metric='cosine')

#See how each linkage method works with the cosine distance matrix
METHODS = ['single', 'complete', 'average', 'ward']
# Ward linkage is only correctly defined for Euclidean distances (see the
# notes in scipy's `linkage` documentation), so it is left out of the cosine
# comparison. METHODS keeps all four entries for the euclidean cell.
COSINE_METHODS = [m for m in METHODS if m != 'ward']

fig, axes = plt.subplots(1, len(COSINE_METHODS), figsize=(20, 5))

for ax, m in zip(axes, COSINE_METHODS):
    ax.set_title(m)
    dendrogram(linkage(dc2, method=m), leaf_rotation=90, ax=ax)
plt.show()

In [None]:
#3) Compare the linkage methods on the euclidean distance matrix
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# One panel per linkage method, titled with the method name.
for ax, method in zip(axes, METHODS):
    ax.set_title(method)
    dendrogram(linkage(dc1, method=method), leaf_rotation=90, ax=ax)

I am going to use cosine distance with average linkage because the clusters start forming lower and there isn't as much height compared to the other methods. The clusters are more tightly grouped and compact.

In [None]:
#4) Create the labels
# Final hierarchy: average linkage on the cosine distance matrix.
hc1 = linkage(dc2, method='average')
plt.title('Dendogram for Cosine and Average')
dendrogram(hc1,
            leaf_rotation= 90)
# Dashed guide line at height 0.85.
# NOTE(review): the labels in the next cell are cut with `maxclust`, not at
# this height -- confirm the two agree.
plt.axhline(linestyle='--', y=.85)
plt.show()

In [None]:
# Cut the tree into at most 3 flat clusters, then list the label values produced.
labels = fcluster(hc1, 3, criterion='maxclust')
np.unique(labels)

In [None]:
# Put the labels into the clean dataset. `labels` is a plain array, so it is
# attached by position; this relies on `df` having the same row order as the
# scaled frame that was clustered.
df['cluster'] = labels
df.head(3) # Review the dataset with the labels

In [None]:
#How many stocks per cluster
df.cluster.value_counts(dropna=False, sort=False)

When using two clusters, the number of observations per cluster is more balanced.

In [None]:
X = stock_scaled.values
# Columns 3 and 4 of the scaled frame are plotted; label the axes with their names.
feature_x, feature_y = stock_scaled.columns[3], stock_scaled.columns[4]
# x/y must be passed by keyword (seaborn >= 0.12 rejects them positionally) and
# hue colours are set with `palette`, not `cmap`. `.values` drops df's filtered
# index so the labels pair with the rows of X by position.
ax = sns.scatterplot(x=X[:, 3], y=X[:, 4], hue=df.cluster.values, palette="rainbow")
ax.set(title='Stock - Hierarchical Clustering',
       xlabel=f'{feature_x} (standardized)', ylabel=f'{feature_y} (standardized)')
plt.show()

### K-Means Clustering - version 1 and utility

In [None]:
#Cluster Evaluation - Deciding how many clusters
X = stock_scaled.values
KRANGE = range(2,10)
sse = []

## loop over and evaluate
for k in KRANGE:
  # Fixed seed so the curve is reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  km.fit(X)
  sse.append(km.inertia_)

#Elbow Method
# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=sse)
ax.set(title='Elbow method', xlabel='Number of clusters (k)', ylabel='SSE (inertia)')
plt.show()

In [None]:
# Testing K: mean silhouette score for each candidate k (higher is better)
ss1 = []

for k in KRANGE:
  # Fixed seed so the scores are reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  lab = km.fit_predict(stock_scaled)
  ss1.append(metrics.silhouette_score(stock_scaled, lab))

# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=ss1)
ax.set(title='Silhouette score by k', xlabel='Number of clusters (k)', ylabel='Silhouette score')
plt.show()

In [None]:
#It shows 3 clusters are the best option... because the score radically decreases at 4.

##K Means for 3
# Seeded so the cluster labels are the same on every run.
k3 = KMeans(n_clusters=3, random_state=42)
k3_labs = k3.fit_predict(X)

In [None]:
# Attach the K-Means labels by position and count the observations per cluster.
df['k3'] = k3_labs
df.k3.value_counts(dropna=False, sort=False)

## Hierarchical Clustering - version 1 and Energy industry

In [None]:
# 1) Standardize the data
# Explicit copy: label columns are added to `df1` in later cells, and writing
# into a filtered view of `stocks_df` raises SettingWithCopyWarning.
df1 = stocks_df[stocks_df.Utility == 0].copy()
stock_number = df1.select_dtypes('number')
sc = StandardScaler()
# The scaled frame gets a fresh 0..n-1 index; its row order matches `df1`.
stock_scaled = pd.DataFrame(sc.fit_transform(stock_number), columns=stock_number.columns)
stock_scaled

In [None]:
#2) Clustering using euclidean and cosine for distance matrix

dc1 = pdist(stock_scaled.values) #euclidean
dc2 = pdist(stock_scaled.values, metric='cosine')

#See how each linkage method works with the cosine distance matrix
METHODS = ['single', 'complete', 'average', 'ward']
# Ward linkage is only correctly defined for Euclidean distances (see the
# notes in scipy's `linkage` documentation), so it is left out of the cosine
# comparison. METHODS keeps all four entries for the euclidean cell.
COSINE_METHODS = [m for m in METHODS if m != 'ward']

fig, axes = plt.subplots(1, len(COSINE_METHODS), figsize=(20, 5))

for ax, m in zip(axes, COSINE_METHODS):
    ax.set_title(m)
    dendrogram(linkage(dc2, method=m), leaf_rotation=90, ax=ax)
plt.show()

In [None]:
#3) Compare the linkage methods on the euclidean distance matrix
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# One panel per linkage method, titled with the method name.
for ax, method in zip(axes, METHODS):
    ax.set_title(method)
    dendrogram(linkage(dc1, method=method), leaf_rotation=90, ax=ax)

In [None]:
#4) Create the labels
# Final hierarchy: average linkage on the cosine distance matrix.
hc1 = linkage(dc2, method='average')
plt.title('Dendogram for Cosine and Average')
dendrogram(hc1,
            leaf_rotation= 90)
# Dashed guide line at height 0.85.
# NOTE(review): the labels in the next cell are cut with `maxclust`, not at
# this height -- confirm the two agree.
plt.axhline(linestyle='--', y=.85)
plt.show()

In [None]:
# Cut the tree into at most 2 flat clusters, then list the label values produced.
labels = fcluster(hc1, 2, criterion='maxclust')
np.unique(labels)

In [None]:
# Put the labels into the clean dataset. `labels` is a plain array, so it is
# attached by position; this relies on `df1` having the same row order as the
# scaled frame that was clustered.
df1['cluster'] = labels
df1.head(3) # Review the dataset with the labels

In [None]:
#How many stocks per cluster
df1.cluster.value_counts(dropna=False, sort=False)

## K-Means version 1 and energy industry

In [None]:
#Cluster Evaluation - Deciding how many clusters
X = stock_scaled.values
KRANGE = range(2,10)
sse = []

## loop over and evaluate
for k in KRANGE:
  # Fixed seed so the curve is reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  km.fit(X)
  sse.append(km.inertia_)

#Elbow Method
# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=sse)
ax.set(title='Elbow method', xlabel='Number of clusters (k)', ylabel='SSE (inertia)')
plt.show()

In [None]:
# Testing K: mean silhouette score for each candidate k (higher is better)
ss1 = []

for k in KRANGE:
  # Fixed seed so the scores are reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  lab = km.fit_predict(stock_scaled)
  ss1.append(metrics.silhouette_score(stock_scaled, lab))

# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=ss1)
ax.set(title='Silhouette score by k', xlabel='Number of clusters (k)', ylabel='Silhouette score')
plt.show()

In [None]:
#It shows 3 clusters are the best option... because the score radically decreases at 4.

##K Means for 3
# Seeded so the cluster labels are the same on every run.
k3 = KMeans(n_clusters=3, random_state=42)
k3_labs = k3.fit_predict(X)

In [None]:
# Attach the K-Means labels by position and count the observations per cluster.
df1['k3'] = k3_labs
df1.k3.value_counts(dropna=False, sort=False)

## Cluster profiling for Utility industry

In [None]:
# Profile the hierarchical labels: drop the K-Means labels and the Utility flag
# (constant within this subset), keep `cluster` and the numeric features.
df_clus = df.drop(columns=['k3','Utility'])
stock_numeric = df_clus.select_dtypes('number')
stock_numeric

In [None]:
# Mean of every numeric feature within each hierarchical cluster.
clus_profile = stock_numeric.groupby("cluster").mean()
clus_profile

In [None]:
# Z-score each feature across the cluster means so the heatmap rows share one scale.
scp = StandardScaler()
cluster_scaled = pd.DataFrame(
    scp.fit_transform(clus_profile),
    index=clus_profile.index,
    columns=clus_profile.columns,
)
# Features on the rows, clusters on the columns.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cluster_scaled.T, cmap="Blues", center=0, ax=ax)
plt.show()

## Cluster profiling for Energy industry

In [None]:
# Profile the K-Means labels for the non-utility subset: drop the hierarchical
# labels and the Utility flag (constant within this subset), keep `k3`.
df_clus = df1.drop(columns=['cluster','Utility'])
stock_numeric = df_clus.select_dtypes('number')
stock_numeric

In [None]:
# Mean of every numeric feature within each K-Means cluster.
clus_profile = stock_numeric.groupby("k3").mean()
clus_profile

In [None]:
# Z-score each feature across the cluster means so the heatmap rows share one scale.
scp = StandardScaler()
cluster_scaled = pd.DataFrame(
    scp.fit_transform(clus_profile),
    index=clus_profile.index,
    columns=clus_profile.columns,
)
# Features on the rows, clusters on the columns.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cluster_scaled.T, cmap="Blues", center=0, ax=ax)
plt.show()

## Data Cleaning - version 2

In [None]:
# Version 2 starts again from the sector-enriched `stocks` frame.
stocks_clean = stocks.copy()
# 0-based running row counter within each ticker, in current row order.
# NOTE(review): the column filter in the next cell does not keep `time_trend`.
stocks_clean['time_trend'] = stocks_clean.groupby('Ticker').cumcount()
stocks_clean.info()

In [None]:
# Keep only rows with reported Scope 1 emissions, and only the company name,
# the emissions/size measures and the Utility flag.
stocks_clean = stocks_clean.loc[stocks_clean['Missing_GHG'] == 0, stocks_clean.columns.isin(['Company','GHG Scope 1','Total_Assets','Total_Sales','Utility'])]
#stocks_clean['Profitable'] = np.where(stocks_clean.Profitable == True, 1, 0)
stocks_clean

In [None]:
# Per-company averages of emissions and size over the rows kept above.
agg = stocks_clean.groupby('Company')[['GHG Scope 1','Total_Assets','Total_Sales']].mean().reset_index()
# Merge back onto the yearly rows only to recover each company's Utility flag;
# the yearly measures arrive with a '_drop' suffix and are discarded.
stocks_cleaned = pd.merge(agg, stocks_clean, on='Company', how = 'inner',suffixes=('', '_drop'))
stocks_cleaned.drop([col for col in stocks_cleaned.columns if 'drop' in col], axis=1, inplace=True)
# Collapse the repeated yearly rows into one row per distinct
# (company averages, Utility) combination; the index is not reset.
stocks_cleaned.drop_duplicates(inplace=True)
stocks_cleaned.head()

In [None]:
stocks_cleaned.shape

## Hierarchical Clustering - Version 2 and utility industry

In [None]:
# 1) Standardize the data
# Explicit copy: label columns are added to `stocks_clean1` in later cells, and
# writing into a filtered view of `stocks_cleaned` raises SettingWithCopyWarning.
stocks_clean1 = stocks_cleaned[stocks_cleaned.Utility == 1].copy()
stock_number = stocks_clean1.select_dtypes('number')
sc = StandardScaler()
# The scaled frame gets a fresh 0..n-1 index; its row order matches `stocks_clean1`.
stock_scaled = pd.DataFrame(sc.fit_transform(stock_number), columns=stock_number.columns)
stock_scaled

In [None]:
#2) Clustering using euclidean and cosine for distance matrix

dc1 = pdist(stock_scaled.values) #euclidean
dc2 = pdist(stock_scaled.values, metric='cosine')

#See how each linkage method works with the cosine distance matrix
METHODS = ['single', 'complete', 'average', 'ward']
# Ward linkage is only correctly defined for Euclidean distances (see the
# notes in scipy's `linkage` documentation), so it is left out of the cosine
# comparison. METHODS keeps all four entries for the euclidean cell.
COSINE_METHODS = [m for m in METHODS if m != 'ward']

fig, axes = plt.subplots(1, len(COSINE_METHODS), figsize=(20, 5))

for ax, m in zip(axes, COSINE_METHODS):
    ax.set_title(m)
    dendrogram(linkage(dc2, method=m), leaf_rotation=90, ax=ax)
plt.show()

In [None]:
#3) Compare the linkage methods on the euclidean distance matrix
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# One panel per linkage method, titled with the method name.
for ax, method in zip(axes, METHODS):
    ax.set_title(method)
    dendrogram(linkage(dc1, method=method), leaf_rotation=90, ax=ax)

In [None]:
#4) Create the labels
# Final hierarchy: average linkage on the cosine distance matrix.
hc1 = linkage(dc2, method='average')
plt.title('Dendogram for Cosine and Average')
dendrogram(hc1,
            leaf_rotation= 90)
# Dashed guide line at height 0.85.
# NOTE(review): the labels in the next cell are cut with `maxclust`, not at
# this height -- confirm the two agree.
plt.axhline(linestyle='--', y=.85)
plt.show()

In [None]:
# Cut the tree into at most 3 flat clusters, then list the label values produced.
labels = fcluster(hc1, 3, criterion='maxclust')
np.unique(labels)

In [None]:
# Put the labels into the clean dataset. `labels` is a plain array, so it is
# attached by position; this relies on `stocks_clean1` having the same row
# order as the scaled frame that was clustered.
stocks_clean1['cluster'] = labels
stocks_clean1.head(3) # Review the dataset with the labels

In [None]:
#How many stocks per cluster
stocks_clean1.cluster.value_counts(dropna=False, sort=False)

## K-Means Clustering - Version 2 and utility industry

In [None]:
#Cluster Evaluation - Deciding how many clusters
X = stock_scaled.values
KRANGE = range(2,10)
sse = []

## loop over and evaluate
for k in KRANGE:
  # Fixed seed so the curve is reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  km.fit(X)
  sse.append(km.inertia_)

#Elbow Method
# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=sse)
ax.set(title='Elbow method', xlabel='Number of clusters (k)', ylabel='SSE (inertia)')
plt.show()

In [None]:
# Testing K: mean silhouette score for each candidate k (higher is better)
ss1 = []

for k in KRANGE:
  # Fixed seed so the scores are reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  lab = km.fit_predict(stock_scaled)
  ss1.append(metrics.silhouette_score(stock_scaled, lab))

# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=ss1)
ax.set(title='Silhouette score by k', xlabel='Number of clusters (k)', ylabel='Silhouette score')
plt.show()

In [None]:
#It shows 3 clusters are the best option... because the score radically decreases at 4.

##K Means for 3
# Seeded so the cluster labels (and the CSV written from them) are the same on every run.
k3 = KMeans(n_clusters=3, random_state=42)
k3_labs = k3.fit_predict(X)

In [None]:
# Attach the K-Means labels by position and count the companies per cluster.
stocks_clean1['k3'] = k3_labs
stocks_clean1.k3.value_counts(dropna=False, sort=False)

In [None]:
# Persist the utility subset with both label columns (`cluster`, `k3`); the scoring section reloads this file.
stocks_clean1.to_csv('/Users/maralinetorres/Documents/GitHub/Predicting-Environmental-and-Social-Actions/Datasets/Utility_comp_clustering.csv', index=False)

### Hierarchical Clustering - Version 2 and energy industry

In [None]:
# 1) Standardize the data
# Explicit copy: label columns are added to `stocks_clean2` in later cells, and
# writing into a filtered view of `stocks_cleaned` raises SettingWithCopyWarning.
stocks_clean2 = stocks_cleaned[stocks_cleaned.Utility == 0].copy()
stock_number = stocks_clean2.select_dtypes('number')
sc = StandardScaler()
# The scaled frame gets a fresh 0..n-1 index; its row order matches `stocks_clean2`.
stock_scaled = pd.DataFrame(sc.fit_transform(stock_number), columns=stock_number.columns)
stock_scaled

In [None]:
#2) Clustering using euclidean and cosine for distance matrix

dc1 = pdist(stock_scaled.values) #euclidean
dc2 = pdist(stock_scaled.values, metric='cosine')

#See how each linkage method works with the cosine distance matrix
METHODS = ['single', 'complete', 'average', 'ward']
# Ward linkage is only correctly defined for Euclidean distances (see the
# notes in scipy's `linkage` documentation), so it is left out of the cosine
# comparison. METHODS keeps all four entries for the euclidean cell.
COSINE_METHODS = [m for m in METHODS if m != 'ward']

fig, axes = plt.subplots(1, len(COSINE_METHODS), figsize=(20, 5))

for ax, m in zip(axes, COSINE_METHODS):
    ax.set_title(m)
    dendrogram(linkage(dc2, method=m), leaf_rotation=90, ax=ax)
plt.show()

In [None]:
#3) Compare the linkage methods on the euclidean distance matrix
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# One panel per linkage method, titled with the method name.
for ax, method in zip(axes, METHODS):
    ax.set_title(method)
    dendrogram(linkage(dc1, method=method), leaf_rotation=90, ax=ax)

In [None]:
#4) Create the labels
# Final hierarchy: average linkage on the cosine distance matrix.
hc1 = linkage(dc2, method='average')
plt.title('Dendogram for Cosine and Average')
dendrogram(hc1,
            leaf_rotation= 90)
# Dashed guide line at height 0.85.
# NOTE(review): the labels in the next cell are cut with `maxclust`, not at
# this height -- confirm the two agree.
plt.axhline(linestyle='--', y=.85)
plt.show()

In [None]:
# Cut the tree into at most 2 flat clusters, then list the label values produced.
labels = fcluster(hc1, 2, criterion='maxclust')
np.unique(labels)

In [None]:
# Put the labels into the clean dataset. `labels` is a plain array, so it is
# attached by position; this relies on `stocks_clean2` having the same row
# order as the scaled frame that was clustered.
stocks_clean2['cluster'] = labels
stocks_clean2.head(3) # Review the dataset with the labels

In [None]:
#How many stocks per cluster
stocks_clean2.cluster.value_counts(dropna=False, sort=False)

### K-means Clustering - Version 2 and energy industry

In [None]:
#Cluster Evaluation - Deciding how many clusters
X = stock_scaled.values
KRANGE = range(2,10)
sse = []

## loop over and evaluate
for k in KRANGE:
  # Fixed seed so the curve is reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  km.fit(X)
  sse.append(km.inertia_)

#Elbow Method
# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=sse)
ax.set(title='Elbow method', xlabel='Number of clusters (k)', ylabel='SSE (inertia)')
plt.show()

In [None]:
# Testing K: mean silhouette score for each candidate k (higher is better)
ss1 = []

for k in KRANGE:
  # Fixed seed so the scores are reproducible across kernel restarts.
  km = KMeans(n_clusters=k, random_state=42)
  lab = km.fit_predict(stock_scaled)
  ss1.append(metrics.silhouette_score(stock_scaled, lab))

# x/y are keyword-only in seaborn >= 0.12.
ax = sns.lineplot(x=list(KRANGE), y=ss1)
ax.set(title='Silhouette score by k', xlabel='Number of clusters (k)', ylabel='Silhouette score')
plt.show()

In [None]:
#Let's go with 3 clusters
# NOTE(review): this cell used to be described as "4 clusters", but the model
# is fit with 3 and the labels are stored in a column named `k3` -- confirm
# that 3 is the intended choice.

##K Means for 3 (the names k4 / k4_labs are kept because the next cell reads k4_labs)
# Seeded so the cluster labels (and the CSV written from them) are the same on every run.
k4 = KMeans(n_clusters=3, random_state=42)
k4_labs = k4.fit_predict(X)

In [None]:
# Attach the K-Means labels (fit with 3 clusters) by position and count the companies per cluster.
stocks_clean2['k3'] = k4_labs
stocks_clean2.k3.value_counts(dropna=False, sort=False)

In [None]:
# Persist the non-utility subset with both label columns (`cluster`, `k3`).
stocks_clean2.to_csv('/Users/maralinetorres/Documents/GitHub/Predicting-Environmental-and-Social-Actions/Datasets/Energy_comp_clustering.csv', index=False)

## Cluster profiling for version 2 and utility industry

In [None]:
# Profile the hierarchical labels: drop the K-Means labels and the Utility flag
# (constant within this subset), keep `cluster` and the numeric features.
df_clus = stocks_clean1.drop(columns=['k3','Utility'])
stock_numeric = df_clus.select_dtypes('number')
stock_numeric

In [None]:
# Mean of every numeric feature within each hierarchical cluster.
clus_profile = stock_numeric.groupby("cluster").mean()
clus_profile

In [None]:
# Z-score each feature across the cluster means so the heatmap rows share one scale.
scp = StandardScaler()
cluster_scaled = pd.DataFrame(
    scp.fit_transform(clus_profile),
    index=clus_profile.index,
    columns=clus_profile.columns,
)
# Features on the rows, clusters on the columns.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cluster_scaled.T, cmap="Blues", center=0, ax=ax)
plt.show()

In [None]:
# Profile the K-Means labels: drop the hierarchical labels and the Utility flag
# (constant within this subset), keep `k3` and the numeric features.
df_clus1 = stocks_clean1.drop(columns=['cluster','Utility'])
stock_numeric = df_clus1.select_dtypes('number')
stock_numeric

In [None]:
# Mean of every numeric feature within each K-Means cluster.
clus_profile = stock_numeric.groupby("k3").mean()
clus_profile

In [None]:
# Z-score each feature across the cluster means so the heatmap rows share one scale.
scp = StandardScaler()
cluster_scaled = pd.DataFrame(
    scp.fit_transform(clus_profile),
    index=clus_profile.index,
    columns=clus_profile.columns,
)
# Features on the rows, clusters on the columns.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cluster_scaled.T, cmap="Blues", center=0, ax=ax)
plt.show()

## Cluster profiling for version 2 and energy industry

In [None]:
# Profile the hierarchical labels: drop the K-Means labels and the Utility flag
# (constant within this subset), keep `cluster` and the numeric features.
df_clus = stocks_clean2.drop(columns=['k3','Utility'])
stock_numeric = df_clus.select_dtypes('number')
stock_numeric

In [None]:
# Mean of every numeric feature within each hierarchical cluster.
clus_profile = stock_numeric.groupby("cluster").mean()
clus_profile

In [None]:
# Z-score each feature across the cluster means so the heatmap rows share one scale.
scp = StandardScaler()
cluster_scaled = pd.DataFrame(
    scp.fit_transform(clus_profile),
    index=clus_profile.index,
    columns=clus_profile.columns,
)
# Features on the rows, clusters on the columns.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cluster_scaled.T, cmap="Blues", center=0, ax=ax)
plt.show()

In [None]:
# Profile the K-Means labels: drop the hierarchical labels and the Utility flag
# (constant within this subset), keep `k3` and the numeric features.
df_clus = stocks_clean2.drop(columns=['cluster','Utility'])
stock_numeric = df_clus.select_dtypes('number')
stock_numeric

In [None]:
# Mean of every numeric feature within each K-Means cluster.
clus_profile = stock_numeric.groupby("k3").mean()
clus_profile

In [None]:
# Z-score each feature across the cluster means so the heatmap rows share one scale.
scp = StandardScaler()
cluster_scaled = pd.DataFrame(
    scp.fit_transform(clus_profile),
    index=clus_profile.index,
    columns=clus_profile.columns,
)
# Features on the rows, clusters on the columns.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cluster_scaled.T, cmap="Blues", center=0, ax=ax)
plt.show()

## Start creating classification score within industry

Utility industry
- Create a new column -> GHG Emissions
  - Cluster 3 --> High
  - Cluster 2 --> Medium
  - Cluster 1 --> Low
- Three labels regarding the environmental actions
  - Improving (or good)
    - Last 3 years, emissions are below the cluster average
    - Last 3 years, the % change in GHG scope is descending
    - Last 3 years, the environmental score is higher than the cluster average
  - Neutral
    - Last 2 years, emissions are below the cluster average
    - Last 2 years, the % change in GHG scope is descending
    - Last 2 years, the environmental score is higher than the cluster average
  - Deteriorating (or bad)
    - Otherwise (neither set of conditions is met).

In [None]:
# Reload the exported utility clusters and keep only the hierarchical labels
# (the K-Means column `k3` is discarded).
stocks_utility = pd.read_csv(
    '/Users/maralinetorres/Documents/GitHub/Predicting-Environmental-and-Social-Actions/Datasets/Utility_comp_clustering.csv'
).drop(columns='k3')
stocks_utility.head()

In [None]:
# Mean of each numeric column per hierarchical cluster.
stocks_utility_num = stocks_utility.select_dtypes('number')
clus_profile = stocks_utility_num.groupby("cluster").mean()
# Display only: the sorted result is not stored back into `clus_profile` here.
clus_profile.sort_values(by='GHG Scope 1')

In [None]:
# Rank the clusters by their mean Scope 1 emissions: the lowest mean is 'Low',
# the highest is 'High', and anything in between is 'Medium'.
min_ghg = clus_profile['GHG Scope 1'].min()
max_ghg = clus_profile['GHG Scope 1'].max()

# np.select takes the first matching condition, so the inclusive `between`
# only decides rows that are neither the minimum nor the maximum.
conditions = [ (clus_profile['GHG Scope 1'] == min_ghg),
              (clus_profile['GHG Scope 1'] == max_ghg),
               (clus_profile['GHG Scope 1'].between(left=min_ghg, right=max_ghg))]
choices = ['Low', 'High','Medium']

# An explicit string default keeps the result a single string dtype; the
# implicit default of 0 cannot be combined with string choices on recent NumPy
# releases. It only applies to rows matching no condition (e.g. a NaN mean).
clus_profile['GHG_Emission_category'] = np.select(condlist=conditions, choicelist=choices, default='Unknown')

In [None]:
# Order the clusters by mean emissions and turn the `cluster` index into a column.
clus_profile = clus_profile.sort_values(by='GHG Scope 1').reset_index()
# Two-column lookup: cluster id -> emission category.
clus_profile1 = clus_profile[['cluster','GHG_Emission_category']]
clus_profile1

In [None]:
# Attach each company's emission category through its cluster id.
cluster_stocks = pd.merge(stocks_utility, clus_profile1, on='cluster',how = 'inner')
cluster_stocks.head()

In [None]:
def assignEnvironmentalScoreLabel(stocks, companies, cluster_stocks, clus_profile):
    """Label each requested company's recent environmental trajectory.

    For every company the strictest label is tried first:
    'Improving/Good' over the company's last three reported years, then
    'Neutral' over its last two; if neither check passes the company is
    labelled 'Bad forecast'. The individual checks are done by `checkMetrics`.

    Parameters
    ----------
    stocks : DataFrame with yearly rows; read here for `Company` and `Year`.
    companies : iterable of company names to label. When None or empty, every
        company listed in `cluster_stocks` is labelled.
    cluster_stocks : DataFrame with `Company` and `cluster` columns.
    clus_profile : DataFrame with `cluster` and `GHG Scope 1` (cluster mean).

    Returns
    -------
    dict mapping company name -> label.
    """
    # Peer group handed to checkMetrics for the disclosure-score average:
    # every company that received a cluster, regardless of which are labelled.
    peer_companies = cluster_stocks.Company.tolist()
    # Honour the caller's selection instead of always overwriting it.
    if companies is None or len(companies) == 0:
        companies = peer_companies

    company_label_score = {}
    for com in companies:
        data = stocks.loc[stocks.Company == com]
        clust_num = cluster_stocks.loc[cluster_stocks.Company == com,'cluster'].tolist()
        avg_ghg_scope = clus_profile[clus_profile.cluster == clust_num[0]]['GHG Scope 1'].tolist()
        year_max = data.Year.max()

        # (label, years to look back from the company's latest year)
        for label, lookback in (('Improving/Good', 2), ('Neutral', 1)):
            if checkMetrics(year_max, year_max - lookback, data, label,
                            company_label_score, com, avg_ghg_scope, peer_companies):
                break
        else:
            # Neither window satisfied all the checks.
            company_label_score[com] = 'Bad forecast'
    return company_label_score

In [None]:
def checkMetrics(maxYear, min_year, data, label, company_label_score, com, avg_ghg_scope, companies, all_stocks=None):
    """Check one company against the three criteria for `label`.

    Within the years `min_year`..`maxYear` (inclusive) the company must:
      1. show a shrinking magnitude of `Change_in_GHG` from year to year,
      2. keep `GHG Scope 1` below its cluster average (`avg_ghg_scope[0]`),
      3. keep its `Environmental Disclosure Score` above the average score of
         the peer `companies` over the same years.
    When all three hold, `company_label_score[com]` is set to `label`.

    Parameters
    ----------
    maxYear, min_year : inclusive bounds of the year window.
    data : the company's yearly rows.
    label : label to record on success.
    company_label_score : dict updated in place on success.
    com : company name used as the dict key.
    avg_ghg_scope : sequence whose first element is the cluster's mean emissions.
    companies : names of the peer companies for the disclosure-score average.
    all_stocks : optional frame holding the peers' yearly rows; defaults to the
        notebook-level `stocks` frame.

    Returns
    -------
    bool : True when the label was assigned, otherwise False.
    """
    peer_source = stocks if all_stocks is None else all_stocks

    in_window = (data.Year <= maxYear) & (data.Year >= min_year)
    # Chronological order matters for the year-over-year comparison below.
    com_trend = data.loc[in_window, ['Year','Change_in_GHG','Environmental Disclosure Score','GHG Scope 1']].sort_values('Year')
    if com_trend.empty:
        # all() over nothing is vacuously True; no data must not earn a label.
        return False

    change_ghg = com_trend['Change_in_GHG'].tolist()
    # NOTE(review): magnitudes are compared, so the sign of the change is
    # ignored -- confirm this is the intended reading of "descending".
    if not all(abs(i) > abs(j) for i, j in zip(change_ghg, change_ghg[1:])):
        return False

    if not all(i < avg_ghg_scope[0] for i in com_trend['GHG Scope 1'].tolist()):
        return False

    peer_rows = (peer_source.Company.isin(companies)) & (peer_source.Year <= maxYear) & (peer_source.Year >= min_year)
    peer_eds_avg = peer_source.loc[peer_rows, 'Environmental Disclosure Score'].mean()
    # The score has to be HIGHER than the peer average to qualify.
    if not all(i > peer_eds_avg for i in com_trend['Environmental Disclosure Score'].tolist()):
        return False

    company_label_score[com] = label
    return True

In [None]:
companies = cluster_stocks.Company.tolist()

In [None]:
assignEnvironmentalScoreLabel(stocks,companies,cluster_stocks, clus_profile)

In [None]:
assignEnvironmentalScoreLabel(stocks,['AES CORP (THE)'],cluster_stocks, clus_profile)