In [35]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [73]:
df = pd.read_csv('all_data.csv')
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,event_date,country,counts,population,events_per_capita,gdp_per_capita,Western,Asia,South America,public_trust_percentage,living_under_2_dollar_percentage
0,2021,Australia,681,25685412,2.651310,60697.245436,1,0,0,51.9,
1,2022,Australia,794,26005540,3.053196,65099.845912,1,0,0,49.9,
2,2020,Austria,354,8916864,3.970006,48789.497850,1,0,0,62.6,0.6
3,2021,Austria,562,8955797,6.275265,53517.890451,1,0,0,61.0,0.4
4,2022,Austria,294,9041851,3.251547,52084.681195,1,0,0,61.0,
...,...,...,...,...,...,...,...,...,...,...,...
113,2021,United Kingdom,1585,67026292,2.364744,46869.759058,1,0,0,39.5,0.2
114,2022,United Kingdom,1745,66971395,2.605590,46125.255751,1,0,0,39.5,
115,2020,United States,21585,331511512,6.511086,63528.634303,1,0,0,46.5,0.2
116,2021,United States,13147,332031554,3.959563,70219.472454,1,0,0,40.5,0.2


In [80]:
df_scaled = pd.DataFrame()
df_scaled['event_date'] = df['event_date']
df_scaled['country'] = df['country']
df_scaled['Western'] = df['Western']
df_scaled['Asia'] = df['Asia']
df_scaled['South America'] = df['South America']

for feat in df.columns[2:]:
        if feat != 'Western' and feat != 'Asia' and feat != 'South America':
                df_scaled[f'{feat}_scaled'] = ((df[feat] - df[feat].mean()) / df[feat].std()).round(3)
df_scaled

Unnamed: 0,event_date,country,Western,Asia,South America,counts_scaled,population_scaled,events_per_capita_scaled,gdp_per_capita_scaled,public_trust_percentage_scaled,living_under_2_dollar_percentage_scaled
0,2021,Australia,1,0,0,-0.382,-0.283,-0.834,0.720,0.288,
1,2022,Australia,1,0,0,-0.344,-0.278,-0.699,0.877,0.168,
2,2020,Austria,1,0,0,-0.491,-0.524,-0.391,0.295,0.930,0.095
3,2021,Austria,1,0,0,-0.421,-0.523,0.384,0.464,0.834,-0.154
4,2022,Austria,1,0,0,-0.511,-0.522,-0.632,0.412,0.834,
...,...,...,...,...,...,...,...,...,...,...,...
113,2021,United Kingdom,1,0,0,-0.080,0.311,-0.930,0.226,-0.457,-0.403
114,2022,United Kingdom,1,0,0,-0.026,0.310,-0.849,0.200,-0.457,
115,2020,United States,1,0,0,6.604,4.111,0.463,0.821,-0.036,-0.403
116,2021,United States,1,0,0,3.784,4.118,-0.394,1.060,-0.397,-0.403


In [81]:
df_scaled.corr()

  df_scaled.corr()


Unnamed: 0,event_date,Western,Asia,South America,counts_scaled,population_scaled,events_per_capita_scaled,gdp_per_capita_scaled,public_trust_percentage_scaled,living_under_2_dollar_percentage_scaled
event_date,1.0,0.418741,-0.099151,-0.212938,0.019465,-0.062214,0.14531,0.255186,0.230861,-0.058273
Western,0.418741,1.0,-0.311234,-0.668411,-0.022514,-0.285831,0.235286,0.475514,0.430178,-0.54483
Asia,-0.099151,-0.311234,1.0,-0.095027,-0.023615,0.245017,-0.280537,-0.012674,-0.105139,
South America,-0.212938,-0.668411,-0.095027,1.0,0.162801,0.370382,-0.249832,-0.495538,-0.385211,0.723523
counts_scaled,0.019465,-0.022514,-0.023615,0.162801,1.0,0.81321,0.131153,-0.040161,-0.187741,0.061129
population_scaled,-0.062214,-0.285831,0.245017,0.370382,0.81321,1.0,-0.246147,-0.147677,-0.27377,0.303462
events_per_capita_scaled,0.14531,0.235286,-0.280537,-0.249832,0.131153,-0.246147,1.0,0.193235,0.134652,-0.296407
gdp_per_capita_scaled,0.255186,0.475514,-0.012674,-0.495538,-0.040161,-0.147677,0.193235,1.0,0.710791,-0.462278
public_trust_percentage_scaled,0.230861,0.430178,-0.105139,-0.385211,-0.187741,-0.27377,0.134652,0.710791,1.0,-0.370835
living_under_2_dollar_percentage_scaled,-0.058273,-0.54483,,0.723523,0.061129,0.303462,-0.296407,-0.462278,-0.370835,1.0


In [23]:
plt.scatter3d(x = df['public_trust_percentage'], y = df['living_under_2_dollar_percentage'], z = df['events_per_capita'])
plt.xlabel('Public trust')
plt.ylabel('Living in Poverty')
plt.zlabel('Events per Capita')
plt.title("Fossil Fuels Consumption vs CO2 emissions per population")

AttributeError: module 'matplotlib.pyplot' has no attribute 'scatter3d'

In [66]:
predictors = ['Western', 'South America']
X = df[predictors]
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
print(vif_data)

         Feature  VIF
0        Western  1.0
1  South America  1.0


In [85]:
filtered_df = df_scaled.dropna(subset=['living_under_2_dollar_percentage_scaled'])
filtered_df

Unnamed: 0,event_date,country,Western,Asia,South America,counts_scaled,population_scaled,events_per_capita_scaled,gdp_per_capita_scaled,public_trust_percentage_scaled,living_under_2_dollar_percentage_scaled
2,2020,Austria,1,0,0,-0.491,-0.524,-0.391,0.295,0.930,0.095
3,2021,Austria,1,0,0,-0.421,-0.523,0.384,0.464,0.834,-0.154
5,2020,Belgium,1,0,0,-0.347,-0.486,0.559,0.181,-1.057,-0.652
6,2021,Belgium,1,0,0,-0.331,-0.485,0.691,0.404,0.012,-0.652
8,2018,Brazil,0,0,1,1.579,2.368,-0.678,-1.122,-1.819,1.836
...,...,...,...,...,...,...,...,...,...,...,...
109,2020,Switzerland,1,0,0,-0.554,-0.528,-1.083,1.620,2.250,-0.652
112,2020,United Kingdom,1,0,0,0.125,0.312,-0.625,-0.012,-0.745,-0.279
113,2021,United Kingdom,1,0,0,-0.080,0.311,-0.930,0.226,-0.457,-0.403
115,2020,United States,1,0,0,6.604,4.111,0.463,0.821,-0.036,-0.403


In [83]:
X_array = np.array([df_scaled['public_trust_percentage_scaled'], df_scaled['Western'], df_scaled['Asia'], df_scaled['South America']]).T
y_array = np.array(df_scaled['events_per_capita_scaled'])
X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, test_size=0.3, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse, r2)

1.0408717315933247 0.21301832026314993


In [90]:
X_array = np.array([df_scaled['public_trust_percentage_scaled'], df_scaled['Western'], df_scaled['Asia'], df_scaled['South America']]).T
y_array = np.array(df_scaled['events_per_capita_scaled'])
X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, test_size=0.3, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse, r2)

1.0408717315933247 0.21301832026314993


In [92]:
df_scaled = pd.DataFrame()
df_scaled['event_date'] = df['event_date']
df_scaled['country'] = df['country']
df_scaled['Western'] = df['Western']
df_scaled['Asia'] = df['Asia']
df_scaled['South America'] = df['South America']
df_scaled['counts'] = df['counts']

for feat in df.columns[2:]:
        if feat != 'Western' and feat != 'Asia' and feat != 'South America' and feat != 'counts':
                df_scaled[f'{feat}_scaled'] = ((df[feat] - df[feat].mean()) / df[feat].std()).round(3)
df_scaled

Unnamed: 0,event_date,country,Western,Asia,South America,counts,population_scaled,events_per_capita_scaled,gdp_per_capita_scaled,public_trust_percentage_scaled,living_under_2_dollar_percentage_scaled
0,2021,Australia,1,0,0,681,-0.283,-0.834,0.720,0.288,
1,2022,Australia,1,0,0,794,-0.278,-0.699,0.877,0.168,
2,2020,Austria,1,0,0,354,-0.524,-0.391,0.295,0.930,0.095
3,2021,Austria,1,0,0,562,-0.523,0.384,0.464,0.834,-0.154
4,2022,Austria,1,0,0,294,-0.522,-0.632,0.412,0.834,
...,...,...,...,...,...,...,...,...,...,...,...
113,2021,United Kingdom,1,0,0,1585,0.311,-0.930,0.226,-0.457,-0.403
114,2022,United Kingdom,1,0,0,1745,0.310,-0.849,0.200,-0.457,
115,2020,United States,1,0,0,21585,4.111,0.463,0.821,-0.036,-0.403
116,2021,United States,1,0,0,13147,4.118,-0.394,1.060,-0.397,-0.403


In [93]:
X_array = np.array([df_scaled['public_trust_percentage_scaled'], df_scaled['Western'], df_scaled['Asia'], df_scaled['South America']]).T
y_array = np.array(df_scaled['events_per_capita_scaled'])
#X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, test_size=0.3, random_state=42)
model = LinearRegression()
# fit the model on the entire data set to compare the r2 scores and make sure theres no overfitting going on 
model.fit(X_array, y_array)

# Predict on the entire set
y_pred = model.predict(X_array)

# Evaluate the model
mse = mean_squared_error(y_array, y_pred)
r2 = r2_score(y_array, y_pred)
print(mse, r2)

0.8316073644921309 0.16124960875448513
