In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the pollutant and unemployment tables from the processed-data folder.
# The folder path is assembled with os.path.join so the notebook also runs on
# non-Windows hosts; a hard-coded ".\\data\\..." string only resolves on Windows.
processed_dir = os.path.join(".", "data", "processed_new")

all_pollutant_data = pd.read_csv(os.path.join(processed_dir, "all_pollutant_data_national.csv"))
# Parse the Date column and keep only the calendar date of each timestamp.
all_pollutant_data["Date"] = pd.to_datetime(all_pollutant_data.Date).dt.date

all_unemployment_data = pd.read_csv(os.path.join(processed_dir, "all_unemployment_data.csv"))
all_unemployment_data["Date"] = pd.to_datetime(all_unemployment_data.Date).dt.date

In [12]:
# Collapse both tables to one row per calendar month (key "Y-m", formatted
# as YYYY-MM) and join them on that key.
#
# numeric_only=True is required: the frames still carry the "Date" column of
# datetime.date objects, and pandas >= 2.0 raises a TypeError when asked to
# average a non-numeric column instead of silently dropping it.
all_unemployment_data["Y-m"] = all_unemployment_data["Date"].apply(lambda x: x.strftime('%Y-%m'))
all_unemployment_data_monthly = (
    all_unemployment_data.groupby("Y-m").mean(numeric_only=True).reset_index()
)

all_pollutant_data["Y-m"] = all_pollutant_data["Date"].apply(lambda x: x.strftime('%Y-%m'))
all_pollutant_data_monthly = (
    all_pollutant_data.groupby("Y-m").mean(numeric_only=True).reset_index()
)

# Inner join: keep only the months present in both monthly tables.
econclimate = all_pollutant_data_monthly.merge(all_unemployment_data_monthly, on="Y-m", how="inner")
# Column labels of the merged frame without the month key.
columns = econclimate.drop("Y-m", axis=1).columns
econclimate

Unnamed: 0,Y-m,CO,NO2,SO2,PM2_5,Estimated Unemployment Rate (%)
0,2018-01,1213.933874,22.948988,17.717086,127.632185,5.329245
1,2018-02,1160.015898,22.735429,17.600966,113.687866,5.862453
2,2018-03,1200.934585,22.677361,17.497142,105.189174,5.98566
3,2018-04,1198.167295,21.723334,17.322147,94.785057,5.391887
4,2018-05,1184.150922,20.912571,17.152883,93.648113,5.216415
5,2018-06,1063.139832,20.6438,17.075191,80.218179,5.766415
6,2018-07,1040.601425,19.464345,17.071963,72.294033,5.599057
7,2018-08,1059.118546,19.003649,16.89788,71.081183,6.640943
8,2018-09,1014.113641,18.675401,16.918368,82.163834,6.968889
9,2018-10,1033.079623,20.092414,17.124593,114.279329,7.469259


In [13]:
# Standardise every column of the merged frame except the "Y-m" month key.
# NOTE: `econclimate` is rebound here from the merged frame to the output of
# fit_transform, so this cell cannot be re-run without first re-running the
# merge cell (the result no longer has a "Y-m" column to drop).
scaler = StandardScaler()
econclimate = scaler.fit_transform(econclimate.drop(columns="Y-m"))

In [14]:
# Wrap the scaled values in a frame labelled with `columns`, then show the
# pairwise correlation matrix.
scaled_frame = pd.DataFrame(data=econclimate, columns=columns)
scaled_frame.corr()

Unnamed: 0,CO,NO2,SO2,PM2_5,Estimated Unemployment Rate (%)
CO,1.0,0.83233,0.759176,0.799453,-0.48437
NO2,0.83233,1.0,0.724514,0.935187,-0.265674
SO2,0.759176,0.724514,1.0,0.702309,-0.351957
PM2_5,0.799453,0.935187,0.702309,1.0,-0.276007
Estimated Unemployment Rate (%),-0.48437,-0.265674,-0.351957,-0.276007,1.0


In [16]:
# Predictors are every column but the last; the target is the last column.
X, y = econclimate[:, :-1], econclimate[:, -1]

In [18]:
# Fit a linear regression of y on X and print one coefficient per predictor.
model = LinearRegression()
model.fit(X, y)
importance = model.coef_
# zip stops at the shorter sequence, so only as many labels as there are
# coefficients are printed, in order.
for feature_name, weight in zip(columns, importance):
    print(feature_name, weight)

CO -0.8203286022407037
NO2 0.5273926017628908
SO2 -0.06244003948611154
PM2_5 -0.06955146992100003


In [23]:
# Same regression through statsmodels to obtain standard errors and p-values.
# X is passed as-is: no constant column is added before fitting.
mod = sm.OLS(endog=y, exog=X)
fit = mod.fit()
fit.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared (uncentered):,0.243
Dependent Variable:,y,AIC:,144.5916
Date:,2022-11-27 08:26,BIC:,152.6209
No. Observations:,55,Log-Likelihood:,-68.296
Df Model:,4,F-statistic:,5.423
Df Residuals:,51,Prob (F-statistic):,0.00102
R-squared (uncentered):,0.298,Scale:,0.75663

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
x1,-0.8203,0.2325,-3.5288,0.0009,-1.2870,-0.3536
x2,0.5274,0.3630,1.4529,0.1524,-0.2013,1.2561
x3,-0.0624,0.1868,-0.3342,0.7396,-0.4375,0.3126
x4,-0.0696,0.3338,-0.2084,0.8358,-0.7396,0.6005

0,1,2,3
Omnibus:,40.255,Durbin-Watson:,0.894
Prob(Omnibus):,0.0,Jarque-Bera (JB):,124.522
Skew:,2.085,Prob(JB):,0.0
Kurtosis:,9.079,Condition No.:,7.0


In [19]:
# Fit a single regression tree and print its feature importances.
# random_state is fixed so the output is reproducible on Restart & Run All:
# scikit-learn permutes features at each split, so ties between equally good
# splits can otherwise be resolved differently from run to run.
model = DecisionTreeRegressor(random_state=42)
model.fit(X, y)
importance = model.feature_importances_
# zip stops at the shorter sequence, so only as many labels as there are
# importances are printed, in order.
for feature_name, score in zip(columns, importance):
    print(feature_name, score)

CO 0.8262181561147027
NO2 0.1032794414075125
SO2 0.012598168024453288
PM2_5 0.057904234453331475


In [20]:
# Fit a random forest and print its feature importances.
# random_state is fixed because the forest draws bootstrap samples and feature
# subsets at random; without a seed the printed importances change on every
# run and the notebook's outputs cannot be reproduced.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)
importance = model.feature_importances_
# zip stops at the shorter sequence, so only as many labels as there are
# importances are printed, in order.
for feature_name, score in zip(columns, importance):
    print(feature_name, score)

CO 0.7766409792450706
NO2 0.09186907391173077
SO2 0.05630465175537494
PM2_5 0.07518529508782384


In [21]:
# Fit a gradient-boosted regressor with default settings and print its
# feature importances, one line per predictor.
model = XGBRegressor()
model.fit(X, y)
importance = model.feature_importances_
# zip stops at the shorter sequence, so only as many labels as there are
# importances are printed, in order.
for feature_name, score in zip(columns, importance):
    print(feature_name, score)

CO 0.81169665
NO2 0.080727376
SO2 0.062356997
PM2_5 0.045218926
