In [6]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

  from pandas import Int64Index as NumericIndex


In [2]:
# Load the city-level pollutant readings and the unemployment figures.
# Paths are assembled with os.path.join so the notebook also runs on
# macOS/Linux, where a literal ".\\data\\..." string is not a directory path.
processed_dir = os.path.join(".", "data", "processed_new")

all_pollutant_data = pd.read_csv(os.path.join(processed_dir, "all_pollutant_data_city.csv"))
# Reduce the parsed timestamps to plain datetime.date objects; the monthly
# bucketing cell further down calls .strftime on each of them.
all_pollutant_data["Date"] = pd.to_datetime(all_pollutant_data["Date"]).dt.date

all_unemployment_data = pd.read_csv(os.path.join(processed_dir, "all_unemployment_data.csv"))
all_unemployment_data["Date"] = pd.to_datetime(all_unemployment_data["Date"]).dt.date

In [10]:
# Collapse both data sets to one row per (month, city) and join them.
# "Y-m" is a "YYYY-MM" string key derived from each row's Date.
all_unemployment_data["Y-m"] = all_unemployment_data["Date"].apply(lambda x: x.strftime('%Y-%m'))
# numeric_only=True: the frame still carries the non-numeric Date column.
# Older pandas silently dropped such columns from mean(); pandas >= 2.0
# raises TypeError instead, so the restriction has to be explicit.
all_unemployment_data_monthly = (
    all_unemployment_data
    .groupby(["Y-m", "City"])
    .mean(numeric_only=True)
    .reset_index()
)

all_pollutant_data["Y-m"] = all_pollutant_data["Date"].apply(lambda x: x.strftime('%Y-%m'))
all_pollutant_data_monthly = (
    all_pollutant_data
    .groupby(["Y-m", "City"])
    .mean(numeric_only=True)
    .reset_index()
)

# Inner join: keep only the (month, city) pairs present in both sources.
econclimate = all_pollutant_data_monthly.merge(all_unemployment_data_monthly, on=["Y-m", "City"], how="inner")
# Names of the value columns (everything except the two join keys); later
# cells reuse this Index to label arrays that have lost their column names.
columns = econclimate.drop(["Y-m", "City"], axis=1).columns
econclimate

Unnamed: 0,Y-m,City,CO,NO2,SO2,PM2_5,Estimated Unemployment Rate (%)
0,2018-01,Agartala,956.423048,14.775519,7.417984,139.000760,31.11
1,2018-01,Agra,1278.267823,24.762805,12.173979,174.590756,4.03
2,2018-01,Aizwal,835.838529,4.041054,5.734123,112.508366,5.01
3,2018-01,Amritsar,1446.834901,21.450630,11.055651,189.679878,6.69
4,2018-01,Asansol,1137.709346,24.799873,25.826495,166.614636,8.63
...,...,...,...,...,...,...,...
2836,2022-07,Solapur,241.367882,3.462263,2.022338,3.581522,3.73
2837,2022-07,Srinagar,1478.492269,43.655226,12.104774,61.230544,20.22
2838,2022-07,Thiruvananthapuram,220.199491,1.964161,1.588050,4.816510,4.93
2839,2022-07,Varanasi,680.913750,13.196834,10.045312,45.573409,3.29


In [4]:
# Persist the merged monthly table without the row index.
# os.path.join keeps the destination valid on non-Windows systems, where the
# literal ".\\data\\..." string would be treated as a single odd file name.
econclimate.to_csv(os.path.join(".", "data", "processed_new", "econclimate.csv"), index=False)

In [11]:
# Standardize every value column (the join keys "Y-m" and "City" are removed first).
# NOTE: `econclimate` is rebound from the DataFrame to the array returned by
# fit_transform, so this cell cannot be run a second time without re-running
# the merge cell first; the cells below index `econclimate` as an array.
scaler = StandardScaler()
value_columns_only = econclimate.drop(columns=["Y-m", "City"])
econclimate = scaler.fit_transform(value_columns_only)

In [12]:
# Re-attach the column labels to the scaled array, then show the pairwise
# correlation matrix (DataFrame.corr with its default settings).
scaled_frame = pd.DataFrame(data=econclimate, columns=columns)
scaled_frame.corr()

Unnamed: 0,CO,NO2,SO2,PM2_5,Estimated Unemployment Rate (%)
CO,1.0,0.871977,0.729659,0.897399,0.114483
NO2,0.871977,1.0,0.809418,0.69087,0.221463
SO2,0.729659,0.809418,1.0,0.562644,0.087702
PM2_5,0.897399,0.69087,0.562644,1.0,0.051346
Estimated Unemployment Rate (%),0.114483,0.221463,0.087702,0.051346,1.0


In [13]:
# Split the scaled array column-wise: everything but the last column becomes
# the feature matrix, the last column becomes the regression target.
X, y = econclimate[:, :-1], econclimate[:, -1]

In [14]:
# Fit an ordinary linear model and print each feature name next to its coefficient.
model = LinearRegression().fit(X, y)  # fit() returns the estimator itself
importance = model.coef_
# zip stops at the shorter sequence, so the extra trailing entry in `columns`
# (the target's name) is never paired with a coefficient.
for name, weight in zip(columns, importance):
    print(name, weight)

CO -0.21563644508260735
NO2 0.6608917498370815
SO2 -0.24986433728274599
PM2_5 -0.07114735762759516


In [15]:
# Same linear fit through statsmodels, which also reports standard errors,
# t-statistics and confidence intervals.
# Labelled pandas inputs make the summary list the real column names rather
# than x1..x4 / y, in line with the name/coefficient printouts of the other cells.
exog = pd.DataFrame(X, columns=columns[:-1])
endog = pd.Series(y, name=columns[-1])
# statsmodels does not add an intercept on its own and none is added here:
# X and y both come out of the StandardScaler cell, so a constant term
# should be approximately zero -- TODO confirm if the scaling step changes.
mod = sm.OLS(endog, exog)
fit = mod.fit()
fit.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared (uncentered):,0.095
Dependent Variable:,y,AIC:,7783.3328
Date:,2022-11-27 08:31,BIC:,7807.1404
No. Observations:,2841,Log-Likelihood:,-3887.7
Df Model:,4,F-statistic:,75.41
Df Residuals:,2837,Prob (F-statistic):,7.72e-61
R-squared (uncentered):,0.096,Scale:,0.90516

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
x1,-0.2156,0.0666,-3.2370,0.0012,-0.3463,-0.0850
x2,0.6609,0.0451,14.6606,0.0000,0.5725,0.7493
x3,-0.2499,0.0307,-8.1414,0.0000,-0.3100,-0.1897
x4,-0.0711,0.0450,-1.5827,0.1136,-0.1593,0.0170

0,1,2,3
Omnibus:,1610.929,Durbin-Watson:,1.62
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17313.711
Skew:,2.504,Prob(JB):,0.0
Kurtosis:,14.008,Condition No.:,8.0


In [16]:
# Fit a single regression tree and print each feature's importance score.
# random_state is pinned so that re-running the notebook reproduces the same
# tree (and therefore the same importances) instead of a slightly different one.
model = DecisionTreeRegressor(random_state=42)
model.fit(X, y)
importance = model.feature_importances_
# zip stops at the shorter sequence, so the target's name at the end of
# `columns` is never printed.
for name, score in zip(columns, importance):
    print(name, score)

CO 0.15914828519822488
NO2 0.3040906594659085
SO2 0.33211753072843514
PM2_5 0.20464352460743138


In [17]:
# Fit a random forest and print each feature's importance score.
# The forest draws bootstrap samples at random; without a fixed random_state
# the printed importances change from run to run, so the seed is pinned.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)
importance = model.feature_importances_
# zip stops at the shorter sequence, so the target's name at the end of
# `columns` is never printed.
for name, score in zip(columns, importance):
    print(name, score)

CO 0.17984746178799432
NO2 0.29376413639404336
SO2 0.3357605904503797
PM2_5 0.19062781136758267


In [18]:
# Fit a gradient-boosted tree ensemble with default settings and print the
# importance score it assigns to each feature.
model = XGBRegressor()
model.fit(X, y)
importance = model.feature_importances_
# Walk the scores by position and look the matching label up in `columns`.
for position, score in enumerate(importance):
    print(columns[position], score)

CO 0.13198125
NO2 0.29829365
SO2 0.32388422
PM2_5 0.24584088
