<h1 align = 'center'>Air Quality</h1>
<br>
<h3 align = 'center'>Author - Naman Talwar</h3>
<br>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the CSV file into a DataFrame and preview its first rows.
# The path is relative, so the file must sit in the notebook's working directory.
data = pd.read_csv("air-quality-india.csv")
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe().style.background_gradient(cmap = "Blues")

In [None]:
data.isnull().sum()

In [None]:
data.columns

In [None]:
# Plot PM2.5 against the measurement time.
# The original call passed no `x`, so the values were drawn against the row
# index while the axis was labelled "Timestamp". The Timestamp column is parsed
# on a temporary copy (`assign`), leaving `data` itself untouched for later cells.
ax = (
    data.assign(Timestamp=pd.to_datetime(data["Timestamp"]))
    .plot(x="Timestamp", y="PM2.5", figsize=(15, 6))
)

ax.set_xlabel("Timestamp")
ax.set_ylabel("Particulate Matter 2.5")
ax.set_title("PM2.5 With Respect to Time")

plt.show()

<br>
<h2>Distribution of Particulate Matter by Month and Year</h2>


In [None]:
import plotly.express as px

In [None]:
# 3-D scatter of PM2.5 over year and month; marker colour follows the PM2.5 value.
fig = px.scatter_3d(
    data,
    x="Year",
    y="Month",
    z="PM2.5",
    color="PM2.5",
    color_continuous_scale=["#00FF00", "#FFC800", "#FF0000", "#B803BF"],
    range_color=(-45, 225),
)
fig.update_traces(marker={"size": 3.5})
fig.update_layout(template="plotly_dark", font={"family": "PT Sans", "size": 12})
fig.show()

<br>
<h2>Distribution of Particulate Matter by Day and Month</h2>

In [None]:
# 3-D scatter of PM2.5 over month and day of month; marker colour follows the PM2.5 value.
fig = px.scatter_3d(
    data,
    x="Month",
    y="Day",
    z="PM2.5",
    color="PM2.5",
    color_continuous_scale=["#00FF00", "#FFC800", "#FF0000", "#B803BF"],
    range_color=(-45, 225),
)
fig.update_traces(marker={"size": 2.5})
fig.update_layout(template="plotly_dark", font={"family": "PT Sans", "size": 12})
fig.show()

<br>
<h2>Distribution of Particulate Matter by Day and Month - Hourwise Animation</h2>

In [None]:
# Animated 3-D scatter over month and day: one animation frame per value of "Hour".
# The z axis is pinned to 0-250 so the scale does not jump between frames.
fig = px.scatter_3d(
    data,
    x="Month",
    y="Day",
    z="PM2.5",
    range_z=[0, 250],
    color="PM2.5",
    color_continuous_scale=["#00FF00", "#FFC800", "#FF0000", "#B803BF"],
    range_color=(-45, 225),
    animation_frame="Hour",
)
fig.update_traces(marker={"size": 3.5})
fig.update_layout(template="plotly_dark", font={"family": "PT Sans", "size": 12})
fig.show()

<br>
<h2>Distribution of Particulate Matter by Hour and Day</h2>

In [None]:
# 3-D scatter of PM2.5 over day of month and hour; marker colour follows the PM2.5 value.
fig = px.scatter_3d(
    data,
    x="Day",
    y="Hour",
    z="PM2.5",
    color="PM2.5",
    color_continuous_scale=["#00FF00", "#FFC800", "#FF0000", "#B803BF"],
    range_color=(-45, 225),
)
fig.update_traces(marker={"size": 2.5})
fig.update_layout(template="plotly_dark", font={"family": "PT Sans", "size": 12})
fig.show()

<br>
<h2>Distribution of Particulate Matter by Day and Year - Hourwise Animation</h2>

In [None]:
# Animated 3-D scatter over year and day: one animation frame per value of "Hour".
fig = px.scatter_3d(
    data,
    x="Year",
    y="Day",
    z="PM2.5",
    range_z=[0, 250],
    color="PM2.5",
    color_continuous_scale=["#00FF00", "#FFC800", "#FF0000", "#B803BF"],
    range_color=(-45, 225),  # limits of the colour bar
    animation_frame="Hour",
)
fig.update_traces(marker={"size": 3.5})  # shrink the markers
fig.update_layout(template="plotly_dark", font={"family": "PT Sans", "size": 12})
fig.show()

<br>
<h2>Boxplot of Particulate Matter Distribution by Year</h2>


In [None]:
# Notched box plot of PM2.5 per year, with every observation drawn next to its box.
fig = px.box(
    data,
    x="Year",
    y="PM2.5",
    color="Year",
    points="all",
    notched=True,
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_layout(template="plotly_dark")
fig.show()

In [None]:
# PM2.5 aggregated per year, with the bars coloured by month.
fig = px.histogram(
    data,
    x="Year",
    y="PM2.5",
    color="Month",
    hover_data=data.columns,
    color_discrete_sequence=px.colors.qualitative.Set3,
    title="Sum of PM2.5 Distribution by Year and Month",
)
fig.update_layout(template="plotly_dark")
fig.show()

In [None]:
# Box plot of PM2.5 per month, with every observation drawn next to its box.
fig = px.box(
    data,
    x="Month",
    y="PM2.5",
    color="Month",
    points="all",
    color_discrete_sequence=px.colors.qualitative.Set3,
    title="PM2.5 Distribution by Month",
)
fig.update_layout(template="plotly_dark")
fig.show()

In [None]:
# PM2.5 aggregated per month, with the bars coloured by year.
fig = px.histogram(
    data,
    x="Month",
    y="PM2.5",
    color="Year",
    hover_data=data.columns,
    color_discrete_sequence=px.colors.qualitative.Set3,
    title="Sum of PM2.5 Distribution by Month and Year",
)
fig.update_layout(template="plotly_dark")
fig.show()

In [None]:
# Filled, labelled density contours of PM2.5 against month; y axis clipped to 0-115.
fig = px.density_contour(
    data,
    x="Month",
    y="PM2.5",
    title="PM2.5 Density by Month",
)
fig.update_layout(yaxis_range=[0, 115], template="plotly_dark")
fig.update_traces(contours_coloring="fill", contours_showlabels=True)
fig.show()

In [None]:
# Restrict the analysis to 2018-2021: 2017 and 2022 do not contain data for
# every month, so they are excluded from the per-month comparisons.
df1 = data[data["Year"] > 2017]
df2 = df1[df1["Year"] < 2022]

# One figure per calendar month: PM2.5 per day of month, coloured by year.
# The y range is fixed so the twelve figures share the same scale.
for month in range(1, 13):
    month_rows = df2[df2["Month"] == month]

    fig = px.histogram(
        month_rows,
        x="Day",
        y="PM2.5",
        color="Year",
        hover_data=data.columns,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        title=f"Daily Sum of PM2.5 in Month {month} (2018-2021)",
    )
    fig.update_layout(yaxis_range=[0, 11000], template="plotly_dark")
    fig.show()

In [None]:
# One figure per year (2018-2021): PM2.5 per hour of day, coloured by month.
# The y range is fixed so the four figures share the same scale.
for year in range(2018, 2022):
    year_rows = df2.loc[df2["Year"] == year]

    fig = px.histogram(
        year_rows,
        x="Hour",
        y="PM2.5",
        color="Month",
        hover_data=data.columns,
        color_discrete_sequence=px.colors.qualitative.Set3,
        title=f"Total Sum of PM2.5 in Year: {year} by Hour",
    )
    fig.update_layout(yaxis_range=[0, 21000], template="plotly_dark")
    fig.show()

In [None]:
df2.head()

In [None]:
df2.info()

In [None]:
# Parse the Timestamp column once and split it into a calendar-date column and
# a time-of-day column.
# `assign` returns a new frame, so this no longer writes into a filtered slice
# of `data`. The original column assignment triggered pandas'
# SettingWithCopyWarning, which the global warnings filter was hiding.
timestamps = pd.to_datetime(df2["Timestamp"])
df2 = df2.assign(Date=timestamps.dt.date, Time=timestamps.dt.time)

In [None]:
df2.head()

In [None]:
# The raw Timestamp column is no longer needed; rebind df2 without it and preview.
df2 = df2.drop(columns="Timestamp")
df2.head()

In [None]:
df2.tail()

In [None]:
# Feature matrix: the four calendar components used as predictors.
feature_columns = ["Year", "Month", "Day", "Hour"]
X = df2[feature_columns]
X.head()

In [None]:
# Target as a 1-D Series.
# The original `df2.loc[:, ["PM2.5"]]` produced an (n, 1) DataFrame. Estimators
# such as RandomForestRegressor and SVR expect a 1-D target and emit a
# DataConversionWarning for a column vector, which was only invisible because
# of the global warnings filter.
y = df2["PM2.5"]
y.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the split is random rather than chronological, so test rows are
# interleaved in time with training rows - confirm that is intended.
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

<h4>Linear Regression</h4>


In [None]:
# Ordinary least-squares baseline, fitted on the training split.
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [None]:
y_pred = reg.predict(X_test)

In [None]:
reg.score(X_train,y_train)

In [None]:
reg.score(X_test,y_test)

In [None]:
from sklearn import metrics

In [None]:
# Test-set error of the linear-regression predictions; MSE is computed once and
# reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

In [None]:
print("Score: ",metrics.r2_score(y_test,y_pred))

<h4>Random Forest</h4>

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forests are stochastic (bootstrap samples, feature sub-sampling).
# Without a seed every run gives slightly different scores, so the model
# comparison below could not be reproduced. The seed matches train_test_split.
reg_rf = RandomForestRegressor(random_state = 42)
reg_rf.fit(X_train,y_train)

In [None]:
y1_pred = reg_rf.predict(X_test)

In [None]:
reg_rf.score(X_train,y_train)

In [None]:
reg_rf.score(X_test,y_test)

In [None]:
# Test-set error of the random-forest predictions; MSE is computed once and
# reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y1_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y1_pred))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

In [None]:
print("Score: ",metrics.r2_score(y_test,y1_pred))

<h4>SVR</h4>

In [None]:
# Support-vector regression with the features standardised first. Because the
# scaler is a pipeline step, it is fitted on the data passed to `fit` (the
# training split) and then reused unchanged at prediction time.
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
reg_svr = make_pipeline(StandardScaler(), SVR(C= 1, epsilon = 0.2))
reg_svr.fit(X_train,y_train)

In [None]:
y3_pred = reg_svr.predict(X_test)

In [None]:
reg_svr.score(X_train,y_train)

In [None]:
reg_svr.score(X_test,y_test)

In [None]:
# Test-set error of the SVR predictions; MSE is computed once and reused for
# the RMSE line.
mse = metrics.mean_squared_error(y_test, y3_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y3_pred))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

In [None]:
print("Score: ", metrics.r2_score(y_test,y3_pred))

<h4>Lasso Regression</h4>

In [None]:
# L1-regularised linear regression with penalty strength alpha = 1.0, fitted on
# the training split.
from sklearn.linear_model import Lasso
reg_lr = Lasso(alpha = 1.0)
reg_lr.fit(X_train,y_train)

In [None]:
y4_pred = reg_lr.predict(X_test)

In [None]:
reg_lr.score(X_train,y_train)

In [None]:
reg_lr.score(X_test,y_test)

In [None]:
# Test-set error of the Lasso predictions; MSE is computed once and reused for
# the RMSE line.
mse = metrics.mean_squared_error(y_test, y4_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y4_pred))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

In [None]:
print("Score: ",metrics.r2_score(y_test,y4_pred))

<h4>Ridge Regression</h4>

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

In [None]:
# `Ridge(normalize=True)` no longer works: the `normalize` argument was
# deprecated in scikit-learn 1.0 and removed in 1.2, so the original cell raises
# TypeError on current releases. Scaling is now an explicit pipeline step, which
# also means it is re-fitted inside every cross-validation fold.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

ridge = make_pipeline(StandardScaler(), Ridge())

# make_pipeline names the final step "ridge", hence the "ridge__alpha" key.
# NOTE(review): StandardScaler divides by the standard deviation whereas the old
# `normalize=True` divided by the L2 norm, so the selected alpha is not directly
# comparable with values obtained from the original notebook.
search = GridSearchCV(estimator=ridge,
                      param_grid={'ridge__alpha': np.logspace(-5, 2, 8)},
                      scoring='neg_mean_squared_error', n_jobs=1, refit=True, cv=10)

In [None]:
# Run the cross-validated grid search on the training split and display the
# best-scoring parameter setting.
search.fit(X_train,y_train)
search.best_params_

In [None]:
# Take the model selected by the grid search instead of re-typing alpha by hand
# and passing `normalize=True`, which scikit-learn 1.2+ rejects with TypeError.
# The search was created with refit=True, so `best_estimator_` has already been
# refitted on the whole training split and needs no further `fit` call.
ridge = search.best_estimator_
ridge

In [None]:
y5_pred = ridge.predict(X_test)

In [None]:
ridge.score(X_train,y_train)

In [None]:
ridge.score(X_test,y_test)

In [None]:
print("Score: ",metrics.r2_score(y_test,y5_pred))

In [None]:
# Test-set error of the ridge predictions; MSE is computed once and reused for
# the RMSE line.
mse = metrics.mean_squared_error(y_test, y5_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y5_pred))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

<h4>Decision Tree</h4>

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so its scores are reproducible: scikit-learn breaks ties between
# equally good splits at random, so an unseeded tree can differ between runs.
# The seed matches train_test_split.
reg_dt = DecisionTreeRegressor(random_state = 42)
reg_dt.fit(X_train,y_train)

In [None]:
y2_pred = reg_dt.predict(X_test)

In [None]:
reg_dt.score(X_train,y_train)

In [None]:
reg_dt.score(X_test,y_test)

In [None]:
# Test-set error of the decision-tree predictions; MSE is computed once and
# reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y2_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y2_pred))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

In [None]:
print("Score: ",metrics.r2_score(y_test,y2_pred))

<h4>Bayesian Regression</h4>

In [None]:
# Bayesian ridge regression with the estimator's default settings, fitted on
# the training split.
from sklearn.linear_model import BayesianRidge
reg_bay = BayesianRidge()
reg_bay.fit(X_train, y_train)

In [None]:
y7_pred = reg_bay.predict(X_test)

In [None]:
reg_bay.score(X_train,y_train)

In [None]:
reg_bay.score(X_test,y_test)

In [None]:
print("Score: ",metrics.r2_score(y_test,y7_pred))

In [None]:
# Test-set error of the Bayesian-ridge predictions; MSE is computed once and
# reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y7_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y7_pred))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

<h4>We can see that Random Forest Regression is the best model for this data</h4>

<h3>Hyperparameter Tuning</h3>

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for the random-forest hyperparameter search.

# Number of trees: five evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Features considered per split. 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3 (it now raises an invalid-parameter error). For regressors it
# meant "use all features", which is spelled 1.0 and works on old and new
# releases alike.
max_features = [1.0, 'sqrt']
# Maximum tree depth: five evenly spaced values from 10 to 110, plus None,
# which leaves the depth unlimited.
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples needed to split an internal node.
min_samples_split = [2, 5, 8]
# Minimum number of samples required at a leaf.
min_samples_leaf = [1, 2, 5]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

In [None]:
# Search space for RandomizedSearchCV, keyed by RandomForestRegressor parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Randomised search over `random_grid`: 50 sampled parameter combinations, each
# scored with 3-fold cross-validation. random_state=1 fixes which combinations
# are drawn; n_jobs=-1 runs the fits in parallel on all available cores.
# NOTE(review): with up to 2000 trees per candidate this is the most expensive
# cell in the notebook - expect a long run time.
rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=1, n_jobs = -1)

In [None]:
rf_random.fit(X_train,y_train)

In [None]:
rf_random.best_params_

In [None]:
prediction = rf_random.predict(X_test)

In [None]:
# Actual vs. predicted PM2.5 on the test split; points near the diagonal are
# good predictions.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 8))
scatter_ax.scatter(y_test, prediction, alpha=0.5)
scatter_ax.set_xlabel("y_test", fontsize=12)
scatter_ax.set_ylabel('y_pred', fontsize=12)
plt.show()

In [None]:
# Test-set error of the tuned random forest; MSE is computed once and reused
# for the RMSE line.
mse = metrics.mean_squared_error(y_test, prediction)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, prediction))
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", np.sqrt(mse))

In [None]:
print("R2 Score: ",metrics.r2_score(y_test,prediction))