##### Importing libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import plotly.express as px
import plotly

from sklearn.preprocessing import LabelEncoder

import pickle

from math import sqrt

# Function for splitting training and test set
from sklearn.model_selection import train_test_split

# Function to perform data standardization 
from sklearn.preprocessing import StandardScaler

# Import classes for ML Models
from sklearn.linear_model import Ridge  ## Linear Regression + L2 regularization
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


# Evaluation Metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix


# To save the final model on disk.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23,
# so prefer the standalone package and fall back only on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)  

ModuleNotFoundError: No module named 'mlxtend'

## Exploratory Data Analysis, Data Preprocessing, and data Visualization

Reading CSV file data


In [None]:
# Load the dataset from the CSV file in the current working directory.
df = pd.read_csv("wbengal.csv")

 Displaying first five rows.

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

Displaying all the columns of the dataset.

In [None]:
# Column labels of the raw frame.
df.columns

In [None]:
# Distinct values in the `state` column.
df['state'].unique()

As we can see, the `Unnamed: 0` and `date` attributes are not useful for the prediction. The state is West Bengal throughout the data, so the `state` column carries no information either. We will therefore drop all three columns.

In [None]:
# Remove the columns that carry no predictive information.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'date', 'state'], errors="ignore")

In [None]:
# Column labels remaining after the drop.
df.columns

In [None]:
# (rows, columns) after the drop.
df.shape

In [None]:
# Data type of every remaining column.
df.dtypes

As you can see, `location`, `type` and `AQI_Range` are the only columns with the object data type. All the others hold float values.

In [None]:
# Number of rows per AQI range category.
df['AQI_Range'].value_counts()

In [None]:
# Print the number of distinct locations, then display the distinct values themselves.
print(df['location'].nunique())
df['location'].unique()

There are a total of 17 unique locations in the dataset.

In [None]:
# Print the number of distinct area types, then display the distinct values themselves.
print(df['type'].nunique())
df['type'].unique()

* As you can see, 'Industrial Area', 'Industrial', and 'Industrial Areas' mean the same thing, so we can assign the same label to all of them. This reduces the number of unique values of `type`.
* Likewise, 'Residential', 'Residential, Rural and other Areas', and 'Residential and others' can be combined into one group.

In [None]:
# Collapse spelling variants of the same area type into a single label each.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# `df['type']` is chained in-place mutation, which under pandas Copy-on-Write only
# changes a temporary object and leaves `df` untouched.
df['type'] = df['type'].replace({
    'Industrial Areas': 'Industrial Area',
    'Industrial': 'Industrial Area',
    'Residential, Rural and other Areas': 'Residential',
    'Residential and others': 'Residential',
})

In [None]:
# Re-check the distinct area types after merging the spelling variants.
print(df['type'].nunique())
df['type'].unique()

In [None]:
# Number of rows recorded for each location.
df['location'].value_counts()

In [None]:
# Count plot: number of records in each AQI range.
ax = sns.countplot(data=df, x='AQI_Range', ax=plt.figure(figsize=(10, 10)).gca())
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.tight_layout()
ax.set_title("Count AQI Ranges")
ax.set_xlabel("AQI Range")
ax.set_ylabel("Count")
plt.show()

Countplot by each location of West Bengal.

In [None]:
# Count plot: number of records for each location.
ax = sns.countplot(data=df, x='location', ax=plt.figure(figsize=(10, 10)).gca())
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.tight_layout()
ax.set_title("Count of each location")
ax.set_xlabel("Locations")
ax.set_ylabel("Count")
plt.show()

Barplot of AQI for each location in WB.

In [None]:
# Bar plot of AQI per location (seaborn aggregates the rows of each location into one bar).
ax = sns.barplot(data=df, x='location', y="AQI", ax=plt.figure(figsize=(10, 10)).gca())
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.tight_layout()
ax.set_title("Barplot of AQI by each location in WB")
ax.set_xlabel("Locations")
ax.set_ylabel("AQI Values")
plt.show()

As you can see from the plot above, the DURGAPUR (WB) location has the highest average AQI value.

Barplot of AQI by each type in WB.

In [None]:
# Bar plot of AQI per area type (seaborn aggregates the rows of each type into one bar).
ax = sns.barplot(data=df, x='type', y="AQI", ax=plt.figure(figsize=(10, 10)).gca())
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.tight_layout()
ax.set_title("Barplot of AQI by each type in WB")
ax.set_xlabel("Type")
ax.set_ylabel("AQI Values")
plt.show()

The Industrial type has the highest average AQI value.

In [None]:
# Total AQI per location, largest first.
# Only the grouping key is filled with 'NA': filling the whole frame would turn a
# missing numeric AQI into the string 'NA' and break the sum. The intermediate
# (location, type) grouping and its max() were no-ops, since every group of that
# second groupby held exactly one value, so the sum is taken per location directly.
df_plot = (
    df.assign(location=df['location'].fillna('NA'))
      .groupby('location')['AQI']
      .sum()
      .sort_values(ascending=False)
)
top_count1 = df_plot.to_frame()
fig_reg = px.bar(top_count1, x=top_count1.index, y='AQI', color='AQI')
fig_reg.update_layout(
    title="AQI Values by each location in west bengal",
    xaxis_title=" Locations",
    yaxis_title="AQI Values ",
)
fig_reg.show(renderer="notebook")

In [None]:
# Total AQI per area type, largest first.
# Only the grouping key is filled with 'NA': filling the whole frame would turn a
# missing numeric AQI into the string 'NA' and break the sum. The intermediate
# (location, type) grouping and its max() were no-ops, and head(17) was a leftover
# from the per-location plot, so the sum is taken per type directly.
df_plot = (
    df.assign(type=df['type'].fillna('NA'))
      .groupby('type')['AQI']
      .sum()
      .sort_values(ascending=False)
)
top_count1 = df_plot.to_frame()
fig_reg = px.bar(top_count1, x=top_count1.index, y='AQI', color='AQI')
fig_reg.update_layout(
    title="AQI Values by each type in west bengal",
    xaxis_title=" Type",
    yaxis_title="AQI Values ",
)
fig_reg.show(renderer="notebook")

Plotting AQI Range against AQI values for a few selected locations.

In [None]:
# HALDIA rows ordered from the highest AQI to the lowest.
sort = df.loc[df["location"] == "HALDIA"].sort_values(by="AQI", ascending=False)

In [None]:
# Line plot of AQI range against AQI value for the rows currently held in `sort`.
fig = px.line(
    sort,
    x="AQI",
    y="AQI_Range",
    color='AQI_Range',
    title='AQI Range and AQI values by HALDIA location',
).update_layout(xaxis_title=" AQI ", yaxis_title=" AQI Range")
fig.show(renderer="notebook")

As we can see, AQI values below 50 fall in the Good AQI range, while AQI values above 400 fall in the Hazardous range. The boundaries of the other ranges can be read from the plot above in the same way.

In [None]:
# CALCUTTA rows ordered from the highest AQI to the lowest, then plotted as AQI range vs. AQI.
sort = df.loc[df["location"] == "CALCUTTA"].sort_values(by="AQI", ascending=False)
fig = px.line(
    sort,
    x="AQI",
    y="AQI_Range",
    color='AQI_Range',
    title='AQI Range and AQI values by CALCUTTA location',
).update_layout(xaxis_title=" AQI ", yaxis_title=" AQI Range")
fig.show(renderer="notebook")

For the CALCUTTA location, AQI values in the Hazardous range can exceed 800.

In [None]:
# ULUBERIA rows ordered from the highest AQI to the lowest, then plotted as AQI range vs. AQI.
sort = df.loc[df["location"] == "ULUBERIA"].sort_values(by="AQI", ascending=False)
fig = px.line(
    sort,
    x="AQI",
    y="AQI_Range",
    color='AQI_Range',
    title='AQI Range and AQI values by ULUBERIA location',
).update_layout(xaxis_title=" AQI ", yaxis_title=" AQI Range")
fig.show(renderer="notebook")

For the ULUBERIA location, the AQI range is Poor for all AQI values, and the AQI values do not exceed 200.

Plot of so2 and no2 values by each location in west bengal.

In [None]:
# Scatter of so2 against no2, one colour per location; rows are ordered by descending AQI.
sort = df.sort_values(by="AQI", ascending=False)
fig = px.scatter(
    sort,
    x="so2",
    y="no2",
    color='location',
    title='so2 and no2 values by each location in west bengal',
).update_layout(xaxis_title=" so2 values ", yaxis_title="no2 values")
fig.show(renderer="notebook")

As we can see, for every location most of the so2 and no2 values are below 150.

Plot of so2 and no2 values by each type in west bengal.

In [None]:
# Scatter of so2 against no2, one colour per area type; rows are ordered by descending AQI.
sort = df.sort_values(by="AQI", ascending=False)
fig = px.scatter(
    sort,
    x="so2",
    y="no2",
    color='type',
    title='so2 and no2 values by each type in west bengal',
).update_layout(xaxis_title=" so2 values ", yaxis_title="no2 values")
fig.show(renderer="notebook")

Most of the values of so2 and no2 are due to the Industrial Area.

In [None]:
# Line plot of AQI range against AQI value, one line per location; rows are ordered by descending AQI.
sort = df.sort_values(by="AQI", ascending=False)
fig = px.line(
    sort,
    x="AQI",
    y="AQI_Range",
    color='location',
    title='AQI Range and AQI values by each location in west bengal',
).update_layout(xaxis_title=" AQI values ", yaxis_title="AQI Range")
fig.show(renderer="notebook")

Checking for the null values in the dataset.

In [None]:
# Missing-value count per column (`isna` is the same check as `isnull`).
df.isna().sum()

##### Label Encoding

There are no missing values in our dataset, so every field is filled. As we have seen above, `AQI_Range`, `location` and `type` are categorical features, so we need to convert them into numbers.

In [None]:
# Learn the integer code of every area type, then overwrite the column with those codes.
label_en1 = LabelEncoder().fit(df['type'])
df['type'] = label_en1.transform(df['type'])

In [None]:
# Original category names learned by the `type` encoder.
label_en1.classes_

In [None]:
# Show the integer code assigned to each listed area type.
label_en1.transform(['Industrial Area', 'RIRUO', 'Residential'])

In [None]:
# Learn the integer code of every location, overwrite the column with those codes,
# and print the original location names known to the encoder.
label_en2 = LabelEncoder().fit(df['location'])
df['location'] = label_en2.transform(df['location'])
print(label_en2.classes_)

In [None]:
# Show the integer code assigned to each listed location name.
label_en2.transform(['ASANSOL', 'BARRACKPORE', 'BARUIPUR', 'CALCUTTA', 'DANKUNI', 'DURGAPUR',
                     'DURGAPUR (WB)', 'HALDIA', 'HOWRAH', 'KALYANI', 'KOLKATA', 'MALDAH', 'RANIGANJ',
                     'SANKRAIL', 'SILIGURI', 'SOUTH SUBURBAN', 'ULUBERIA'])

In [None]:
# Learn the integer code of every AQI range, overwrite the column with those codes,
# and print the original range names known to the encoder.
label_en3 = LabelEncoder().fit(df['AQI_Range'])
df['AQI_Range'] = label_en3.transform(df['AQI_Range'])
print(label_en3.classes_)

In [None]:
# Show the integer code assigned to each listed AQI range name.
label_en3.transform(['Good', 'Hazardous', 'Moderate', 'Poor', 'Unhealthy', 'Very unhealthy'])

In [None]:
# Column data types after label encoding.
df.dtypes

This confirms that all the features now contain numeric values.

Saving all the 3 encoders to the disk for the future use.

In [None]:
# Persist the three fitted label encoders so the same category-to-integer mapping
# can be reused later. `with` guarantees each file is closed even if pickling fails.
for encoder, path in (
    (label_en1, 'Type_encoder.pkl'),
    (label_en2, 'Location_encoder.pkl'),
    (label_en3, 'AQI_Range_encoder.pkl'),
):
    with open(path, 'wb') as output:
        pickle.dump(encoder, output)

Checking the encoder saved to disk by loading it again. This is just for cross-verification.

In [None]:
# Reload the `type` encoder from disk; `with` closes the file even if unpickling fails.
# NOTE: pickle.load executes arbitrary code from the file, so only load pickles that
# come from a trusted source (this one is written by this notebook).
with open('Type_encoder.pkl', 'rb') as pkl_file:
    label = pickle.load(pkl_file)

In [None]:
# The reloaded encoder should give the same codes as `label_en1` for these names.
label.transform(['Industrial Area', 'RIRUO', 'Residential'])

In [None]:
# Preview the frame after label encoding.
df.head()

Separating target feature

In [None]:
# Feature matrix = every column except the two targets.
# y1 is the regression target (AQI); y2 is the classification target (AQI_Range).
target_columns = ['AQI', 'AQI_Range']
X = df.drop(columns=target_columns)
y1 = df['AQI']
y2 = df['AQI_Range']

#### Data standardization
* In Data Standardization we perform zero mean centring and unit scaling; i.e. we make the mean of all the features as zero and the standard deviation as 1.
* Thus we use **mean** and **standard deviation** of each feature.
* It is very important to save the **mean** and **standard deviation** for each of the feature from the **training set**, because we use the same mean and standard deviation in the test set.

In [None]:
# Fit the scaler on the training rows only, so the held-out rows never influence
# the stored mean / standard deviation (no train-test leakage).
# train_test_split's shuffle depends only on the number of rows and random_state,
# so splitting the row positions here with the same test_size / random_state as the
# later X/y splits selects exactly the rows those splits put in the training set.
X_frame = pd.DataFrame(X)  # also keeps the cell re-runnable once X is an ndarray
train_rows, _test_rows = train_test_split(
    np.arange(len(X_frame)), test_size=0.2, random_state=1234
)
a = StandardScaler()
a.fit(X_frame.iloc[train_rows])
X = a.transform(X_frame)
# NOTE(review): the fitted scaler `a` is not persisted alongside the encoders and
# models; predictions on new data need the same scaling applied first.

In [None]:
# Per-feature mean of the standardized matrix (each entry should be close to 0).
# axis=0 checks every column separately instead of collapsing the matrix to one number.
X.mean(axis=0)

In [None]:
# Per-feature standard deviation of the standardized matrix (each entry should be close to 1).
# axis=0 checks every column separately instead of collapsing the matrix to one number.
X.std(axis=0)

#### Train and test split

As we have to develop models that predict two outcomes, we will build a separate model for each outcome and split the data separately for each.

In [None]:
# Hold out 20% of the rows as the test set for the AQI (regression) target.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X,
    y1,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Shapes of the four pieces of the regression split, printed on one line.
print(*(part.shape for part in (X1_train, X1_test, y1_train, y1_test)))

In [None]:
# Hold out 20% of the rows as the test set for the AQI_Range (classification) target.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X,
    y2,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Shapes of the four pieces of the classification split, printed on one line.
print(*(part.shape for part in (X2_train, X2_test, y2_train, y2_test)))

### Machine Learning Models for AQI values (Regression problem).

##### 1. Ridge Regression

In [None]:
# Ridge regression with default settings, fitted on the regression training split.
# `fit` returns the estimator itself, which is displayed as the cell output.
ridge1 = Ridge().fit(X1_train, y1_train)
ridge1

In [None]:
# Predict on the held-out rows and report RMSE, R-squared and MAE.
y_pred = ridge1.predict(X1_test)

rmse_value = sqrt(mse(y1_test, y_pred))
r2_value = r2_score(y1_test, y_pred)
mae_value = mae(y1_test, y_pred)

print("Validation Results for Ridge Regression:")
print("*******************************")
print("Root mean squared error: ", rmse_value)
print("R-squared: ", r2_value)
print("Mean Absolute Error: ", mae_value)

##### 2. Random Forest

In [None]:
# Random forest regressor fitted on the regression training split.
# A fixed random_state makes the bootstrap samples and feature choices, and
# therefore the reported metrics, reproducible across runs.
random1 = RandomForestRegressor(random_state=1234)
random1.fit(X1_train, y1_train)

In [None]:
# Predict on the held-out rows and report RMSE, R-squared and MAE.
y_pred = random1.predict(X1_test)

rmse_value = sqrt(mse(y1_test, y_pred))
r2_value = r2_score(y1_test, y_pred)
mae_value = mae(y1_test, y_pred)

print("Validation Results for Random Forest Regression:")
print("*******************************")
print("Root mean squared error: ", rmse_value)
print("R-squared: ", r2_value)
print("Mean Absolute Error: ", mae_value)

##### 3. XGBoost

In [None]:
# XGBoost regressor with 1000 boosting rounds, fitted on the regression training split.
# `fit` returns the estimator itself, which is displayed as the cell output.
xgb1 = XGBRegressor(n_estimators=1000).fit(X1_train, y1_train)
xgb1

In [None]:
# Predict on the held-out rows and report RMSE, R-squared and MAE.
y_pred = xgb1.predict(X1_test)

rmse_value = sqrt(mse(y1_test, y_pred))
r2_value = r2_score(y1_test, y_pred)
mae_value = mae(y1_test, y_pred)

print("Validation Results for Xgboost Regression:")
print("*******************************")
print("Root mean squared error: ", rmse_value)
print("R-squared: ", r2_value)
print("Mean Absolute Error: ", mae_value)


From the performance of the three models, we can say that XGBoost has performed the best, so we save that model to disk.

In [None]:
# Write the fitted XGBoost regressor to disk; the list of written file names is the cell output.
joblib.dump(value=xgb1, filename='AQI_model.pkl')

### Machine Learning Models for AQI Range (Classification problem).

##### 1. KNN

In [None]:
# K-nearest-neighbours classifier with default settings, fitted on the classification training split.
# `fit` returns the estimator itself, which is displayed as the cell output.
knn = KNeighborsClassifier().fit(X2_train, y2_train)
knn

In [None]:
# Accuracy of the KNN classifier on the held-out rows, reported as a percentage.
accuracy = knn.score(X=X2_test, y=y2_test)
accuracy_percent = accuracy * 100
print("accuracy = ", accuracy_percent, "%")

In [None]:
# Predicted (label-encoded) AQI range of every test row, from the fitted KNN model.
y_pred = knn.predict(X2_test)

Confusion Matrix

In [None]:
# Transposed confusion matrix: rows are predicted classes, columns are true classes.
confusion_matrix(y2_test, y_pred).transpose()

Plotting confusion matrix with proper labels.

In [None]:
# Confusion matrix of the current predictions, with readable class names on both axes.
# The label set is pinned to the encoder's classes so the matrix always has one
# row/column per class, and the tick labels stay aligned even when a class is
# absent from y2_test / y_pred.
class_names = list(label_en3.classes_)
cm = confusion_matrix(y2_test, y_pred, labels=list(range(len(class_names))))
plot_confusion_matrix(cm, figsize=(10, 5))
plt.xticks(range(len(class_names)), class_names, fontsize=16, rotation=90)
plt.yticks(range(len(class_names)), class_names, fontsize=16)
plt.show()

##### 2. Logistic Regression

In [None]:
# Logistic regression with default settings, fitted on the classification training split.
# `fit` returns the estimator itself, which is displayed as the cell output.
logreg = LogisticRegression().fit(X2_train, y2_train)
logreg

In [None]:
# Predicted (label-encoded) AQI range of every test row, from the fitted logistic regression.
y_pred = logreg.predict(X2_test)

In [None]:
# Transposed confusion matrix: rows are predicted classes, columns are true classes.
confusion_matrix(y2_test, y_pred).transpose()

In [None]:
# Confusion matrix of the current predictions, with readable class names on both axes.
# The label set is pinned to the encoder's classes so the matrix always has one
# row/column per class, and the tick labels stay aligned even when a class is
# absent from y2_test / y_pred.
class_names = list(label_en3.classes_)
cm = confusion_matrix(y2_test, y_pred, labels=list(range(len(class_names))))
plot_confusion_matrix(cm, figsize=(10, 5))
plt.xticks(range(len(class_names)), class_names, fontsize=16, rotation=90)
plt.yticks(range(len(class_names)), class_names, fontsize=16)
plt.show()

##### 3. Gradient Boosting

In [None]:
# Gradient boosting classifier with a fixed seed, fitted on the classification training split.
# `fit` returns the estimator itself, which is displayed as the cell output.
gb = GradientBoostingClassifier(random_state=0).fit(X2_train, y2_train)
gb

In [None]:
# Predicted (label-encoded) AQI range of every test row, from the fitted gradient boosting model.
y_pred = gb.predict(X2_test)

In [None]:
# Transposed confusion matrix: rows are predicted classes, columns are true classes.
confusion_matrix(y2_test, y_pred).transpose()

In [None]:
# Confusion matrix of the current predictions, with readable class names on both axes.
# The label set is pinned to the encoder's classes so the matrix always has one
# row/column per class, and the tick labels stay aligned even when a class is
# absent from y2_test / y_pred.
class_names = list(label_en3.classes_)
cm = confusion_matrix(y2_test, y_pred, labels=list(range(len(class_names))))
plot_confusion_matrix(cm, figsize=(10, 5))
plt.xticks(range(len(class_names)), class_names, fontsize=16, rotation=90)
plt.yticks(range(len(class_names)), class_names, fontsize=16)
plt.show()

Looking at the confusion matrices, we can say that gradient boosting has performed well, so we will save it to disk for future use.

In [None]:
# Write the fitted gradient boosting classifier to disk; the list of written file names is the cell output.
joblib.dump(value=gb, filename='AQI_Range_model.pkl')

In [None]:
# Type of the feature matrix after scaling (the StandardScaler output).
type(X)

In [None]:
# Preview the last rows of the label-encoded frame.
df.tail()