In [1]:
#import neccessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('C:/Users/Maedeh/Downloads/psi_df_2016_2019.csv') #import the dataset

In [3]:
df.head()

Unnamed: 0,national,south,north,east,central,west,timestamp
0,47,44,37,47,47,34,2016-02-07T18:00:00+08:00
1,59,57,54,59,57,56,2016-02-08T23:00:00+08:00
2,59,57,55,59,57,56,2016-02-09T01:00:00+08:00
3,59,57,55,59,56,56,2016-02-09T02:00:00+08:00
4,59,58,55,59,56,56,2016-02-09T03:00:00+08:00


In [4]:
df.describe()

Unnamed: 0,national,south,north,east,central,west
count,30022.0,30022.0,30022.0,30022.0,30022.0,30022.0
mean,50.502232,54.294351,54.003464,52.491007,52.885884,50.299014
std,13.107219,10.844489,10.991413,10.365553,11.832203,11.672157
min,13.0,20.0,19.0,14.0,13.0,16.0
25%,42.0,50.0,49.0,47.0,47.0,43.0
50%,53.0,54.0,55.0,54.0,54.0,52.0
75%,58.0,59.0,59.0,58.0,59.0,57.0
max,143.0,154.0,145.0,131.0,154.0,143.0


### Data Preprocessing

In [5]:
import datetime

In [6]:
df['timestamp']=df['timestamp'].agg(pd.Timestamp)
df['timestamp'] = pd.to_datetime(df['timestamp']).dt.to_period('d')



In [7]:
df

Unnamed: 0,national,south,north,east,central,west,timestamp
0,47,44,37,47,47,34,2016-02-07
1,59,57,54,59,57,56,2016-02-08
2,59,57,55,59,57,56,2016-02-09
3,59,57,55,59,56,56,2016-02-09
4,59,58,55,59,56,56,2016-02-09
...,...,...,...,...,...,...,...
30017,76,78,72,74,78,71,2019-11-06
30018,75,77,71,73,77,71,2019-11-06
30019,74,77,71,73,77,71,2019-11-06
30020,74,78,72,73,78,71,2019-11-06


In [8]:
df['month'] = df['timestamp'].dt.month #get the month from timestamp column
df.drop('timestamp', axis=1, inplace=True)

def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df

df = swap_columns(df, 'national', 'month') #move the target column so the dataset gets more clear
df

Unnamed: 0,month,south,north,east,central,west,national
0,2,44,37,47,47,34,47
1,2,57,54,59,57,56,59
2,2,57,55,59,57,56,59
3,2,57,55,59,56,56,59
4,2,58,55,59,56,56,59
...,...,...,...,...,...,...,...
30017,11,78,72,74,78,71,76
30018,11,77,71,73,77,71,75
30019,11,77,71,73,77,71,74
30020,11,78,72,73,78,71,74


In [9]:
df.isnull().sum()

month       0
south       0
north       0
east        0
central     0
west        0
national    0
dtype: int64

we don't have any missed value.

In [10]:
column = df['national']
max_value = column.max()
max_value

143

maximum value of our target is 143, so we don't have any 'very high' value.

In [11]:
category = pd.cut(df.national,bins=[0,55,150,250],labels=['Normal','Elevated','High'])
df.insert(7, 'Level', category)
df

Unnamed: 0,month,south,north,east,central,west,national,Level
0,2,44,37,47,47,34,47,Normal
1,2,57,54,59,57,56,59,Elevated
2,2,57,55,59,57,56,59,Elevated
3,2,57,55,59,56,56,59,Elevated
4,2,58,55,59,56,56,59,Elevated
...,...,...,...,...,...,...,...,...
30017,11,78,72,74,78,71,76,Elevated
30018,11,77,71,73,77,71,75,Elevated
30019,11,77,71,73,77,71,74,Elevated
30020,11,78,72,73,78,71,74,Elevated


In [12]:
#Label Encoding
df['Level'] = df['Level'].cat.codes
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,month,south,north,east,central,west,national,Level
0,2,44,37,47,47,34,47,0
1,2,57,54,59,57,56,59,1
2,2,57,55,59,57,56,59,1
3,2,57,55,59,56,56,59,1
4,2,58,55,59,56,56,59,1
...,...,...,...,...,...,...,...,...
30017,11,78,72,74,78,71,76,1
30018,11,77,71,73,77,71,75,1
30019,11,77,71,73,77,71,74,1
30020,11,78,72,73,78,71,74,1


In [13]:
#split the dataset for training multiple linear regression model
from sklearn.model_selection import train_test_split
train , test = train_test_split(df, test_size = 0.3)
x_train = train.drop('Level', axis=1)
y_train = train['Level']
x_test = test.drop('Level', axis = 1)
y_test = test['Level']

In [14]:
#normalize the dataset (mostly because of the month column)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train_scaled)
x_test_scaled = scaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test_scaled)

### Build a SVM model to predict the pollution level

In [15]:
from sklearn.svm import SVC

In [16]:
svc = SVC(kernel='linear', C=10.0, random_state=1) #C is the regularization parameter 
svc.fit(x_train, y_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=1,
    shrinking=True, tol=0.001, verbose=False)

In [17]:
y_pred = svc.predict(x_test)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[5273  471]
 [   0 3263]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      5744
           1       0.87      1.00      0.93      3263

    accuracy                           0.95      9007
   macro avg       0.94      0.96      0.94      9007
weighted avg       0.95      0.95      0.95      9007



In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))
print('recall: %.3f' % recall_score(y_test, y_pred))
print('f1_score: %.3f' % f1_score(y_test, y_pred))

Accuracy: 0.948
Precision: 0.874
recall: 1.000
f1_score: 0.933
