Data taken from: https://archive.ics.uci.edu/ml/datasets/Air+quality

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = [10, 5]

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip'
r = urlopen(path).read()
zf = ZipFile(BytesIO(r))

f = 'AirQualityUCI.csv'

In [3]:
df = pd.read_csv(zf.open(f), sep=';', parse_dates=['Date'])
print(df.dtypes)
df.head()

Date             datetime64[ns]
Time                     object
CO(GT)                   object
PT08.S1(CO)             float64
NMHC(GT)                float64
C6H6(GT)                 object
PT08.S2(NMHC)           float64
NOx(GT)                 float64
PT08.S3(NOx)            float64
NO2(GT)                 float64
PT08.S4(NO2)            float64
PT08.S5(O3)             float64
T                        object
RH                       object
AH                       object
Unnamed: 15             float64
Unnamed: 16             float64
dtype: object


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,2004-10-03,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,7578,,
1,2004-10-03,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,7255,,
2,2004-10-03,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,7502,,
3,2004-10-03,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,7867,,
4,2004-10-03,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,7888,,


In [4]:
df.shape

(9471, 17)

In [5]:
df.isna().sum()

Date              114
Time              114
CO(GT)            114
PT08.S1(CO)       114
NMHC(GT)          114
C6H6(GT)          114
PT08.S2(NMHC)     114
NOx(GT)           114
PT08.S3(NOx)      114
NO2(GT)           114
PT08.S4(NO2)      114
PT08.S5(O3)       114
T                 114
RH                114
AH                114
Unnamed: 15      9471
Unnamed: 16      9471
dtype: int64

In [6]:
df = df.drop(columns=['Unnamed: 15', 'Unnamed: 16'])

In [7]:
df = df.dropna(subset=['Date'])

In [8]:
obj_cols = [col for col in df.columns if df[col].dtype=='object']
obj_cols

['Time', 'CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']

In [9]:
df[obj_cols].head()

Unnamed: 0,Time,CO(GT),C6H6(GT),T,RH,AH
0,18.00.00,26,119,136,489,7578
1,19.00.00,2,94,133,477,7255
2,20.00.00,22,90,119,540,7502
3,21.00.00,22,92,110,600,7867
4,22.00.00,16,65,112,596,7888


In [10]:
obj_cols.pop(0)
obj_cols

['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']

In [11]:
for col in obj_cols:
    df[col] = df[col].str.replace(',', '.')
    df[col] = pd.to_numeric(df[col])

df.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-10-03,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2004-10-03,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2004-10-03,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2004-10-03,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,2004-10-03,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


In [12]:
df['Year'] = [int(date.year) for date in df['Date']]
df['Month'] = [date.strftime('%B') for date in df['Date']]
df['DayOfWeek'] = [date.strftime('%A') for date in df['Date']]
df.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Year,Month,DayOfWeek
0,2004-10-03,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,2004,October,Sunday
1,2004-10-03,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,2004,October,Sunday
2,2004-10-03,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,2004,October,Sunday
3,2004-10-03,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,2004,October,Sunday
4,2004-10-03,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,2004,October,Sunday


# Predict CO concentration of next hour.

In [13]:
df.shape

(9357, 18)

In [14]:
df['Target'] = df['CO(GT)'].shift(1)
df = df.dropna()

In [15]:
df.shape

(9356, 19)

In [16]:
matrix = df.corr(method='pearson')
matrix

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Year,Target
CO(GT),1.0,0.041366,0.128274,-0.031391,0.029904,0.526458,-0.090026,0.67112,-0.073759,0.08028,-0.068944,-0.048237,-0.045903,0.198112,0.865878
PT08.S1(CO),0.041366,1.0,0.169837,0.852706,0.933111,0.278007,0.086945,0.153995,0.845143,0.892432,0.754871,0.745394,0.764921,-0.03309,0.037374
NMHC(GT),0.128274,0.169837,1.0,0.037275,0.110029,-0.004426,0.048644,0.103233,0.162574,0.101062,-3e-05,0.008243,0.012457,-0.164443,0.12369
C6H6(GT),-0.031391,0.852706,0.037275,1.0,0.767432,-0.001174,0.512191,-0.011004,0.774676,0.641334,0.971376,0.925062,0.984555,-0.086443,-0.031283
PT08.S2(NMHC),0.029904,0.933111,0.110029,0.767432,1.0,0.331276,-0.073709,0.176471,0.87478,0.909904,0.669028,0.585801,0.646571,-0.150734,0.02474
NOx(GT),0.526458,0.278007,-0.004426,-0.001174,0.331276,1.0,-0.436099,0.817148,0.035547,0.4619,-0.138452,-0.053009,-0.095847,0.29053,0.5083
PT08.S3(NOx),-0.090026,0.086945,0.048644,0.512191,-0.073709,-0.436099,1.0,-0.256281,0.122685,-0.208932,0.588124,0.573554,0.621625,-0.151692,-0.084541
NO2(GT),0.67112,0.153995,0.103233,-0.011004,0.176471,0.817148,-0.256281,1.0,-0.022204,0.253418,-0.084109,-0.081314,-0.060449,0.321602,0.65121
PT08.S4(NO2),-0.073759,0.845143,0.162574,0.774676,0.87478,0.035547,0.122685,-0.022204,1.0,0.723678,0.755071,0.640709,0.691916,-0.396645,-0.076524
PT08.S5(O3),0.08028,0.892432,0.101062,0.641334,0.909904,0.4619,-0.208932,0.253418,0.723678,1.0,0.503705,0.524955,0.519466,0.004269,0.075966


In [17]:
np.abs(matrix)

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Year,Target
CO(GT),1.0,0.041366,0.128274,0.031391,0.029904,0.526458,0.090026,0.67112,0.073759,0.08028,0.068944,0.048237,0.045903,0.198112,0.865878
PT08.S1(CO),0.041366,1.0,0.169837,0.852706,0.933111,0.278007,0.086945,0.153995,0.845143,0.892432,0.754871,0.745394,0.764921,0.03309,0.037374
NMHC(GT),0.128274,0.169837,1.0,0.037275,0.110029,0.004426,0.048644,0.103233,0.162574,0.101062,3e-05,0.008243,0.012457,0.164443,0.12369
C6H6(GT),0.031391,0.852706,0.037275,1.0,0.767432,0.001174,0.512191,0.011004,0.774676,0.641334,0.971376,0.925062,0.984555,0.086443,0.031283
PT08.S2(NMHC),0.029904,0.933111,0.110029,0.767432,1.0,0.331276,0.073709,0.176471,0.87478,0.909904,0.669028,0.585801,0.646571,0.150734,0.02474
NOx(GT),0.526458,0.278007,0.004426,0.001174,0.331276,1.0,0.436099,0.817148,0.035547,0.4619,0.138452,0.053009,0.095847,0.29053,0.5083
PT08.S3(NOx),0.090026,0.086945,0.048644,0.512191,0.073709,0.436099,1.0,0.256281,0.122685,0.208932,0.588124,0.573554,0.621625,0.151692,0.084541
NO2(GT),0.67112,0.153995,0.103233,0.011004,0.176471,0.817148,0.256281,1.0,0.022204,0.253418,0.084109,0.081314,0.060449,0.321602,0.65121
PT08.S4(NO2),0.073759,0.845143,0.162574,0.774676,0.87478,0.035547,0.122685,0.022204,1.0,0.723678,0.755071,0.640709,0.691916,0.396645,0.076524
PT08.S5(O3),0.08028,0.892432,0.101062,0.641334,0.909904,0.4619,0.208932,0.253418,0.723678,1.0,0.503705,0.524955,0.519466,0.004269,0.075966


In [18]:
subset = np.abs(matrix).sort_values(by='Target', ascending=False)
subset.head(10)

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Year,Target
Target,0.865878,0.037374,0.12369,0.031283,0.02474,0.5083,0.084541,0.65121,0.076524,0.075966,0.067452,0.047115,0.044477,0.19746,1.0
CO(GT),1.0,0.041366,0.128274,0.031391,0.029904,0.526458,0.090026,0.67112,0.073759,0.08028,0.068944,0.048237,0.045903,0.198112,0.865878
NO2(GT),0.67112,0.153995,0.103233,0.011004,0.176471,0.817148,0.256281,1.0,0.022204,0.253418,0.084109,0.081314,0.060449,0.321602,0.65121
NOx(GT),0.526458,0.278007,0.004426,0.001174,0.331276,1.0,0.436099,0.817148,0.035547,0.4619,0.138452,0.053009,0.095847,0.29053,0.5083
Year,0.198112,0.03309,0.164443,0.086443,0.150734,0.29053,0.151692,0.321602,0.396645,0.004269,0.164727,0.028008,0.069695,1.0,0.19746
NMHC(GT),0.128274,0.169837,1.0,0.037275,0.110029,0.004426,0.048644,0.103233,0.162574,0.101062,3e-05,0.008243,0.012457,0.164443,0.12369
PT08.S3(NOx),0.090026,0.086945,0.048644,0.512191,0.073709,0.436099,1.0,0.256281,0.122685,0.208932,0.588124,0.573554,0.621625,0.151692,0.084541
PT08.S4(NO2),0.073759,0.845143,0.162574,0.774676,0.87478,0.035547,0.122685,0.022204,1.0,0.723678,0.755071,0.640709,0.691916,0.396645,0.076524
PT08.S5(O3),0.08028,0.892432,0.101062,0.641334,0.909904,0.4619,0.208932,0.253418,0.723678,1.0,0.503705,0.524955,0.519466,0.004269,0.075966
T,0.068944,0.754871,3e-05,0.971376,0.669028,0.138452,0.588124,0.084109,0.755071,0.503705,1.0,0.885911,0.981002,0.164727,0.067452


In [19]:
columns = subset.head(7).index.tolist()
columns

['Target', 'CO(GT)', 'NO2(GT)', 'NOx(GT)', 'Year', 'NMHC(GT)', 'PT08.S3(NOx)']

In [20]:
X = df[columns]
X.head()

Unnamed: 0,Target,CO(GT),NO2(GT),NOx(GT),Year,NMHC(GT),PT08.S3(NOx)
1,2.6,2.0,92.0,103.0,2004,112.0,1174.0
2,2.0,2.2,114.0,131.0,2004,88.0,1140.0
3,2.2,2.2,122.0,172.0,2004,80.0,1092.0
4,2.2,1.6,116.0,131.0,2004,51.0,1205.0
5,1.6,1.2,96.0,89.0,2004,38.0,1337.0


In [21]:
y = X.Target
X = X.drop(columns=['Target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [22]:
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression()

In [23]:
pred = reg.predict(X_test)

In [24]:
def print_scores(pred, y_test):
    print('MAE = {:.2f}'.format(mean_absolute_error(pred, y_test)))
    print('RMSE = {:.2f}'.format(np.sqrt(mean_squared_error(pred, y_test))))
    print('R2 = {:.2f}'.format(r2_score(pred, y_test)))

In [25]:
print_scores(pred, y_test)

MAE = 15.16
RMSE = 38.48
R2 = 0.68


Benchmark model: assume CO(GT) of the next day is the same as today.

In [26]:
pred = X_test['CO(GT)']
print_scores(pred, y_test)

MAE = 8.66
RMSE = 40.69
R2 = 0.72
