# Models 

In this notebook, we develop predictive models for the transformed dataset.

In [36]:
import pandas as pd
import numpy as np
import os
import tqdm

# plotting
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# models 
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.svm import LinearSVR

# others
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Definitions for the notebook

In [2]:
# For local folder
IMAGES_FOLDER = "../notes/images/"
# For Google colab
#IMAGES_FOLDER = "drive/MyDrive/Arquivos Acadêmicos/Disciplinas FGV/Machine Learning/images/"

sns.set()

pd.set_option('precision', 3)
pd.options.mode.chained_assignment = None

%matplotlib inline

## Importing the dataset

In [3]:
# Base directory of the input data (local checkout).
location = "../data/"
# Google Colab alternative:
#location = "drive/MyDrive/Arquivos Acadêmicos/Disciplinas FGV/Machine Learning/"

# Read the hourly dataset; the first CSV column becomes the index.
csv_path = location + "RiodeJaneiro_MonitorAr_hourly_p3.csv"
air_data = pd.read_csv(csv_path, index_col=0)

# Store the weekend indicator as an integer column.
air_data["weekend"] = air_data["weekend"].astype(int)

air_data.head()

Unnamed: 0,year,month,day,CodNum,Lat,Lon,Chuva,Pres,RS,Temp,...,CO_lag24,CO_MA24,O3_lag1,O3_lag2,O3_lag24,O3_MA24,PM10_lag1,PM10_lag2,PM10_lag24,PM10_MA24
163253,2011,1,2,1,-22.965,-43.18,3.617,-1.53,-1.128,-0.14,...,-0.763,-0.936,-0.738,-0.543,-1.19,-0.365,0.101,0.078,0.087,0.082
326745,2011,1,2,7,-22.898,-43.222,-0.272,-1.513,-0.565,0.56,...,-1.678,-1.232,-0.612,-0.997,-0.724,-0.187,-0.307,-0.967,-1.858,-1.136
103103,2011,1,2,3,-22.908,-43.178,3.617,-1.557,-0.72,-0.47,...,-0.634,-0.166,-0.211,-0.717,-0.169,-0.027,-0.677,-0.677,-1.617,-1.141
387082,2011,1,2,8,-22.925,-43.233,3.617,-1.909,-0.194,-1.068,...,-0.897,-1.283,-0.267,-0.184,-1.028,0.274,-0.773,-0.583,0.495,-1.061
326746,2011,1,2,7,-22.898,-43.222,3.617,-1.658,-0.567,0.548,...,-1.591,-1.206,-1.3,-0.612,-1.099,-0.204,-0.484,-0.307,-0.967,-1.136


In [4]:
# Split rows with the precomputed boolean `train` column, then drop that
# column from both partitions.
is_train = air_data["train"]
df_train = air_data.loc[is_train].drop(columns="train")
df_test = air_data.loc[~is_train].drop(columns="train")

# Columns removed from the predictor matrices (same list for both splits).
non_feature_cols = ["O3", "CO", "PM10", "aiq", "Lat", "Lon"]
x_train = df_train.drop(columns=non_feature_cols)
x_test = df_test.drop(columns=non_feature_cols)

## Linear regression

We start with ozone (O3) as the target variable.

In [5]:
# Regression target: the O3 column of each split.
target_col = 'O3'
y_train, y_test = df_train[target_col], df_test[target_col]

In [6]:
# statsmodels' OLS does not add an intercept on its own, so add a constant
# column to the design matrix first.
x_train_sm = sm.add_constant(x_train)

# Ordinary least squares of O3 on the training predictors.
reg = sm.OLS(endog=y_train, exog=x_train_sm)
res = reg.fit()

# Full coefficient / goodness-of-fit report.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                     O3   R-squared:                       0.882
Model:                            OLS   Adj. R-squared:                  0.882
Method:                 Least Squares   F-statistic:                 1.104e+05
Date:                Wed, 16 Jun 2021   Prob (F-statistic):               0.00
Time:                        22:20:08   Log-Likelihood:            -1.5507e+05
No. Observations:              399541   AIC:                         3.102e+05
Df Residuals:                  399513   BIC:                         3.105e+05
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.3921      0.713      6.156      0.0

In [22]:
# Evaluate the fitted OLS model (`res`) on the training and test splits.
# The test design matrix gets the same constant column used for training.
x_test_sm = sm.add_constant(x_test)
y_pred = res.predict(x_test_sm)

y_train_pred = res.predict(x_train_sm)

# Report R2, MAE and RMSE for each split; train first, then test.
for split_name, y_true, y_hat in (
    ("train", y_train, y_train_pred),
    ("test", y_test, y_pred),
):
    # RMSE is the square root of the MSE. The `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn
    # releases, so the root is taken explicitly.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_hat)))

    print("R2 {} set: {}".format(split_name, r2_score(y_true, y_hat)))
    print("MAE {} set: {}".format(split_name, mean_absolute_error(y_true, y_hat)))
    print("RMSE {} set: {}".format(split_name, rmse))

R2 train set: 0.8817760841297959
MAE train set: 0.2598833632086775
RMSE train set: 0.3567183601835723
R2 test set: 0.8718723078209152
MAE test set: 0.23134658592918386
RMSE test set: 0.3167052629463962


## Support vector machine

In [31]:
# Restrict both splits to the rows with CodNum == 1.
# NOTE(review): "AV" presumably abbreviates the name of that monitoring
# station -- confirm.
x_train_is_av = x_train.CodNum == 1
x_test_is_av = x_test.CodNum == 1
x_train_AV = x_train.loc[x_train_is_av]
x_test_AV = x_test.loc[x_test_is_av]

# Matching O3 targets, filtered on the same CodNum value.
y_train_AV = df_train.loc[df_train.CodNum == 1, 'O3']
y_test_AV = df_test.loc[df_test.CodNum == 1, 'O3']

In [37]:
# Linear support vector regression on standardized features, fitted on the
# CodNum == 1 training subset.
svr_scaler = StandardScaler()
svr_model = LinearSVR(tol=1e-4, random_state=0)
regr = make_pipeline(svr_scaler, svr_model)

# Last expression: the fitted pipeline is displayed as the cell output.
regr.fit(x_train_AV, y_train_AV)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvr', LinearSVR(random_state=0))])