In [2]:
import pandas as pd 
import numpy as np
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split,GridSearchCV


import seaborn as sns

In [3]:
data_reg = pd.read_csv('regression2.csv')

In [6]:
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              768 non-null    int64  
 1   Station Number    768 non-null    int64  
 2   City              768 non-null    object 
 3   Location          768 non-null    object 
 4   Type              768 non-null    object 
 5   Valid Hour        768 non-null    int64  
 6   10th Percentile   768 non-null    int64  
 7   30th Percentile   768 non-null    int64  
 8   50th Percentile   768 non-null    int64  
 9   70th Percentile   768 non-null    int64  
 10  90th Percentile   768 non-null    int64  
 11  99 Percentile     768 non-null    int64  
 12  Mean              768 non-null    float64
 13  1-Hour Maximum    768 non-null    int64  
 14  24-Hour  Maximum  768 non-null    float64
dtypes: float64(2), int64(10), object(3)
memory usage: 90.1+ KB


In [8]:
data_reg.head(3)

Unnamed: 0,Year,Station Number,City,Location,Type,Valid Hour,10th Percentile,30th Percentile,50th Percentile,70th Percentile,90th Percentile,99 Percentile,Mean,1-Hour Maximum,24-Hour Maximum
0,2022,12008,Windsor Downtown,467 University Ave. W.,A,8743,2,4,6,9,15,28,7.83,55,26.2
1,2021,12008,Windsor Downtown,467 University Ave. W.,A,8730,2,5,7,10,16,29,8.34,54,22.9
2,2020,12008,Windsor Downtown,467 University Ave. W.,A,8722,2,4,6,8,13,23,7.05,42,28.5


In [10]:
data_reg.isna().sum()

Year                0
Station Number      0
City                0
Location            0
Type                0
Valid Hour          0
10th Percentile     0
30th Percentile     0
50th Percentile     0
70th Percentile     0
90th Percentile     0
99 Percentile       0
Mean                0
1-Hour Maximum      0
24-Hour  Maximum    0
dtype: int64

In [12]:
data_reg.columns

Index(['Year', 'Station Number', 'City', 'Location', 'Type', 'Valid Hour',
       '10th Percentile', '30th Percentile', '50th Percentile',
       '70th Percentile', '90th Percentile', '99 Percentile', 'Mean',
       '1-Hour Maximum', '24-Hour  Maximum'],
      dtype='object')

In [14]:
data_reg.Year.unique()

array([2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012,
       2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003], dtype=int64)

In [16]:
data_reg.Type.unique()

array(['A'], dtype=object)

In [18]:
x = data_reg[['Year', '10th Percentile', '30th Percentile', '50th Percentile',
       '70th Percentile', '90th Percentile', '99 Percentile']]
y = data_reg[['Mean']]

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x,y.values.ravel(),test_size=0.2,random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42) 

In [22]:
len(x), len(x_train), len(x_test), len(x_val)

(768, 460, 154, 154)

In [24]:
len(y), len(y_train), len(y_test), len(y_val)

(768, 460, 154, 154)

In [26]:
#linear regression

In [28]:
linear_reg1 = LinearRegression(fit_intercept=True)
linear_reg1.fit(x_train,y_train)

In [30]:
linear_reg2 = LinearRegression(fit_intercept=False)
linear_reg2.fit(x_train,y_train)

In [32]:
pred_lr_1 = linear_reg1.predict(x_test)
pred_lr_2 = linear_reg2.predict(x_test)

In [34]:
print('MAE:', metrics.mean_absolute_error(y_test, pred_lr_1))
print('MSE:', metrics.mean_squared_error(y_test, pred_lr_1))
print('R2:', metrics.r2_score(y_test, pred_lr_1))

MAE: 0.10594000145497472
MSE: 0.017519050872946945
R2: 0.991957057407017


In [36]:
print('MAE:', metrics.mean_absolute_error(y_test, pred_lr_2))
print('MSE:', metrics.mean_squared_error(y_test, pred_lr_2))
print('R2:', metrics.r2_score(y_test, pred_lr_2))

MAE: 0.1062007836198275
MSE: 0.01757922909202824
R2: 0.9919294297709693


In [38]:
linear_reg = linear_reg1

In [40]:
#SVR

In [42]:
parameters = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'kernel': ['linear']
}

In [44]:
svr = SVR()
grid_search_svr = GridSearchCV(estimator=svr, param_grid=parameters, scoring='neg_mean_squared_error', cv=5)

In [46]:
grid_search_svr.fit(x_train, y_train)

In [47]:
grid_search_svr.best_estimator_

In [48]:
pred_svr = grid_search_svr.best_estimator_.predict(x_test)

In [52]:
print('MAE:', metrics.mean_absolute_error(y_test, pred_svr))
print('MSE:', metrics.mean_squared_error(y_test, pred_svr))
print('R2:', metrics.r2_score(y_test, pred_svr))

MAE: 0.10682665283241019
MSE: 0.017629793319484204
R2: 0.9919062158890282


In [54]:
#Decision Tree

In [56]:
parameters_dtree = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

In [58]:
tree = DecisionTreeRegressor()
grid_search_dtree = GridSearchCV(estimator=tree, param_grid=parameters_dtree, scoring='neg_mean_squared_error', cv=5)

In [60]:
grid_search_dtree.fit(x_train, y_train)

In [62]:
grid_search_dtree.best_estimator_

In [64]:
pred_dtree = grid_search_dtree.best_estimator_.predict(x_test)

In [66]:
print('MAE:', metrics.mean_absolute_error(y_test, pred_dtree))
print('MSE:', metrics.mean_squared_error(y_test, pred_dtree))
print('R2:', metrics.r2_score(y_test, pred_dtree))

MAE: 0.18350649350649353
MSE: 0.056551948051948076
R2: 0.9740371738742111


In [68]:
#validation

In [70]:
val_lr = linear_reg.predict(x_val)
val_svr = grid_search_svr.predict(x_val)
val_dtree = grid_search_dtree.predict(x_val)

In [72]:
#linear regression
print('MAE:', metrics.mean_absolute_error(y_val, val_lr))
print('MSE:', metrics.mean_squared_error(y_val, val_lr))
print('R2:', metrics.r2_score(y_val, val_lr))

MAE: 0.111926298999068
MSE: 0.018086131712112707
R2: 0.9921144544869943


In [74]:
#decision tree
print('MAE:', metrics.mean_absolute_error(y_val, val_dtree))
print('MSE:', metrics.mean_squared_error(y_val, val_dtree))
print('R2:', metrics.r2_score(y_val, val_dtree))

MAE: 0.21136363636363645
MSE: 0.0853279220779221
R2: 0.9627970633087322


In [76]:
#SVR
print('MAE:', metrics.mean_absolute_error(y_val, val_svr))
print('MSE:', metrics.mean_squared_error(y_val, val_svr))
print('R2:', metrics.r2_score(y_val, val_svr))

MAE: 0.11202649955836952
MSE: 0.018194643751466576
R2: 0.9920671432853148


In [78]:
#unseen data

In [80]:
unseen_data = pd.read_csv('regression unseen.csv')

In [82]:
unseen_data.head(3)

Unnamed: 0,Year,Station Number,City,Location,Type,Valid Hour,10th Percentile,30th Percentile,50th Percentile,70th Percentile,90th Percentile,99 Percentile,Mean,1-Hour Maximum,24-Hour Maximum
0,2022,12008,Windsor Downtown,467 University Ave. W.,A,8743,2.0,4.0,6.0,,15,27.0,7.85,56.0,26.4
1,2021,12008,Windsor Downtown,467 University Ave. W.,A,8730,2.0,5.0,7.0,10.0,16,30.0,8.36,55.0,
2,2020,12008,Windsor Downtown,467 University Ave. W.,A,8722,2.0,,6.0,8.0,14,,7.08,,28.7


In [84]:
unseen_data.isna().sum()

Year                0
Station Number      0
City                0
Location            0
Type                0
Valid Hour          0
10th Percentile     2
30th Percentile     1
50th Percentile     1
70th Percentile     2
90th Percentile     0
99 Percentile       3
Mean                0
1-Hour Maximum      1
24-Hour  Maximum    2
dtype: int64

In [86]:
unseen_x = data_reg[['Year', '10th Percentile', '30th Percentile', '50th Percentile',
       '70th Percentile', '90th Percentile', '99 Percentile']]
unseen_y = data_reg[['Mean']]

In [88]:
training_means = x_train.mean()
unseen_x = unseen_x.fillna(training_means)

In [90]:
unseen_x.isna().sum()

Year               0
10th Percentile    0
30th Percentile    0
50th Percentile    0
70th Percentile    0
90th Percentile    0
99 Percentile      0
dtype: int64

In [92]:
unseen_lr = linear_reg.predict(unseen_x)

In [94]:
#linear regression
print('MAE:', metrics.mean_absolute_error(y_val, val_lr))
print('MSE:', metrics.mean_squared_error(y_val, val_lr))
print('R2:', metrics.r2_score(y_val, val_lr))

MAE: 0.111926298999068
MSE: 0.018086131712112707
R2: 0.9921144544869943
