In [21]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import set_config
set_config(display="diagram")


import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

import time

In [78]:
weather_data = pd.read_csv('data/Istanbul Weather Data.csv')

check for & remove rows with null values

In [79]:
weather_data.isnull().sum()

DateTime         0
Condition        0
Rain             0
MaxTemp          0
MinTemp          0
SunRise          0
SunSet           0
MoonRise       130
MoonSet        130
AvgWind          0
AvgHumidity      0
AvgPressure      0
dtype: int64

In [80]:
weather_data = weather_data.dropna()
weather_data.isnull().sum()

DateTime       0
Condition      0
Rain           0
MaxTemp        0
MinTemp        0
SunRise        0
SunSet         0
MoonRise       0
MoonSet        0
AvgWind        0
AvgHumidity    0
AvgPressure    0
dtype: int64

In [81]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3594 entries, 0 to 3853
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DateTime     3594 non-null   object 
 1   Condition    3594 non-null   object 
 2   Rain         3594 non-null   float64
 3   MaxTemp      3594 non-null   int64  
 4   MinTemp      3594 non-null   int64  
 5   SunRise      3594 non-null   object 
 6   SunSet       3594 non-null   object 
 7   MoonRise     3594 non-null   object 
 8   MoonSet      3594 non-null   object 
 9   AvgWind      3594 non-null   int64  
 10  AvgHumidity  3594 non-null   int64  
 11  AvgPressure  3594 non-null   int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 365.0+ KB


convert timestamps into panda datetimes - from epoch

In [82]:
date_pattern = '%d.%m.%Y'
weather_data['DateTime'] = weather_data['DateTime'].apply(lambda row: int(time.mktime(time.strptime(row,date_pattern))))

time_pattern = '%H:%M:%S'
weather_data['SunRise'] = weather_data['SunRise'].apply(lambda row: int(time.mktime(time.strptime(row,time_pattern))))
weather_data['SunSet'] = weather_data['SunSet'].apply(lambda row: int(time.mktime(time.strptime(row,time_pattern))))

weather_data['MoonRise'] = weather_data['MoonRise'].apply(lambda row: int(time.mktime(time.strptime(row,time_pattern))))
weather_data['MoonSet'] = weather_data['MoonSet'].apply(lambda row: int(time.mktime(time.strptime(row,time_pattern))))
weather_data.head()

Unnamed: 0,DateTime,Condition,Rain,MaxTemp,MinTemp,SunRise,SunSet,MoonRise,MoonSet,AvgWind,AvgHumidity,AvgPressure
0,1563778800,Partly cloudy,0.0,29,23,-2208938940,-2208886200,-2208874740,-2208920940,19,57,1017
1,1563692400,Sunny,0.0,28,23,-2208939000,-2208886140,-2208876240,-2208924420,20,59,1018
2,1563606000,Partly cloudy,0.0,28,22,-2208939000,-2208886080,-2208877860,-2208927900,24,62,1016
3,1563519600,Sunny,0.0,28,23,-2208939120,-2208886080,-2208879660,-2208931440,20,60,1014
4,1563433200,Partly cloudy,0.0,27,22,-2208939180,-2208886020,-2208881580,-2208934980,16,56,1010


In [106]:
# for col in weather_data:
#     print(weather_data[col].unique())

encode Condition

In [88]:
condition_df = pd.get_dummies(weather_data['Condition'])
condition_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3594 entries, 0 to 3853
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   Blizzard                             3594 non-null   uint8
 1   Cloudy                               3594 non-null   uint8
 2   Fog                                  3594 non-null   uint8
 3   Heavy rain                           3594 non-null   uint8
 4   Heavy rain at times                  3594 non-null   uint8
 5   Light drizzle                        3594 non-null   uint8
 6   Light rain                           3594 non-null   uint8
 7   Light rain shower                    3594 non-null   uint8
 8   Light sleet                          3594 non-null   uint8
 9   Light sleet showers                  3594 non-null   uint8
 10  Mist                                 3594 non-null   uint8
 11  Moderate or heavy rain shower        3594 non-null   uin

Replace 'Condition' by dropping it & joining condition_df

In [124]:
weather = weather_data.drop(columns=['Condition'])
weather.head()
weather = weather.join(condition_df)
weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3594 entries, 0 to 3853
Data columns (total 37 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   DateTime                             3594 non-null   int64  
 1   Rain                                 3594 non-null   float64
 2   MaxTemp                              3594 non-null   int64  
 3   MinTemp                              3594 non-null   int64  
 4   SunRise                              3594 non-null   int64  
 5   SunSet                               3594 non-null   int64  
 6   MoonRise                             3594 non-null   int64  
 7   MoonSet                              3594 non-null   int64  
 8   AvgWind                              3594 non-null   int64  
 9   AvgHumidity                          3594 non-null   int64  
 10  AvgPressure                          3594 non-null   int64  
 11  Blizzard                      

In [125]:
weather.head()

Unnamed: 0,DateTime,Rain,MaxTemp,MinTemp,SunRise,SunSet,MoonRise,MoonSet,AvgWind,AvgHumidity,...,Moderate rain at times,Moderate snow,Overcast,Partly cloudy,Patchy light drizzle,Patchy light rain with thunder,Patchy rain possible,Sunny,Thundery outbreaks possible,Torrential rain shower
0,1563778800,0.0,29,23,-2208938940,-2208886200,-2208874740,-2208920940,19,57,...,0,0,0,1,0,0,0,0,0,0
1,1563692400,0.0,28,23,-2208939000,-2208886140,-2208876240,-2208924420,20,59,...,0,0,0,0,0,0,0,1,0,0
2,1563606000,0.0,28,22,-2208939000,-2208886080,-2208877860,-2208927900,24,62,...,0,0,0,1,0,0,0,0,0,0
3,1563519600,0.0,28,23,-2208939120,-2208886080,-2208879660,-2208931440,20,60,...,0,0,0,0,0,0,0,1,0,0
4,1563433200,0.0,27,22,-2208939180,-2208886020,-2208881580,-2208934980,16,56,...,0,0,0,1,0,0,0,0,0,0


In [16]:
X = weather.drop('Rain', axis=1)
y = weather['Rain']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

GridSearchCV using StandardScalar & Ridge regression
Ridge: applies penalty that shrinks the coefficients of the model

In [40]:
pipe_ridge = Pipeline([
    ('scalar', StandardScaler()), 
    ('ridge', Ridge())
])
param_dict = {'ridge__alpha': [0.0001, 0.001, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0]}
grid_search = GridSearchCV(pipe_ridge, param_grid=param_dict)
grid_search.fit(X_train, y_train)

model_train_mse = mean_squared_error(y_train, grid_search.predict(X_train))
model_test_mse = mean_squared_error(y_test, grid_search.predict(X_test))

model_best_alpha = grid_search.best_params_

print(f'Train MSE: {model_train_mse}')
print(f'Test MSE: {model_test_mse}')
print(f'Best Alpha: {list(model_best_alpha.values())[0]}')

Train MSE: 3.0732670520945002
Test MSE: 3.895120045850751
Best Alpha: 1000.0


GridSearchCV using SequentalFeatureSelector
SequentialFeatureSelector: selects subset of features to construct a model

In [47]:
pipe_selector = Pipeline([
    ('selector', SequentialFeatureSelector(LinearRegression())),
    ('model', LinearRegression())
])
param_dict = {'selector__n_features_to_select' : [2,3,4,5,6,7,8,9,10,20]}
grid_search = GridSearchCV(pipe_selector, param_grid=param_dict)
grid_search.fit(X_train, y_train)

model_train_mse = mean_squared_error(y_train, grid_search.predict(X_train))
model_test_mse = mean_squared_error(y_test, grid_search.predict(X_test))

selector = grid_search.best_estimator_.named_steps['selector']
feature_names = X_train.columns[selector.get_support()]
coefficients = grid_search.best_estimator_.named_steps['model'].coef_


print(f'Train MSE: {model_train_mse}')
print(f'Test MSE: {model_test_mse}')
print('Selected features and coefficient values:')
pd.DataFrame([coefficients.T], columns=feature_names, index=['model'])

Train MSE: 2.9652686748084296
Test MSE: 4.0547508588082914
Selected features and coefficient values:


Unnamed: 0,MinTemp,AvgWind,AvgHumidity,AvgPressure,Cloudy,Fog,Heavy rain,Heavy rain at times,Light rain,Light rain shower,Mist,Moderate or heavy rain shower,Moderate or heavy snow showers,Overcast,Partly cloudy,Patchy light drizzle,Patchy rain possible,Sunny,Thundery outbreaks possible,Torrential rain shower
model,-0.020846,0.017298,0.03453,-0.065157,-10.055335,-12.186561,14.492471,14.920779,-6.407143,-8.092577,-9.99072,4.995395,-10.048671,-9.301344,-9.989941,-8.594544,-9.370924,-10.482021,-8.505175,24.062876


PolynomialFeatures with SequentialFeatureSelector

In [49]:
pipe_sequential = Pipeline([
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('selector', SequentialFeatureSelector(LinearRegression(), n_features_to_select=10)),
    ('linreg', LinearRegression())
])
pipe_sequential.fit(X_train, y_train)

model_train_mse = mean_squared_error(y_train, pipe_sequential.predict(X_train))
model_test_mse = mean_squared_error(y_test, pipe_sequential.predict(X_test))

print(f'Train MSE: {model_train_mse}')
print(f'Test MSE: {model_test_mse}')

Train MSE: 3.3306842136575714
Test MSE: 5.016885576074871


PolynomialFeatures with Ridge

In [115]:
pipe_ridge = Pipeline([
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])
alpha_to_try_dict = {'ridge__alpha':10**np.linspace(-2,2,100)}
grid_ridge = GridSearchCV(pipe_ridge,
                         param_grid=alpha_to_try_dict,
                         scoring='neg_mean_squared_error')
grid_ridge.fit(X_train, y_train)

model_train_mse = mean_squared_error(y_train, grid_ridge.predict(X_train))
model_test_mse = mean_squared_error(y_test, grid_ridge.predict(X_test))

model_best_alpha = grid_ridge.best_params_

print(f'Train MSE: {model_train_mse}')
print(f'Test MSE: {model_test_mse}')
print(f'Best Alpha: {list(model_best_alpha.values())[0]}')

Train MSE: 2.322780015415052
Test MSE: 9.32719243017728
Best Alpha: 100.0


PolynomialFeatures with Lasso

In [119]:
pipe_lasso = Pipeline([
    ('polyfeatures', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler()),
    ('lasso', Lasso(random_state=42))
])
pipe_lasso.fit(X_train, y_train)

model_train_mse = mean_squared_error(y_train, pipe_lasso.predict(X_train))
model_test_mse = mean_squared_error(y_test, pipe_lasso.predict(X_test))

print(f'Train MSE: {model_train_mse}')
print(f'Test MSE: {model_test_mse}')

# get features with non-zero coefficients
features = pipe_lasso.named_steps['polyfeatures'].get_feature_names_out()
coefficients = pipe_lasso.named_steps['lasso'].coef_

lasso_dataframe = pd.DataFrame({'features':features, 'coefficients':coefficients})
lasso_dataframe

Train MSE: 6.211410182183243
Test MSE: 6.809266960594935


Unnamed: 0,features,coefficients
0,DateTime,-0.0
1,MaxTemp,-0.0
2,MinTemp,-0.0
3,SunRise,0.0
4,SunSet,-0.0
...,...,...
9133,Sunny Torrential rain shower^2,0.0
9134,Thundery outbreaks possible^3,0.0
9135,Thundery outbreaks possible^2 Torrential rain ...,0.0
9136,Thundery outbreaks possible Torrential rain sh...,0.0


PolynomialFeatures with SequentialFeatureSelector

In [118]:
pipe_sequential = Pipeline([
    ('poly_features', PolynomialFeatures(degree = 3, include_bias = False)),
    ('selector', SequentialFeatureSelector(LinearRegression(), n_features_to_select=6)),
    ('linreg', LinearRegression())
])
pipe_sequential.fit(X_train, y_train)

model_train_mse = mean_squared_error(y_train, pipe_sequential.predict(X_train))
model_test_mse = mean_squared_error(y_test, pipe_sequential.predict(X_test))

print(f'Train MSE: {model_train_mse}')
print(f'Test MSE: {model_test_mse}')

Train MSE: 3.975857398290398
Test MSE: 6.579337148359592
