In [51]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

weather = pd.read_csv("weatherHistory.csv")

weather.head(1)

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472,7.389,0.89,14.12,251.0,15.826,0.0,1015.13,Partly cloudy throughout the day.


In [52]:
# Swap the verbose, unit-suffixed headers for short lowercase names.
# Columns not listed here keep their original headers.
short_names = {
    "Temperature (C)": "temperature",
    "Wind Speed (km/h)": "windspeed",
    "Apparent Temperature (C)": "apparenttemp",
    "Humidity": "humidity",
    "Visibility (km)": "visibility",
}
weather.rename(columns=short_names, inplace=True)

In [53]:
# Subset frame holding only the temperature, humidity and wind-speed columns.
model_columns = ["temperature", "humidity", "windspeed", "apparenttemp"]
weather1 = weather[model_columns]

# Show the first three rows of the subset.
weather1.head(n=3)

Unnamed: 0,temperature,humidity,windspeed,apparenttemp
0,9.472,0.89,14.12,7.389
1,9.356,0.86,14.265,7.228
2,9.378,0.89,3.928,9.378


In [54]:
# Target variable: measured temperature minus apparent temperature, row by row.
weather["temp_dif"] = weather["temperature"].sub(weather["apparenttemp"])

# Show the first three rows, now including the new temp_dif column.
weather.head(n=3)

Unnamed: 0,Formatted Date,Summary,Precip Type,temperature,apparenttemp,humidity,windspeed,Wind Bearing (degrees),visibility,Loud Cover,Pressure (millibars),Daily Summary,temp_dif
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472,7.389,0.89,14.12,251.0,15.826,0.0,1015.13,Partly cloudy throughout the day.,2.083
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.356,7.228,0.86,14.265,259.0,15.826,0.0,1015.63,Partly cloudy throughout the day.,2.128
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.378,9.378,0.89,3.928,204.0,14.957,0.0,1015.94,Partly cloudy throughout the day.,0.0


In [55]:
# Pairwise correlations (pandas' default method, Pearson) between the numeric columns.
# The frame still holds text columns (e.g. "Summary", "Daily Summary"). Recent
# pandas releases no longer drop those silently inside corr() and raise instead,
# so the numeric columns are selected explicitly; this form also runs unchanged
# on older pandas versions.
weather.select_dtypes(include="number").corr()

Unnamed: 0,temperature,apparenttemp,humidity,windspeed,Wind Bearing (degrees),visibility,Loud Cover,Pressure (millibars),temp_dif
temperature,1.0,0.993,-0.632,0.009,0.03,0.393,,-0.005,-0.635
apparenttemp,0.993,1.0,-0.603,-0.057,0.029,0.382,,-0.0,-0.724
humidity,-0.632,-0.603,1.0,-0.225,0.001,-0.369,,0.005,0.242
windspeed,0.009,-0.057,-0.225,1.0,0.104,0.101,,-0.049,0.412
Wind Bearing (degrees),0.03,0.029,0.001,0.104,1.0,0.048,,-0.012,-0.014
visibility,0.393,0.382,-0.369,0.101,0.048,1.0,,0.06,-0.197
Loud Cover,,,,,,,,,
Pressure (millibars),-0.005,-0.0,0.005,-0.049,-0.012,0.06,,1.0,-0.03
temp_dif,-0.635,-0.724,0.242,0.412,-0.014,-0.197,,-0.03,1.0


In [56]:
# Baseline model: explain temp_dif with humidity and wind speed.

# Response series.
Y = weather["temp_dif"]

# Design matrix: the two regressors plus an intercept column
# (statsmodels' OLS does not add an intercept on its own).
X = sm.add_constant(weather[["humidity", "windspeed"]])

# Estimate by ordinary least squares and render the regression table.
results = sm.OLS(endog=Y, exog=X).fit()
display(results.summary())

0,1,2,3
Dep. Variable:,temp_dif,R-squared:,0.288
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,19490.0
Date:,"Tue, 07 Jul 2020",Prob (F-statistic):,0.0
Time:,22:36:08,Log-Likelihood:,-170460.0
No. Observations:,96453,AIC:,340900.0
Df Residuals:,96450,BIC:,340900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.4381,0.021,-115.948,0.000,-2.479,-2.397
humidity,3.0292,0.024,126.479,0.000,2.982,3.076
windspeed,0.1193,0.001,176.164,0.000,0.118,0.121

0,1,2,3
Omnibus:,3935.747,Durbin-Watson:,0.264
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4613.311
Skew:,0.478,Prob(JB):,0.0
Kurtosis:,3.484,Cond. No.,88.1


In [59]:
# Interaction model: humidity, wind speed and their product as regressors.

# Y is the target variable.
Y = weather["temp_dif"]

# Interaction of humidity and wind speed. It is computed from the columns of
# `weather` itself (not from a separate subset frame), so this cell only
# depends on the frame it writes to and needs no cross-frame index alignment.
weather["humidity_windspeed"] = weather["humidity"] * weather["windspeed"]

# X is the feature set: both main effects plus the interaction term.
X = weather[["humidity", "windspeed", "humidity_windspeed"]]

# Add an intercept column; statsmodels' OLS does not include one by default.
X = sm.add_constant(X)

# Fit an OLS model using statsmodels.
results = sm.OLS(Y, X).fit()

# Render the regression summary table.
display(results.summary())

0,1,2,3
Dep. Variable:,temp_dif,R-squared:,0.341
Model:,OLS,Adj. R-squared:,0.341
Method:,Least Squares,F-statistic:,16660.0
Date:,"Tue, 07 Jul 2020",Prob (F-statistic):,0.0
Time:,22:36:38,Log-Likelihood:,-166690.0
No. Observations:,96453,AIC:,333400.0
Df Residuals:,96449,BIC:,333400.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0839,0.033,-2.511,0.012,-0.149,-0.018
humidity,-0.1775,0.043,-4.133,0.000,-0.262,-0.093
windspeed,-0.0905,0.002,-36.797,0.000,-0.095,-0.086
humidity_windspeed,0.2971,0.003,88.470,0.000,0.291,0.304

0,1,2,3
Omnibus:,4849.937,Durbin-Watson:,0.262
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9295.404
Skew:,0.378,Prob(JB):,0.0
Kurtosis:,4.32,Cond. No.,193.0


In [60]:
# Visibility model: humidity and wind speed, with visibility as a third regressor.
regressors = ["humidity", "windspeed", "visibility"]

# Response series.
Y = weather["temp_dif"]

# Design matrix with an intercept column prepended
# (statsmodels' OLS does not add an intercept on its own).
X = sm.add_constant(weather[regressors])

# Estimate by ordinary least squares and render the regression table.
results = sm.OLS(Y, X).fit()
display(results.summary())

0,1,2,3
Dep. Variable:,temp_dif,R-squared:,0.304
Model:,OLS,Adj. R-squared:,0.303
Method:,Least Squares,F-statistic:,14010.0
Date:,"Tue, 07 Jul 2020",Prob (F-statistic):,0.0
Time:,22:36:49,Log-Likelihood:,-169380.0
No. Observations:,96453,AIC:,338800.0
Df Residuals:,96449,BIC:,338800.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.5756,0.028,-56.605,0.000,-1.630,-1.521
humidity,2.6066,0.025,102.784,0.000,2.557,2.656
windspeed,0.1199,0.001,179.014,0.000,0.119,0.121
visibility,-0.0540,0.001,-46.614,0.000,-0.056,-0.052

0,1,2,3
Omnibus:,3833.895,Durbin-Watson:,0.279
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4584.022
Skew:,0.459,Prob(JB):,0.0
Kurtosis:,3.545,Cond. No.,131.0


In [None]:
# First model (humidity + windspeed): the R-squared and adjusted R-squared values
# are the same, 0.288. I don't think this is satisfactory, because it means the
# model explains only 28.8% of the variance in the target variable, which is very low.

# After adding the interaction of humidity and windspeed, we estimated the model
# again and the results improved noticeably: the share of explained variance
# rises to 34.1% (R-squared = 0.341). Unfortunately, this is still not good enough.

# After adding visibility instead, R-squared increased from 0.288 to 0.304
# (adjusted R-squared: 0.303). Comparing the two improvements, adding the
# interaction term (0.341) raised the explained variance more than adding visibility.

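# To choose the best of the three models above, we also compared their AIC and BIC
# scores (lower is better). The interaction model has the lowest values (about
# 333,400, versus 338,800 with visibility and 340,900 for the first model), so
# adding the interaction term is the better choice by this measure as well.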