# Linear Regression model 

We load our cleaned data and fit a linear regression model to it.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

In [2]:
# Read the cleaned weather table, parse the 'dt_iso' strings into timestamps
# (explicit format, so malformed values fail loudly) and use them as the index.
df_lr = (
    pd.read_csv('weather_data_lin_reg.csv')
    .assign(dt_iso=lambda frame: pd.to_datetime(frame['dt_iso'], format='%Y-%m-%d %H:%M:%S.%f'))
    .set_index('dt_iso')
)
df_lr.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15185 entries, 1979-01-04 to 2020-07-31
Data columns (total 10 columns):
wind_speed_1    15185 non-null float64
wind_speed_2    15185 non-null float64
wind_speed_3    15185 non-null float64
snow_1h_1       15185 non-null float64
snow_1h_2       15185 non-null float64
snow_1h_3       15185 non-null float64
temp_1          15185 non-null float64
temp_2          15185 non-null float64
temp_3          15185 non-null float64
temp            15185 non-null float64
dtypes: float64(10)
memory usage: 1.3 MB


In [3]:
# Import the libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

We isolate the dependent variable 'temp' and the independent variables.

In [4]:
# Predictor columns, grouped by quantity (wind speed, snowfall, temperature).
feature_columns = [
    'wind_speed_1', 'wind_speed_2', 'wind_speed_3',
    'snow_1h_1', 'snow_1h_2', 'snow_1h_3',
    'temp_1', 'temp_2', 'temp_3',
]
X = df_lr.loc[:, feature_columns]
# Selecting with a list keeps y as a one-column DataFrame rather than a Series.
y = df_lr.loc[:, ['temp']]

In [5]:
# Sanity check: list the feature frame's columns, dtypes and non-null counts.
X.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15185 entries, 1979-01-04 to 2020-07-31
Data columns (total 9 columns):
wind_speed_1    15185 non-null float64
wind_speed_2    15185 non-null float64
wind_speed_3    15185 non-null float64
snow_1h_1       15185 non-null float64
snow_1h_2       15185 non-null float64
snow_1h_3       15185 non-null float64
temp_1          15185 non-null float64
temp_2          15185 non-null float64
temp_3          15185 non-null float64
dtypes: float64(9)
memory usage: 1.2 MB


In [6]:
# Sanity check: the target frame should hold only the 'temp' column.
y.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15185 entries, 1979-01-04 to 2020-07-31
Data columns (total 1 columns):
temp    15185 non-null float64
dtypes: float64(1)
memory usage: 237.3 KB


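We split our data set into a training set (80%) and a test set (20%). A separate validation set is not created yet; the code for a three-way train/validation/test split is commented out in the cell below.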

In [7]:
# Ratios for a planned three-way split (currently disabled):
#train_ratio = 0.7
#validation_ratio = 0.2
#test_ratio = 0.1

# Active split: 20% of the rows are held out as the test set; random_state
# is fixed so the same rows are selected on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Disabled second step that would carve a validation set out of the held-out rows:
#x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42) 

In [8]:
# Check the size and columns of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12148 entries, 2016-06-05 to 1998-11-30
Data columns (total 9 columns):
wind_speed_1    12148 non-null float64
wind_speed_2    12148 non-null float64
wind_speed_3    12148 non-null float64
snow_1h_1       12148 non-null float64
snow_1h_2       12148 non-null float64
snow_1h_3       12148 non-null float64
temp_1          12148 non-null float64
temp_2          12148 non-null float64
temp_3          12148 non-null float64
dtypes: float64(9)
memory usage: 949.1 KB


In [9]:
#x_val.info()

In [10]:
# Check the size and columns of the test features.
x_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3037 entries, 2013-01-07 to 2005-06-02
Data columns (total 9 columns):
wind_speed_1    3037 non-null float64
wind_speed_2    3037 non-null float64
wind_speed_3    3037 non-null float64
snow_1h_1       3037 non-null float64
snow_1h_2       3037 non-null float64
snow_1h_3       3037 non-null float64
temp_1          3037 non-null float64
temp_2          3037 non-null float64
temp_3          3037 non-null float64
dtypes: float64(9)
memory usage: 237.3 KB


In [11]:
# Check that the training target has the same number of rows as x_train.
y_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12148 entries, 2016-06-05 to 1998-11-30
Data columns (total 1 columns):
temp    12148 non-null float64
dtypes: float64(1)
memory usage: 189.8 KB


In [12]:
#y_val.info()

In [13]:
# Check that the test target has the same number of rows as x_test.
y_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3037 entries, 2013-01-07 to 2005-06-02
Data columns (total 1 columns):
temp    3037 non-null float64
dtypes: float64(1)
memory usage: 47.5 KB


We fit a linear regression model on our training set. 

In [14]:
# Ordinary least squares with scikit-learn's default settings.
lm = LinearRegression()
lm.fit(x_train, y_train)
# fit() returns the estimator itself, so `model` and `lm` are the same fitted object.
model = lm

In [15]:
from sklearn.metrics import mean_squared_error

# Predict the held-out rows and report the mean squared error
# (in squared units of the 'temp' column).
pred = model.predict(x_test)
MSE = mean_squared_error(y_test, pred)

print(MSE)

11.6965051431


In [16]:
# Display the predictions; they come back as a two-dimensional, single-column array.
pred

array([[ 265.25218373],
       [ 276.37709173],
       [ 269.0409575 ],
       ..., 
       [ 281.82027543],
       [ 279.87635735],
       [ 292.21750244]])

In [17]:
# Display the observed test-set temperatures for comparison with the predictions.
y_test

Unnamed: 0_level_0,temp
dt_iso,Unnamed: 1_level_1
2013-01-07,261.667917
2009-10-18,275.703750
1995-12-02,269.072500
1989-07-03,295.644583
1997-06-27,292.361250
...,...
2006-03-19,267.111250
1983-02-10,255.195833
1997-05-23,284.936250
2008-04-08,280.783333


In [18]:
from sklearn.metrics import mean_absolute_error, median_absolute_error

# LinearRegression.score returns the coefficient of determination (R^2).
# That is a different metric from scikit-learn's explained_variance_score
# (the two only coincide when the residuals have zero mean), so the label
# names what is actually computed.
r2_test = lm.score(x_test, y_test)
print("The R^2 score (coefficient of determination): %.2f" % r2_test)

# NOTE(review): the temperatures shown in this notebook (~255-295) look like
# kelvin; an absolute difference has the same magnitude in kelvin and in
# degrees celsius, so the unit label is left unchanged.
print("The Mean Absolute Error: %.2f degrees celsius" % mean_absolute_error(y_test, pred))
print("The Median Absolute Error: %.2f degrees celsius" % median_absolute_error(y_test, pred))

The Explained Variance: 0.91
The Mean Absolute Error: 2.56 degrees celsius
The Median Absolute Error: 1.93 degrees celsius


We compare with a very simple prediction model. This model predicts that the temperature is the same as the temperature the previous day. 

In [19]:
# Naive baseline: reuse the 'temp_1' feature unchanged as the prediction.
pred_naive = x_test.loc[:, 'temp_1']
pred_naive.head()

dt_iso
2013-01-07    264.034167
2009-10-18    274.977083
1995-12-02    270.730769
1989-07-03    294.348400
1997-06-27    297.340833
Name: temp_1, dtype: float64

In [20]:
# Score the naive baseline with the same error metrics used for the regression.
naive_mean_abs_err = mean_absolute_error(y_test, pred_naive)
naive_median_abs_err = median_absolute_error(y_test, pred_naive)
print("The Mean Absolute Error: %.2f degrees celsius" % naive_mean_abs_err)
print("The Median Absolute Error: %.2f degrees celsius" % naive_median_abs_err)

The Mean Absolute Error: 2.82 degrees celsius
The Median Absolute Error: 2.10 degrees celsius


We see that our model does slightly better than the naive approach: its mean absolute error is 2.56 versus 2.82, and its median absolute error is 1.93 versus 2.10.

In [21]:
# TODO: find more statistical tests to check whether the model is good.
# TODO: check how to improve the model (e.g. with regularisation).
# TODO: do better with the cross-validation.
# TODO: make sure everything is okay.
# TODO: do an explicit prediction.