In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [8]:
# Load the hourly weather records from the CSV file into a DataFrame.
csv_path = "data/weatherHistory.csv"
data = pd.read_csv(csv_path)

In [9]:
# Preview the first five rows to get a feel for the columns.
data.head(n=5)

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [10]:
# List the dtype of every column; `object` columns hold non-numeric values.
data.dtypes

Formatted Date               object
Summary                      object
Precip Type                  object
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)      float64
Visibility (km)             float64
Loud Cover                  float64
Pressure (millibars)        float64
Daily Summary                object
dtype: object

So, some conversion from string to numeric type will be necessary before fitting the data to an ML model.

I won't use all of the columns (don't want the cleaning part to be too long). I'll drop: *Summary, Formatted Date, Apparent Temperature (C), Daily Summary*:

In [11]:
# Columns left out of the analysis to keep the cleaning step short.
columns_to_drop = ["Summary", "Formatted Date", "Apparent Temperature (C)", "Daily Summary"]

# Rebind `data` to the reduced frame rather than mutating it with inplace=True;
# `columns=` states the axis explicitly instead of relying on `axis=1`.
data = data.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
0,rain,9.472222,0.89,14.1197,251.0,15.8263,0.0,1015.13
1,rain,9.355556,0.86,14.2646,259.0,15.8263,0.0,1015.63
2,rain,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94
3,rain,8.288889,0.83,14.1036,269.0,15.8263,0.0,1016.41
4,rain,8.755556,0.83,11.0446,259.0,15.8263,0.0,1016.51


I will need to encode the "Precip Type" column.

First, let's see if there are missing values in the dataset:

In [20]:
# Count the missing values in each column.
data.isna().sum()

Precip Type               517
Temperature (C)             0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Loud Cover                  0
Pressure (millibars)        0
dtype: int64

Seems like there are some missing values in our only non-numeric column. Since this is a big dataset I'll just drop them:

In [21]:
# Drop every row that contains a missing value. Rebinding `data` (instead of
# inplace=True) keeps the step a plain expression with no in-place mutation.
data = data.dropna()

In [22]:
# Re-check the per-column missing-value counts after dropping rows.
data.isna().sum()

Precip Type               0
Temperature (C)           0
Humidity                  0
Wind Speed (km/h)         0
Wind Bearing (degrees)    0
Visibility (km)           0
Loud Cover                0
Pressure (millibars)      0
dtype: int64

In [23]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical "Precip Type" labels as integers.
# Fitting and transforming the *same* string-cast series in a single call keeps
# the learned classes consistent with the values being encoded (previously the
# encoder was fitted on `.astype(str)` but applied to the raw column).
label_encoder = LabelEncoder()
data["Precip Type"] = label_encoder.fit_transform(data["Precip Type"].astype(str))

In [24]:
data.head()

Unnamed: 0,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
0,0,9.472222,0.89,14.1197,251.0,15.8263,0.0,1015.13
1,0,9.355556,0.86,14.2646,259.0,15.8263,0.0,1015.63
2,0,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94
3,0,8.288889,0.83,14.1036,269.0,15.8263,0.0,1016.41
4,0,8.755556,0.83,11.0446,259.0,15.8263,0.0,1016.51


Okay, now all of the data is in numeric form.

There's actually no need for One Hot Encoding, since the "Precip Type" column has only 2 possible values and is therefore encoded as just 0 or 1.


Now I can proceed to the train/test split:

In [27]:
from sklearn.model_selection import train_test_split

I will try to predict the temperature value based on the other columns. So, **y** will be the "Temperature (C)" column, and **X** will be every other column:

In [35]:
# Target: the temperature column. Features: every remaining column.
target_column = "Temperature (C)"
X = data.drop(columns=target_column)
y = data[target_column]

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across runs. Outputs saved
# from an earlier unseeded run will differ slightly after re-execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [36]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split; `fit` returns the
# estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Last expression of the cell: display the fitted estimator.
lr_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [37]:
# Predict the target for the held-out test features.
y_pred = lr_model.predict(X_test)

The coefficient and intercept values:

In [38]:
# Report the fitted parameters: one coefficient per feature column, then the intercept.
report_lines = [
    "Coefficients: {}".format(lr_model.coef_),
    "Intercept value: {}".format(lr_model.intercept_),
]
print("\n".join(report_lines))

Coefficients: [-1.28599622e+01 -2.64856147e+01 -2.08986711e-01  2.23170752e-03
  1.72225024e-01  0.00000000e+00 -8.50964115e-04]
Intercept value: 33.7374750182926


R Squared

In [39]:
from sklearn.metrics import r2_score

In [40]:
# R² of the predictions on the held-out test set -- not a very good score.
r2_score(y_true=y_test, y_pred=y_pred)

0.6064488239312715