In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv("data/weatherHistory.csv")
data.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


Let's see the amount of data we have and where the *NaN* values are:

In [6]:
print(len(data))
print("\n")
print(data.isnull().sum())

96453


Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64


Because there is almost 100k rows of data in the dataset I'll simply remove the missing values - the column where missing values are is categorical, can't replace it with mean or something like that.

In [7]:
data.dropna(inplace=True)
data.isnull().sum()

Formatted Date              0
Summary                     0
Precip Type                 0
Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Loud Cover                  0
Pressure (millibars)        0
Daily Summary               0
dtype: int64

I'll now drop columns that are irrelevant for further analysis and ML model. Those columns are: *Formatted Date, Summary, Apparent Temperature (C), Daily Summary*

In [8]:
data.drop(["Formatted Date", "Summary", "Apparent Temperature (C)", "Daily Summary"], axis=1, inplace=True)
data.head()

Unnamed: 0,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
0,rain,9.472222,0.89,14.1197,251.0,15.8263,0.0,1015.13
1,rain,9.355556,0.86,14.2646,259.0,15.8263,0.0,1015.63
2,rain,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94
3,rain,8.288889,0.83,14.1036,269.0,15.8263,0.0,1016.41
4,rain,8.755556,0.83,11.0446,259.0,15.8263,0.0,1016.51


## Logistic Regression final prep

In [9]:
data["Precip Type"].value_counts()

rain    85224
snow    10712
Name: Precip Type, dtype: int64

I will want to convert "Percip Type" column to be of a binary type - 0 if it's raining and 1 if it's snowing. I'll use Label encoding to do that:

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data["Precip Type"].astype(str))

data["Precip Type"] = le.transform(data["Precip Type"])
data["Precip Type"].value_counts()

0    85224
1    10712
Name: Precip Type, dtype: int64

I can now proceed with Logistic Regression part. I'll split the data into train and test datasets and then train the model.

In [11]:
from sklearn.model_selection import train_test_split

X = data.drop("Precip Type", axis=1)
y = data["Precip Type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
predictions = log_reg.predict(X_test)

And now I can see the models accuracy:

In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25526
           1       0.97      1.00      0.98      3255

   micro avg       1.00      1.00      1.00     28781
   macro avg       0.98      1.00      0.99     28781
weighted avg       1.00      1.00      1.00     28781



Accuracy of around 100%. As expected.