In [30]:
import sys
import numpy as np
import pandas as pd

#np.set_printoptions(threshold=sys.maxsize)

Import data:

In [31]:
data = pd.read_csv('weatherHistory.csv')

Extract day, month, year from formatted date.

In [32]:
data['Date'] = pd.to_datetime(data['Formatted Date'], errors='ignore', utc=True)

In [33]:
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Hour'] = data['Date'].dt.hour

In [34]:
data.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Date,Year,Month,Day,Hour
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,2006-03-31 22:00:00+00:00,2006,3,31,22
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,2006-03-31 23:00:00+00:00,2006,3,31,23
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,2006-04-01 00:00:00+00:00,2006,4,1,0
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,2006-04-01 01:00:00+00:00,2006,4,1,1
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,2006-04-01 02:00:00+00:00,2006,4,1,2


Check if there is any missing value and if there is find out in which column.

In [35]:
data.isnull().any()

Formatted Date              False
Summary                     False
Precip Type                  True
Temperature (C)             False
Apparent Temperature (C)    False
Humidity                    False
Wind Speed (km/h)           False
Wind Bearing (degrees)      False
Visibility (km)             False
Loud Cover                  False
Pressure (millibars)        False
Daily Summary               False
Date                        False
Year                        False
Month                       False
Day                         False
Hour                        False
dtype: bool

Clear missing values

In [36]:
filtered_data = data.dropna()
filtered_data.isnull().any()
data = filtered_data

Convert categorical labels to numerical labels.

In [37]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
data['Precip Type'] = labelEncoder.fit_transform(data['Precip Type'].fillna('-1'))

Import scikit-learn

In [38]:
from sklearn.model_selection import train_test_split

Split data into train and test.

In [39]:
# y is the label
# X is the independent variables
X = data.iloc[:, 3:11]
y = data.iloc[:, 2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

#print("X_train\n----------------------------------------------------------------------------------\n",X_train, "\n")
#print("X_test\n----------------------------------------------------------------------------------\n",X_test, "\n")
#print("y_train\n----------------------------------------------------------------------------------\n",y_train, "\n")
#print("y_test\n----------------------------------------------------------------------------------\n",y_test, "\n")

In [40]:
####################################################################
#############              DECISION TREE           #################
####################################################################

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_test, y_test)
y_pred_decision_tree = classifier.predict(X_train)

count = 0
for i in range(len(y_test)):
    if y_test.iloc[i] == y_pred_decision_tree[i]:
        count += 1
print(count / len(y_test))


0.8015730124135317


In [42]:
from sklearn import tree
import graphviz

dot_data = tree.export_graphviz(classifier, out_file=None)
graph = graphviz.Source(dot_data)
graph.render('weather')

'weather.pdf'

In [43]:
####################################################################
#############              RANDOM FOREST           #################
####################################################################

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=1000, max_depth=4, random_state=0)
rf_classifier.fit(X_train, y_train)

print(rf_classifier.feature_importances_)

[5.46160175e-01 3.49651859e-01 1.26387683e-02 8.40371109e-03
 4.35870503e-04 4.31658619e-02 0.00000000e+00 3.95437547e-02]


Make prediction using classifier.

In [45]:
classified_data = rf_classifier.predict(X_test)
real_data = y_test

true_count = 0
for i in range(len(classified_data)):
    if classified_data[i] == real_data.iloc[i]:
        true_count += 1
print(len(classified_data), '|', true_count)
print('True percentage', true_count / len(classified_data))

31659 | 31659
True percentage 1.0


In [46]:
####################################################################
#############                ADA BOOST             #################
####################################################################

In [47]:
from sklearn.ensemble import AdaBoostClassifier

ada_boost_classifier = AdaBoostClassifier(n_estimators=100)

In [48]:
scores = cross_val_score(ada_boost_classifier, X_train, y_train, cv=2)
scores.mean()

1.0

In [49]:
scores = cross_val_score(ada_boost_classifier, X_train, y_train, cv=3)
scores.mean()

1.0

In [50]:
scores = cross_val_score(ada_boost_classifier, X_train, y_train, cv=5)
scores.mean()

1.0

In [51]:
scores = cross_val_score(ada_boost_classifier, X_train, y_train, cv=7)
scores.mean()

1.0

In [52]:
scores = cross_val_score(ada_boost_classifier, X_train, y_train, cv=10)
scores.mean()

1.0

In [53]:
####################################################################
#############         GRADIENT TREE BOOSTING       #################
####################################################################

In [54]:
from sklearn.ensemble import GradientBoostingClassifier

gradient_boost_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gradient_boost_classifier.fit(X_train, y_train)
gradient_boost_classifier.score(X_test, y_test)

1.0

In [55]:
gradient_boost_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=1, random_state=0)
gradient_boost_classifier.fit(X_train, y_train)
gradient_boost_classifier.score(X_test, y_test)

1.0

In [56]:
gradient_boost_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0)
gradient_boost_classifier.fit(X_train, y_train)
gradient_boost_classifier.score(X_test, y_test)

1.0

In [57]:
gradient_boost_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=0)
gradient_boost_classifier.fit(X_train, y_train)
gradient_boost_classifier.score(X_test, y_test)

1.0

In [58]:
gradient_boost_classifier = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, max_depth=2, random_state=0)
gradient_boost_classifier.fit(X_train, y_train)
gradient_boost_classifier.score(X_test, y_test)

1.0