In [1]:
!pip install matplotlib
!pip install seaborn
!pip install sklearn
!pip install pydotplus
!pip install numpy
!pip install pandas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
import sklearn as skl
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics 
import pydotplus



In [9]:
# Read the raw weather observations from the CSV next to the notebook, keep a
# handle on the original column index, and preview the first five rows.
weather_df = pd.read_csv('weather3.csv')
columns = weather_df.columns
weather_df.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [11]:
# Dimensions of the freshly loaded dataset, as (rows, columns)
weather_df.shape

(145460, 23)

In [12]:
# Summary statistics for every column; include='all' also covers the non-numeric ones
weather_df.describe(include='all')

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
count,145460,145460,143975.0,144199.0,142199.0,82670.0,75625.0,135134,135198.0,134894,...,142806.0,140953.0,130395.0,130432.0,89572.0,86102.0,143693.0,141851.0,142199,142193
unique,3436,49,,,,,,16,,16,...,,,,,,,,,2,2
top,2013-06-12,Canberra,,,,,,W,,N,...,,,,,,,,,No,No
freq,49,3436,,,,,,9915,,11758,...,,,,,,,,,110319,110316
mean,,,12.194034,23.221348,2.360918,5.468232,7.611178,,40.031554,,...,68.880831,51.539116,1017.64994,1015.255889,4.447461,4.50993,16.990631,21.68339,,
std,,,6.398495,7.119049,8.47806,4.193704,3.785483,,13.615465,,...,19.029164,20.795902,7.10653,7.037414,2.887159,2.720357,6.488753,6.93665,,
min,,,-8.5,-4.8,0.0,0.0,0.0,,-33.0,,...,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4,,
25%,,,7.6,17.9,0.0,2.6,4.8,,31.0,,...,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6,,
50%,,,12.0,22.6,0.0,4.8,8.4,,39.0,,...,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1,,
75%,,,16.9,28.2,0.8,7.4,10.6,,48.0,,...,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4,,


In [13]:
# The count row of the summary shows that several columns are not populated in
# every row. List the non-null count per column, smallest first, to see which
# columns are missing values.
non_null_counts = weather_df.count()
non_null_counts.sort_values()

Sunshine          75625
Evaporation       82670
Cloud3pm          86102
Cloud9am          89572
Pressure9am      130395
Pressure3pm      130432
WindDir9am       134894
WindGustDir      135134
WindGustSpeed    135198
Humidity3pm      140953
WindDir3pm       141232
Temp3pm          141851
RainTomorrow     142193
RainToday        142199
Rainfall         142199
WindSpeed3pm     142398
Humidity9am      142806
Temp9am          143693
WindSpeed9am     143693
MinTemp          143975
MaxTemp          144199
Location         145460
Date             145460
dtype: int64

In [18]:
# Sunshine, Evaporation, Cloud3pm and Cloud9am are each populated in only about
# 52-62% of the rows, so they are left out of the prediction features.
sparse_columns = ['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am']

# `columns=` already says which axis is meant, so no separate `axis` argument is needed.
weather_df = weather_df.drop(columns=sparse_columns)
weather_df.shape

(145460, 19)

In [19]:
# Only Location and Date are populated in every row, so remove each row that
# still contains at least one NaN and re-check the per-column counts.

weather_df = weather_df.dropna(axis=0, how='any')
remaining_counts = weather_df.count()
remaining_counts.sort_values()

Date             112925
Temp3pm          112925
Temp9am          112925
Pressure3pm      112925
Pressure9am      112925
Humidity3pm      112925
Humidity9am      112925
WindSpeed3pm     112925
RainToday        112925
WindSpeed9am     112925
WindDir9am       112925
WindGustSpeed    112925
WindGustDir      112925
Rainfall         112925
MaxTemp          112925
MinTemp          112925
Location         112925
WindDir3pm       112925
RainTomorrow     112925
dtype: int64

In [20]:
# Dimensions after removing the incomplete rows, as (rows, columns)
weather_df.shape

(112925, 19)

In [21]:
# Repeat the all-column summary on the cleaned frame
weather_df.describe(include='all')

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
count,112925,112925,112925.0,112925.0,112925.0,112925,112925.0,112925,112925,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925,112925
unique,3417,44,,,,16,,16,16,,,,,,,,,2,2
top,2014-01-08,Darwin,,,,W,,N,SE,,,,,,,,,No,No
freq,44,3117,,,,8380,,9815,8325,,,,,,,,,87556,87906
mean,,,12.664721,23.65567,2.377892,,40.783396,,,15.179163,19.500545,67.404162,50.669055,1017.425847,1015.048962,17.462008,22.127306,,
std,,,6.254135,6.982702,8.602968,,13.329173,,,8.344304,8.580097,18.91161,20.76855,7.032654,6.957822,6.355045,6.83738,,
min,,,-8.2,2.6,0.0,,-33.0,,,2.0,2.0,0.0,0.0,980.5,977.1,-3.1,1.7,,
25%,,,8.1,18.3,0.0,,31.0,,,9.0,13.0,56.0,36.0,1012.8,1010.3,12.7,16.9,,
50%,,,12.4,23.1,0.0,,39.0,,,13.0,19.0,68.0,51.0,1017.4,1015.0,17.1,21.6,,
75%,,,17.2,28.7,0.8,,48.0,,,20.0,24.0,81.0,65.0,1022.1,1019.7,22.0,26.9,,


In [26]:
# Categorical columns come next: list the distinct labels of each one to spot any
# value that does not belong in the column. RainToday and RainTomorrow have only
# 2 distinct values each, so they are not listed here.
# Location has 44 distinct values.
location_column = weather_df['Location']
unique_location_values = location_column.unique()
unique_location_values

array(['Albury', 'BadgerysCreek', 'Cobar', 'CoffsHarbour', 'Moree',
       'NorahHead', 'NorfolkIsland', 'Richmond', 'Sydney',
       'SydneyAirport', 'WaggaWagga', 'Williamtown', 'Wollongong',
       'Canberra', 'Tuggeranong', 'Ballarat', 'Bendigo', 'Sale',
       'MelbourneAirport', 'Melbourne', 'Mildura', 'Nhil', 'Portland',
       'Watsonia', 'Dartmoor', 'Brisbane', 'Cairns', 'GoldCoast',
       'Townsville', 'Adelaide', 'MountGambier', 'Nuriootpa', 'Woomera',
       'Witchcliffe', 'PearceRAAF', 'PerthAirport', 'Perth', 'Walpole',
       'Hobart', 'Launceston', 'AliceSprings', 'Darwin', 'Katherine',
       'Uluru'], dtype=object)

In [27]:
# Distinct labels of WindGustDir (the summary reports 16 of them)
wind_gust_dir_column = weather_df['WindGustDir']
unique_WindGustDir_values = wind_gust_dir_column.unique()
unique_WindGustDir_values

array(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', 'ENE', 'SSE',
       'S', 'NW', 'SE', 'ESE', 'E', 'SSW'], dtype=object)

In [28]:
# Distinct labels of WindDir9am (the summary reports 16 of them)
wind_dir_9am_column = weather_df['WindDir9am']
unique_WindDir9am_values = wind_dir_9am_column.unique()
unique_WindDir9am_values

array(['W', 'NNW', 'SE', 'ENE', 'SW', 'SSE', 'S', 'NE', 'SSW', 'N', 'WSW',
       'ESE', 'E', 'NW', 'WNW', 'NNE'], dtype=object)

In [29]:
# Distinct labels of WindDir3pm (the summary reports 16 of them)
wind_dir_3pm_column = weather_df['WindDir3pm']
unique_WindDir3pm_values = wind_dir_3pm_column.unique()
unique_WindDir3pm_values

array(['WNW', 'WSW', 'E', 'NW', 'W', 'SSE', 'ESE', 'ENE', 'NNW', 'SSW',
       'SW', 'SE', 'N', 'S', 'NNE', 'NE'], dtype=object)

In [41]:
# Encode the two Yes/No rain flags as integers (No -> 0, Yes -> 1).
#
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `weather_df[col]`: that form mutates a
# temporary Series, which recent pandas 2.x releases flag with a FutureWarning
# and which leaves `weather_df` untouched once Copy-on-Write is active.
# The explicit cast keeps both columns numeric whether or not `replace`
# downcasts its object result. Values that are already 0/1 are left alone,
# so re-running the cell does not change the data.
rain_flag_codes = {'No': 0, 'Yes': 1}
for rain_flag_col in ('RainToday', 'RainTomorrow'):
    weather_df[rain_flag_col] = (
        weather_df[rain_flag_col].replace(rain_flag_codes).astype('int64')
    )
weather_df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,0
1,2008-12-02,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,0
2,2008-12-03,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,0
3,2008-12-04,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,0
4,2008-12-05,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,2017-06-20,Uluru,3.5,21.8,0.0,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,9.4,20.9,0,0
145455,2017-06-21,Uluru,2.8,23.4,0.0,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,10.1,22.4,0,0
145456,2017-06-22,Uluru,3.6,25.3,0.0,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,10.9,24.5,0,0
145457,2017-06-23,Uluru,5.4,26.9,0.0,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,12.5,26.1,0,0


In [42]:
# Integer-encode Location: each station is numbered by the order in which it
# first appears in the frame (the first station gets code 0).
location_order = weather_df['Location'].unique()
location_categorical = pd.Categorical(weather_df['Location'], categories=location_order)
weather_df['Location_num'] = location_categorical.codes

In [63]:
# Show the WindGustDir labels in sorted order so they can be compared with the other wind columns
np.sort(unique_WindGustDir_values)

array(['E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NW', 'S', 'SE', 'SSE',
       'SSW', 'SW', 'W', 'WNW', 'WSW'], dtype=object)

In [65]:
# Show the WindDir9am labels in sorted order for the same comparison
np.sort(unique_WindDir9am_values)

array(['E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NW', 'S', 'SE', 'SSE',
       'SSW', 'SW', 'W', 'WNW', 'WSW'], dtype=object)

In [67]:
# Show the WindDir3pm labels in sorted order for the same comparison
np.sort(unique_WindDir3pm_values)

array(['E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NW', 'S', 'SE', 'SSE',
       'SSW', 'SW', 'W', 'WNW', 'WSW'], dtype=object)

In [62]:
# WindGustDir, WindDir9am and WindDir3pm share the same set of direction labels,
# so all three are encoded against one sorted label list. A given code therefore
# stands for the same direction in every column, which makes the numbers easy to
# read back.
wind_dict = np.sort(unique_WindGustDir_values)

# Encoded column -> source column, in the order the new columns are appended.
encoded_wind_columns = {
    'WindGustDir_num': 'WindGustDir',
    'WindDir9am_num': 'WindDir9am',
    'WindDir3pm_num': 'WindDir3pm',
}
for encoded_name, source_name in encoded_wind_columns.items():
    direction_categorical = pd.Categorical(
        weather_df[source_name], categories=wind_dict, ordered=True
    )
    weather_df[encoded_name] = direction_categorical.codes
weather_df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Location_num,WindGustDir_num,WindDir9am_num,WindDir3pm_num
0,2008-12-01,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,...,1007.7,1007.1,16.9,21.8,0,0,0,13,13,14
1,2008-12-02,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,...,1010.6,1007.8,17.2,24.3,0,0,0,14,6,15
2,2008-12-03,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,...,1007.6,1008.7,21.0,23.2,0,0,0,15,13,15
3,2008-12-04,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,...,1017.6,1012.8,18.1,26.5,0,0,0,4,9,0
4,2008-12-05,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,...,1010.8,1006.0,17.8,29.7,0,0,0,13,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,2017-06-20,Uluru,3.5,21.8,0.0,E,31.0,ESE,E,15.0,...,1024.7,1021.2,9.4,20.9,0,0,43,0,2,0
145455,2017-06-21,Uluru,2.8,23.4,0.0,E,31.0,SE,ENE,13.0,...,1024.6,1020.3,10.1,22.4,0,0,43,0,9,1
145456,2017-06-22,Uluru,3.6,25.3,0.0,NNW,22.0,SE,N,13.0,...,1023.5,1019.1,10.9,24.5,0,0,43,6,9,3
145457,2017-06-23,Uluru,5.4,26.9,0.0,N,37.0,SE,WNW,9.0,...,1021.0,1016.8,12.5,26.1,0,0,43,3,9,14


In [68]:
# Numeric summary of the frame, which now also covers the integer-encoded columns
weather_df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Location_num,WindGustDir_num,WindDir9am_num,WindDir3pm_num
count,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0,112925.0
mean,12.664721,23.65567,2.377892,40.783396,15.179163,19.500545,67.404162,50.669055,1017.425847,1015.048962,17.462008,22.127306,0.224654,0.221554,21.156369,7.720655,7.237618,7.759478
std,6.254135,6.982702,8.602968,13.329173,8.344304,8.580097,18.91161,20.76855,7.032654,6.957822,6.355045,6.83738,0.417356,0.415294,12.337493,4.673142,4.550401,4.615792
min,-8.2,2.6,0.0,-33.0,2.0,2.0,0.0,0.0,980.5,977.1,-3.1,1.7,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.1,18.3,0.0,31.0,9.0,13.0,56.0,36.0,1012.8,1010.3,12.7,16.9,0.0,0.0,10.0,3.0,3.0,4.0
50%,12.4,23.1,0.0,39.0,13.0,19.0,68.0,51.0,1017.4,1015.0,17.1,21.6,0.0,0.0,22.0,8.0,7.0,8.0
75%,17.2,28.7,0.8,48.0,20.0,24.0,81.0,65.0,1022.1,1019.7,22.0,26.9,0.0,0.0,32.0,12.0,11.0,12.0
max,33.9,48.1,367.6,135.0,87.0,87.0,100.0,100.0,1041.0,1039.6,40.2,46.7,1.0,1.0,43.0,15.0,15.0,15.0
