In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load daily_weather.csv (comma-separated) from the working directory into a DataFrame.
data = pd.read_csv('daily_weather.csv', sep=',')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [4]:
# (row count, column count) of the loaded frame.
data.shape

(1095, 11)

In [11]:
# Per-column flag: True when the column contains at least one missing value.
# NOTE(review): the saved output (execution count 11) has no 'number' column and
# reports no missing values, yet the later dropna() cell removes rows. It looks
# like this cell was last run after the cleaning cells -- re-run top to bottom
# to confirm what it reports on the raw data.
data.isnull().any(axis=0)

air_pressure_9am          False
air_temp_9am              False
avg_wind_direction_9am    False
avg_wind_speed_9am        False
max_wind_direction_9am    False
max_wind_speed_9am        False
rain_accumulation_9am     False
rain_duration_9am         False
relative_humidity_9am     False
relative_humidity_3pm     False
dtype: bool

In [6]:
# Remove the 'number' column, which the feature list below does not use.
# drop(..., errors='ignore') returns a new frame and is a no-op when the column
# is already gone, so re-running this cell no longer raises KeyError the way
# `del data['number']` did.
data = data.drop(columns=['number'], errors='ignore')

In [7]:
# Remember how many rows exist before rows with missing values are removed.
before_rows = len(data.index)

In [8]:
# Display the row count captured before dropping incomplete rows.
before_rows

1095

In [4]:
# Keep only complete rows: any row with at least one missing value is discarded.
# The original row labels are kept, so the index has gaps afterwards.
data = data.dropna(axis=0, how='any')

In [10]:
# Row count after incomplete rows were removed.
after_rows = len(data.index)

In [12]:
# Display the row count that remains after dropping incomplete rows.
after_rows

1064

In [7]:
# Work on an independent deep copy so adding the label column leaves `data` untouched.
clean_data = data.copy(deep=True)

In [8]:
# Binary target: 1 when the 3pm relative humidity is above 24.99, otherwise 0.
clean_data['high_humidity_label'] = (
    clean_data['relative_humidity_3pm'] > 24.99
).astype(int)

In [16]:
# Check the new label column against relative_humidity_3pm on the first five rows.
clean_data.head(n=5)

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm,high_humidity_label
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16,1
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597,0
2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46,0
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547,0
4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74,1


In [35]:
# Target as a one-column DataFrame (list selector keeps it 2-D), copied so it is
# independent of clean_data.
y = clean_data.loc[:, ['high_humidity_label']].copy()

In [36]:
# Confirm the container type of the target. The full frame is deliberately not
# printed: dumping every row as plain text buries the rest of the notebook.
type(y)

pandas.core.frame.DataFrame

In [37]:
# First five raw 3pm humidity readings, for comparison with the derived label.
clean_data.loc[:, 'relative_humidity_3pm'].head(n=5)

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [38]:
# First five target labels.
y.head(n=5)

Unnamed: 0,high_humidity_label
0,1
1,0
2,0
3,0
4,1


In [5]:
# Names of the 9am measurement columns used as model inputs.
# NOTE(review): relative_humidity_9am exists in the data but is not listed here --
# confirm the omission is intentional.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [40]:
# Confirm the feature names are held in a plain Python list.
type(morning_features)

list

In [9]:
# Feature matrix: just the columns named in morning_features, copied so later
# changes to X cannot touch clean_data.
X = clean_data.loc[:, morning_features].copy()
# Last expression of the cell: shows the container type of X.
type(X)

pandas.core.frame.DataFrame

In [42]:
# List the feature columns that ended up in X.
X.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')

In [43]:
# List the column(s) held in the target frame y.
y.columns

Index(['high_humidity_label'], dtype='object')

In [44]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [53]:
# Check the container type of the training features returned by the split.
type(X_train)

pandas.core.frame.DataFrame

In [52]:
# Check the container type of the test features returned by the split.
type(X_test)


pandas.core.frame.DataFrame

In [48]:
# Check the container type of the training labels returned by the split.
type(y_train)

pandas.core.frame.DataFrame

In [49]:
# Check the container type of the test labels returned by the split.
type(y_test)

pandas.core.frame.DataFrame

In [50]:
# Peek at five training rows; the shuffled index labels show the split was randomized.
X_train.head(n=5)

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am
841,918.37,72.932,184.5,2.013246,186.7,2.773806,0.0,0.0
75,920.1,53.492,186.1,13.444009,193.8,15.367778,0.0,0.0
95,927.61,54.896,55.0,4.988376,53.4,7.202947,0.0,0.0
895,919.235153,65.951112,194.343333,2.942019,216.569792,3.65881,0.0,0.0
699,919.888128,68.687822,228.51773,3.960858,247.954028,5.185547,0.0,0.0


In [51]:
# Summary statistics of the training labels; for a 0/1 label the mean is the
# fraction of rows labelled 1.
y_train.describe()

Unnamed: 0,high_humidity_label
count,712.0
mean,0.494382
std,0.50032
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [54]:
# Decision tree limited to at most 10 leaf nodes; random_state is fixed so the
# fitted tree is repeatable.
humidity_classifier = DecisionTreeClassifier(
    max_leaf_nodes=10,
    random_state=0,
)
# Train on the training split. fit() returns the estimator itself, which is why
# the cell echoes the classifier's parameters.
humidity_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [55]:
# Show the concrete class of the fitted estimator.
type(humidity_classifier)

sklearn.tree.tree.DecisionTreeClassifier

In [56]:
# Predicted 0/1 humidity labels for the held-out test rows.
predictions = humidity_classifier.predict(X=X_test)

In [57]:
# First ten predicted labels, for eyeballing against the true labels.
predictions[0:10]

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [58]:
# True labels for the first ten test rows (positional slice), to compare with
# the first ten predictions.
y_test['high_humidity_label'].iloc[:10]

456     0
845     0
693     1
259     1
723     1
224     1
300     1
442     0
585     1
1057    1
Name: high_humidity_label, dtype: int64

In [59]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, predictions)

0.81534090909090906

In [12]:
# A single column selected from a DataFrame is a pandas Series. Bracket access is
# used instead of attribute access so the lookup cannot be confused with a
# DataFrame attribute or method of the same name.
type(X['air_pressure_9am'])

pandas.core.series.Series