In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from itertools import product

import dtutils

In [3]:
dataset = pd.read_csv('weather.csv')
flabels  = list(dataset)

In [4]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

In [5]:
#Splitting the dataset into the Training Set and Test Set 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size =0.33, random_state=1234)

In [6]:
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(1819, 43) (897, 43) (1819,) (897,)


## Question	2:	Naïve	Bayes
Train a Naïve Bayes classifier to predict RainTomorrow.
As all attributes are binary vectors, use the BernoulliNB classifier provided by scikit-learn.

In [7]:
clf2 = BernoulliNB()
clf2.fit(X_train, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [8]:
print(clf2.classes_, clf2.class_count_)

[0 1] [1234.  585.]


In [9]:
Y_train_pred = clf2.predict(X_train)
Y_test_pred = clf2.predict(X_test)

In [10]:
print('Correctly predicted on TRAINING SET: {}, errors:{}'.format(sum(Y_train==Y_train_pred), sum(Y_train!=Y_train_pred)))

Correctly predicted on TRAINING SET: 1376, errors:443


In [13]:
print('Correctly predicted on TEST set: {}, errors:{}'.format(sum(Y_test==Y_test_pred), sum(Y_test!=Y_test_pred)))

Correctly predicted on TEST set: 668, errors:229


In [11]:
print(classification_report(Y_train,Y_train_pred))
print('Accuracy on TRAINING set: {:.2f}'.format(accuracy_score(Y_train, Y_train_pred)))

             precision    recall  f1-score   support

          0       0.86      0.76      0.81      1234
          1       0.60      0.74      0.66       585

avg / total       0.78      0.76      0.76      1819

Accuracy on TRAINING set: 0.76


In [13]:
print(classification_report(Y_test, Y_test_pred))
print('Accuracy on TEST set: {:.2f}'.format(accuracy_score(Y_test, Y_test_pred)))

             precision    recall  f1-score   support

          0       0.85      0.76      0.80       607
          1       0.59      0.71      0.64       290

avg / total       0.76      0.74      0.75       897

Accuracy on TEST set: 0.74


In [14]:
print("Confused Matrix:[ TRAINING ] \n"," ".join(["{:3d}".format(d) for d in clf2.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_train, Y_train_pred, labels=clf2.classes_))

Confused Matrix:[ TRAINING ] 
   0   1 <-- PREDICTED LABEL
[[942 292]
 [151 434]]


In [15]:
print("Confused Matrix:[ TEST ] \n"," ".join(["{:3d}".format(d) for d in clf2.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_test, Y_test_pred, labels=clf2.classes_))

Confused Matrix:[ TEST ] 
   0   1 <-- PREDICTED LABEL
[[462 145]
 [ 84 206]]


In [16]:
Y_pred_proba = clf2.predict_proba(X_test)
print(Y_pred_proba[0])

[0.00307416 0.99692584]


# Question	3:	Decision	Tree
Train a DecisionTreeClassifier to predict RainTomorrow. Use argument class_weight=’balanced’ when
constructing the classifier, as the target variable RainTomorrow is not equally distributed in the data set.

In [17]:
dtclf = DecisionTreeClassifier(class_weight = "balanced",)
dtclf.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
print(dtclf.classes_)

[0 1]


In [19]:
dt_Y_train_pred = dtclf.predict(X_train)
dt_Y_test_pred = dtclf.predict(X_test)

In [20]:
print('Correctly predicted on TRAINING SET: {}, errors:{}'.
      format(sum(Y_train==dt_Y_train_pred), sum(Y_train!=dt_Y_train_pred)))

Correctly predicted on TRAINING SET: 1794, errors:25


In [21]:
print('Correctly predicted on TEST set: {}, errors:{}'.format(sum(Y_test==dt_Y_test_pred), sum(Y_test!=dt_Y_test_pred)))

Correctly predicted on TEST set: 615, errors:282


In [22]:
print(classification_report(Y_train,dt_Y_train_pred))
print('Accuracy on TRAINING set: {:.2f}'.format(accuracy_score(Y_train, dt_Y_train_pred)))

             precision    recall  f1-score   support

          0       1.00      0.98      0.99      1234
          1       0.96      1.00      0.98       585

avg / total       0.99      0.99      0.99      1819

Accuracy on TRAINING set: 0.99


In [23]:
print(classification_report(Y_test, dt_Y_test_pred))
print('Accuracy on TEST set: {:.2f}'.format(accuracy_score(Y_test, dt_Y_test_pred)))

             precision    recall  f1-score   support

          0       0.78      0.75      0.76       607
          1       0.51      0.55      0.53       290

avg / total       0.69      0.69      0.69       897

Accuracy on TEST set: 0.69


In [24]:
print("Confused Matrix:[ TRAINING ] \n"," ".join(["{:3d}".format(d) for d in dtclf.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_train, dt_Y_train_pred, labels=dtclf.classes_))

Confused Matrix:[ TRAINING ] 
   0   1 <-- PREDICTED LABEL
[[1209   25]
 [   0  585]]


In [25]:
print("Confused Matrix:[ TEST ] \n"," ".join(["{:3d}".format(d) for d in dtclf.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_test, dt_Y_test_pred, labels=dtclf.classes_))

Confused Matrix:[ TEST ] 
   0   1 <-- PREDICTED LABEL
[[456 151]
 [131 159]]


In [26]:
dt_Y_pred_proba = dtclf.predict_proba(X_test)
print(dt_Y_pred_proba[0])

[0. 1.]


In [27]:
dtutils.print_dt(dtclf, feature_names=flabels)

The binary tree structure has 1029 nodes (depth=19) and has the following tree structure:
node=0 test node: go to node 1 if RainToday <= 0.5 else to node 596.
	node=1 test node: go to node 2 if MinTemp_High <= 0.5 else to node 305.
		node=2 test node: go to node 3 if Sunshine_High <= 0.5 else to node 214.
			node=3 test node: go to node 4 if Temp3pm_Low <= 0.5 else to node 69.
				node=4 test node: go to node 5 if WindSpeed9am_Moderate <= 0.5 else to node 58.
					node=5 test node: go to node 6 if MaxTemp_High <= 0.5 else to node 53.
						node=6 test node: go to node 7 if Evaporation_High <= 0.5 else to node 28.
							node=7 test node: go to node 8 if MinTemp_Low <= 0.5 else to node 27.
								node=8 test node: go to node 9 if MaxTemp_Moderate <= 0.5 else to node 16.
									node=9 test node: go to node 10 if Humidity3pm_Moderate <= 0.5 else to node 11.
										node=10 leaf node. [class=0]  [2.211102106969206,0.0]
										node=11 test node: go to node 12 if WindSpeed9am_High <=

# Question	4:	Diagnosis
Does the Decision Tree model suffer from overfitting or underfitting? Justify why/why not.
If the model exhibits overfitting or underfitting, revise your training procedure to remedy the problem, and
re-evaluate the improved model. The DecisionTreeClassifier has a number of parameters that you can
consider for tuning the model:

* max_depth: maximum depth of the tree
* min_samples_leaf: minimum number of samples in each leaf node
* max_leaf_nodes: maximum number of leaf nodes

## Answer : Our Decision Tree model Suffers from ovefitting.

First, we try to find out the best parameters for our decision tree:

In [28]:
max_msl = max_md = max_mnl = max_acc = None

for msl, md, mnl in product(range(2,15), range(5, 50, 5), range(5, 100, 10)):
    options = dict(min_samples_leaf=msl, max_depth=md, max_leaf_nodes=mnl)
    clf3 = DecisionTreeClassifier(class_weight = "balanced", **options)
    clf3.fit(X_train, Y_train)

    Y_test_pred = clf3.predict(X_test)
    
    curr_acc = accuracy_score(Y_test, Y_test_pred)
    if max_acc == None or max_acc < curr_acc:
        max_msl, max_md, max_mnl, max_acc = msl, md, mnl, curr_acc

Then we use those parameters to evaluate the performance of our new model with best params:

In [29]:
best_params = dict(min_samples_leaf=max_msl, max_depth=max_md, max_leaf_nodes=max_mnl)

clf3 = DecisionTreeClassifier(class_weight = "balanced", **best_params)
clf3.fit(X_train, Y_train)

dtb_Y_train_pred = clf3.predict(X_train)
dtb_Y_test_pred = clf3.predict(X_test)

print(classification_report(Y_train,dtb_Y_train_pred))
print('Accuracy on TRAINING set: {:.2f}'.format(accuracy_score(Y_train,dtb_Y_train_pred))+"\n")

print(classification_report(Y_test, dtb_Y_test_pred))
print('Accuracy on TEST set: {:.2f}'.format(accuracy_score(Y_test, dtb_Y_test_pred))+"\n")

print("Confused Matrix[ TRAINING ]: \n"," ".join(["{:3d}".format(d) for d in clf3.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_train, dtb_Y_train_pred, labels=clf3.classes_))

print("Confused Matrix[ TEST ]: \n"," ".join(["{:3d}".format(d) for d in clf3.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_test, dtb_Y_test_pred, labels=clf3.classes_))

             precision    recall  f1-score   support

          0       0.86      0.77      0.82      1234
          1       0.61      0.74      0.67       585

avg / total       0.78      0.76      0.77      1819

Accuracy on TRAINING set: 0.76

             precision    recall  f1-score   support

          0       0.83      0.78      0.80       607
          1       0.59      0.67      0.63       290

avg / total       0.75      0.74      0.75       897

Accuracy on TEST set: 0.74

Confused Matrix[ TRAINING ]: 
   0   1 <-- PREDICTED LABEL
[[954 280]
 [153 432]]
Confused Matrix[ TEST ]: 
   0   1 <-- PREDICTED LABEL
[[471 136]
 [ 95 195]]


In [30]:
dt_Y_pred_proba = clf3.predict_proba(X_test)
print(dt_Y_pred_proba[0])

[0.22749908 0.77250092]


### Random Forest

In [31]:
rclf  = RandomForestClassifier(max_depth=2, random_state=0)
rclf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [32]:
print(rclf.classes_)

[0 1]


In [33]:
r_Y_train_pred = rclf.predict(X_train)
r_Y_test_pred = rclf.predict(X_test)

In [34]:
print('Correctly predicted on TRAINING SET: {}, errors:{}'.
      format(sum(Y_train==r_Y_train_pred), sum(Y_train!=r_Y_train_pred)))

Correctly predicted on TRAINING SET: 1413, errors:406


In [35]:
print('Correctly predicted on TEST set: {}, errors:{}'.format(sum(Y_test==r_Y_test_pred), sum(Y_test!=r_Y_test_pred)))

Correctly predicted on TEST set: 696, errors:201


In [36]:
print(classification_report(Y_train,r_Y_train_pred))
print('Accuracy on TRAINING set: {:.2f}'.format(accuracy_score(Y_train, r_Y_train_pred)))

             precision    recall  f1-score   support

          0       0.78      0.93      0.85      1234
          1       0.75      0.46      0.57       585

avg / total       0.77      0.78      0.76      1819

Accuracy on TRAINING set: 0.78


In [37]:
print(classification_report(Y_test, r_Y_test_pred))
print('Accuracy on TEST set: {:.2f}'.format(accuracy_score(Y_test, r_Y_test_pred)))

             precision    recall  f1-score   support

          0       0.78      0.93      0.85       607
          1       0.75      0.46      0.57       290

avg / total       0.77      0.78      0.76       897

Accuracy on TEST set: 0.78


In [38]:
print("Confused Matrix:[ TRAINING ] \n"," ".join(["{:3d}".format(d) for d in rclf.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_train, r_Y_train_pred, labels=rclf.classes_))

Confused Matrix:[ TRAINING ] 
   0   1 <-- PREDICTED LABEL
[[1144   90]
 [ 316  269]]


In [39]:
print("Confused Matrix:[ TEST ] \n"," ".join(["{:3d}".format(d) for d in rclf.classes_]),"<-- PREDICTED LABEL")
print(confusion_matrix(Y_test, r_Y_test_pred, labels=rclf.classes_))

Confused Matrix:[ TEST ] 
   0   1 <-- PREDICTED LABEL
[[562  45]
 [156 134]]


In [40]:
r_Y_pred_proba = rclf.predict_proba(X_test)
print(r_Y_pred_proba[0])

[0.44341298 0.55658702]
