In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split, cross_val_score # Import train_test_split function and cross validation
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from statistics import mean 

In [2]:
!pip install pydotplus
!pip install graphviz
import pydotplus
import graphviz



You should consider upgrading via the 'python -m pip install --upgrade pip' command.




You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# Performing Exploratory Data Analysis

Importing train and test data

In [3]:
train = pd.read_csv("tennis.csv")
test = pd.read_csv("test.csv")

Showing few data from train and test dataset

In [4]:
train.head()

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [5]:
test.head()

Unnamed: 0,D15,Overcast,Cool,Normal,Strong
0,D16,Sunny,Mild,High,Weak
1,D17,Sunny,Cool,Normal,Weak
2,D18,Sunny,Hot,High,Weak
3,D19,Sunny,Mild,High,Weak


Notice that there is no colums names assigned to the test data. We need to set column names to the test data which we will see in the upcoming cells

In [6]:
#Assigning colums names to the test data
col_names = ['day', 'outlook', 'temp', 'humidity', 'wind']

#reading the test data with the above names
test = pd.read_csv("test.csv", names = col_names)

In [7]:
#Now our test data looks like this
test

Unnamed: 0,day,outlook,temp,humidity,wind
0,D15,Overcast,Cool,Normal,Strong
1,D16,Sunny,Mild,High,Weak
2,D17,Sunny,Cool,Normal,Weak
3,D18,Sunny,Hot,High,Weak
4,D19,Sunny,Mild,High,Weak


In [8]:
#dropping day column from train data
train.drop(labels = ['day'], axis=1, inplace = True)

In [9]:
train.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [10]:
#dropping day column from test data
test.drop(labels = ['day'], axis=1, inplace = True)

In [11]:
test.head()

Unnamed: 0,outlook,temp,humidity,wind
0,Overcast,Cool,Normal,Strong
1,Sunny,Mild,High,Weak
2,Sunny,Cool,Normal,Weak
3,Sunny,Hot,High,Weak
4,Sunny,Mild,High,Weak


##### Showing pairwise correlation of all columns in the dataframe

In [33]:
train.corr()

Unnamed: 0,outlook,temp,humidity,wind,play
outlook,1.0,0.33541,0.1690309,0.0,0.176383
temp,0.33541,1.0,0.5669467,0.1909407,0.197203
humidity,0.169031,0.566947,1.0,-3.2049380000000005e-17,0.447214
wind,0.0,0.190941,-3.2049380000000005e-17,1.0,-0.258199
play,0.176383,0.197203,0.4472136,-0.2581989,1.0


In [35]:
test.corr()

Unnamed: 0,outlook,temp,humidity,wind
outlook,1.0,0.534522,0.612372,1.0
temp,0.534522,1.0,0.872872,0.534522
humidity,0.612372,0.872872,1.0,0.612372
wind,1.0,0.534522,0.612372,1.0


##  Identify the predictor variables and encode any string variables to equivalent integer codes using factorize method

In [22]:
train['outlook'],_ = pd.factorize(train['outlook'])
train['temp'],_ = pd.factorize(train['temp'])
train['humidity'],_ = pd.factorize(train['humidity'])
train['wind'],_ = pd.factorize(train['wind'])
train['play'],_ = pd.factorize(train['play'])

train.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,0,0,0,0,0
1,0,0,0,1,0
2,1,0,0,0,1
3,2,1,0,0,1
4,2,2,1,0,1


In [None]:
## Check a few information about the data set 

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
outlook     14 non-null int64
temp        14 non-null int64
humidity    14 non-null int64
wind        14 non-null int64
play        14 non-null int64
dtypes: int64(5)
memory usage: 640.0 bytes


The training data has 14 rows and 7 colums.

There is no missing values in the dataset

In [24]:
test['outlook'],_ = pd.factorize(test['outlook'])
test['temp'],_ = pd.factorize(test['temp'])
test['humidity'],_ = pd.factorize(test['humidity'])
test['wind'],_ = pd.factorize(test['wind'])
test.head()

Unnamed: 0,outlook,temp,humidity,wind
0,0,0,0,0
1,1,1,1,1
2,1,0,0,1
3,1,2,1,1
4,1,1,1,1


In [25]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
outlook     5 non-null int64
temp        5 non-null int64
humidity    5 non-null int64
wind        5 non-null int64
dtypes: int64(4)
memory usage: 240.0 bytes


In [26]:
#Selecting predictor feature and the target variable
X_train = train.drop(labels = ['play'], axis=1)
y_train = train['play'].values

### Using Decision Tree Algorithm to predict the outcome



#### Using Entropy method


In [27]:
clf = DecisionTreeClassifier(criterion ='entropy')

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(test)

In [28]:
#Validating the model
scores = cross_val_score(clf, X_train, y_train, cv = 5)

In [29]:
y_pred

array([0, 1, 1, 1, 1], dtype=int64)

In [30]:
avg_cross_val_score = mean(scores)*100
print(avg_cross_val_score)

73.33333333333333


#### Using Gini index method 

In [31]:
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(test)

In [32]:
scores = cross_val_score(clf, X_train, y_train, cv = 5)
y_pred
avg_cross_val_score = mean(scores)*100
print(avg_cross_val_score)

63.33333333333333


In [None]:
# print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('cartOutput.png')
Image(graph.create_png())