In [49]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [5]:
df = pd.read_excel('KTM Raw data.xlsx')

In [7]:
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,1,53,Male,Professional,Average,180 to 220,Complicated,Not purchased
1,2,27,Female,Self Employed,Low End,No Bike,Single,Purchased
2,3,39,Female,Unemployed,Average,180 to 220,Married,Not purchased
3,4,20,Female,Unemployed,High End,No Bike,Married,Not purchased
4,5,29,Male,Student,Average,180 to 220,Complicated,Purchased


## Encoding categorical columns

In [10]:
le = LabelEncoder()

In [11]:
df['Gender'] = le.fit_transform(df['Gender'])

In [12]:
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,1,53,1,Professional,Average,180 to 220,Complicated,Not purchased
1,2,27,0,Self Employed,Low End,No Bike,Single,Purchased
2,3,39,0,Unemployed,Average,180 to 220,Married,Not purchased
3,4,20,0,Unemployed,High End,No Bike,Married,Not purchased
4,5,29,1,Student,Average,180 to 220,Complicated,Purchased


In [13]:
df['Phone Type'] = le.fit_transform(df['Phone Type'])
df['Current Bike'] = le.fit_transform(df['Current Bike'])
df['Response'] = le.fit_transform(df['Response'])

In [14]:
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,1,53,1,Professional,0,1,Complicated,0
1,2,27,0,Self Employed,2,4,Single,1
2,3,39,0,Unemployed,0,1,Married,0
3,4,20,0,Unemployed,1,4,Married,0
4,5,29,1,Student,0,1,Complicated,1


In [17]:
occ = pd.get_dummies(df['Occupation'])

In [18]:
occ

Unnamed: 0,Professional,Self Employed,Student,Unemployed
0,1,0,0,0
1,0,1,0,0
2,0,0,0,1
3,0,0,0,1
4,0,0,1,0
...,...,...,...,...
1513,0,0,1,0
1514,0,0,1,0
1515,0,0,1,0
1516,0,0,1,0


In [19]:
df[['Professional','Self Employed','Student','Unemployed']] = occ

In [20]:
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response,Professional,Self Employed,Student,Unemployed
0,1,53,1,Professional,0,1,Complicated,0,1,0,0,0
1,2,27,0,Self Employed,2,4,Single,1,0,1,0,0
2,3,39,0,Unemployed,0,1,Married,0,0,0,0,1
3,4,20,0,Unemployed,1,4,Married,0,0,0,0,1
4,5,29,1,Student,0,1,Complicated,1,0,0,1,0


In [21]:
rel = pd.get_dummies(df['Relationship'])

In [22]:
rel

Unnamed: 0,Committed,Complicated,Married,Single
0,0,1,0,0
1,0,0,0,1
2,0,0,1,0
3,0,0,1,0
4,0,1,0,0
...,...,...,...,...
1513,1,0,0,0
1514,0,0,1,0
1515,0,0,1,0
1516,0,0,0,1


In [23]:
df[['Committed','Complicated','Married','Single']] = rel

In [24]:
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response,Professional,Self Employed,Student,Unemployed,Committed,Complicated,Married,Single
0,1,53,1,Professional,0,1,Complicated,0,1,0,0,0,0,1,0,0
1,2,27,0,Self Employed,2,4,Single,1,0,1,0,0,0,0,0,1
2,3,39,0,Unemployed,0,1,Married,0,0,0,0,1,0,0,1,0
3,4,20,0,Unemployed,1,4,Married,0,0,0,0,1,0,0,1,0
4,5,29,1,Student,0,1,Complicated,1,0,0,1,0,0,1,0,0


In [26]:
x = df.drop(['ID','Occupation','Relationship','Response'],axis = 1)

In [27]:
x

Unnamed: 0,Age,Gender,Phone Type,Current Bike,Professional,Self Employed,Student,Unemployed,Committed,Complicated,Married,Single
0,53,1,0,1,1,0,0,0,0,1,0,0
1,27,0,2,4,0,1,0,0,0,0,0,1
2,39,0,0,1,0,0,0,1,0,0,1,0
3,20,0,1,4,0,0,0,1,0,0,1,0
4,29,1,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1513,19,1,1,0,0,0,1,0,1,0,0,0
1514,18,1,1,3,0,0,1,0,0,0,1,0
1515,24,0,1,1,0,0,1,0,0,0,1,0
1516,23,0,1,0,0,0,1,0,0,0,0,1


In [28]:
y = df['Response']

In [29]:
y

0       0
1       1
2       0
3       0
4       1
       ..
1513    1
1514    1
1515    1
1516    1
1517    1
Name: Response, Length: 1518, dtype: int32

## Train-Test Split

In [32]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3, random_state = 42)

In [35]:
df.shape

(1518, 16)

In [36]:
x.shape

(1518, 12)

In [37]:
x_train.shape

(1062, 12)

In [38]:
x_test.shape

(456, 12)

In [39]:
y_train.shape

(1062,)

In [40]:
y_test.shape

(456,)

## Model Building

In [45]:
model = DecisionTreeClassifier()

In [46]:
model.fit(x_train,y_train)

DecisionTreeClassifier()

In [51]:
y_pred = model.predict(x_test)

In [52]:
np.array(y_test)

array([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,

In [53]:
accuracy_score(y_test, y_pred)

0.7412280701754386

In [54]:
y_train_pred = model.predict(x_train)

In [55]:
accuracy_score(y_train,y_train_pred)

0.9962335216572504

* Decison Tree is an OVERFITTING MODEL. It basically gives good accuracy for training data and poor accuracy for testing data. In other words it overfits the training data and cannot GENERALIZE the testing data.