## Problem Statement:
A cloth manufacturing company is interested to know about the segment or attributes causes high sale. 
Approach - A decision tree can be built with target variable Sale (we will first convert it in categorical variable) & all other variable will be independent in the analysis.  

## 1.Import the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix,accuracy_score

## 2.Import the data

In [2]:
company_data = pd.read_csv('Company_Data.csv')
company_data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


## 3.Data Understanding

In [3]:
company_data.shape

(400, 11)

In [4]:
company_data.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [5]:
company_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


## 4.Data Cleaning(EDA)

In [6]:
#To check if there is any duplicate data - NO DUPLICATES
company_data[company_data.duplicated()]

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US


In [7]:
#Converting Urban and US Columns to integers
company_data['Urban'] = np.where(company_data['Urban'].str.contains("yes"), 1, 0)
company_data['US'] = np.where(company_data['US'].str.contains("yes"), 1, 0)

In [8]:
company_data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,0,0
1,11.22,111,48,16,260,83,Good,65,10,0,0
2,10.06,113,35,10,269,80,Medium,59,12,0,0
3,7.40,117,100,4,466,97,Medium,55,14,0,0
4,4.15,141,64,3,340,128,Bad,38,13,0,0
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,0,0
396,6.14,139,23,3,37,120,Medium,55,11,0,0
397,7.41,162,26,12,368,159,Medium,40,18,0,0
398,5.94,100,79,7,284,95,Bad,50,12,0,0


In [9]:
#use label encoding to convert ShelveLoc
le = LabelEncoder()

In [10]:
company_data['ShelveLoc'] = le.fit_transform(company_data['ShelveLoc'])

In [11]:
company_data['ShelveLoc']

0      0
1      1
2      2
3      2
4      0
      ..
395    1
396    2
397    2
398    0
399    1
Name: ShelveLoc, Length: 400, dtype: int32

In [12]:
company_data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,0,42,17,0,0
1,11.22,111,48,16,260,83,1,65,10,0,0
2,10.06,113,35,10,269,80,2,59,12,0,0
3,7.40,117,100,4,466,97,2,55,14,0,0
4,4.15,141,64,3,340,128,0,38,13,0,0
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,1,33,14,0,0
396,6.14,139,23,3,37,120,2,55,11,0,0
397,7.41,162,26,12,368,159,2,40,18,0,0
398,5.94,100,79,7,284,95,0,50,12,0,0


In [13]:
company_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    int32  
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    int32  
 10  US           400 non-null    int32  
dtypes: float64(1), int32(3), int64(7)
memory usage: 29.8 KB


## 5.Model Building

In [14]:
X = company_data.drop(labels="Sales",axis=1).values

In [15]:
X

array([[138,  73,  11, ...,  17,   0,   0],
       [111,  48,  16, ...,  10,   0,   0],
       [113,  35,  10, ...,  12,   0,   0],
       ...,
       [162,  26,  12, ...,  18,   0,   0],
       [100,  79,   7, ...,  12,   0,   0],
       [134,  37,   0, ...,  16,   0,   0]], dtype=int64)

In [16]:
#Converting the continious variable to catogirical data
y = le.fit_transform(pd.cut(company_data['Sales'], bins=3, retbins=True)[0])

In [17]:
y

array([1, 2, 1, 1, 0, 1, 1, 2, 1, 0, 1, 2, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2,
       0, 1, 1, 2, 1, 0, 0, 1, 2, 1, 1, 1, 0, 2, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 2, 0, 0, 1, 0, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 0, 1, 1, 2,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 0, 1, 1, 0, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 2, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 2, 0, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 2,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 0, 0, 2, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 1, 0, 2, 1, 1,

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [19]:
X_train.shape

(320, 10)

In [20]:
X_test.shape

(80, 10)

In [21]:
y_train.shape

(320,)

## 6.Model Training

In [39]:
model = DecisionTreeClassifier(criterion='gini',max_depth=4)
model.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=4)

## 7.Model Testing

In [40]:
#Training data
y_train_pred = model.predict(X_train)
y_train_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 2, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 2, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,

In [41]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [42]:
accuracy_score(y_train,y_train_pred)

0.790625

In [43]:
confusion_matrix(y_train,y_train_pred) #104 FN predictions

array([[ 46,  37,   0],
       [  4, 190,   4],
       [  0,  22,  17]], dtype=int64)

In [44]:
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       0.92      0.55      0.69        83
           1       0.76      0.96      0.85       198
           2       0.81      0.44      0.57        39

    accuracy                           0.79       320
   macro avg       0.83      0.65      0.70       320
weighted avg       0.81      0.79      0.77       320



In [45]:
#Testing data
y_test_pred = model.predict(X_test)
y_test_pred

array([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [46]:
accuracy_score(y_test,y_test_pred)

0.7

In [47]:
confusion_matrix(y_test,y_test_pred) 

array([[ 4, 15,  0],
       [ 1, 47,  1],
       [ 0,  7,  5]], dtype=int64)

In [48]:
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.80      0.21      0.33        19
           1       0.68      0.96      0.80        49
           2       0.83      0.42      0.56        12

    accuracy                           0.70        80
   macro avg       0.77      0.53      0.56        80
weighted avg       0.73      0.70      0.65        80



### Finding out the best hyperparameter for building the Tree

In [34]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = model,param_grid = {'criterion' : ['gini','entropy'],'max_depth' : [3,4,5,6,7,8,9,10]},cv = 5)

In [35]:
grid_search.fit(X,y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(max_depth=3),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]})

In [36]:
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=4)

In [37]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 4}

In [38]:
grid_search.best_score_

0.7125