In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

In [2]:
cd=pd.read_csv("Company_Data.csv")
cd.head(5)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


# EDA

In [3]:
cd.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [4]:
cd.shape

(400, 11)

In [5]:
cd.dtypes

Sales          float64
CompPrice        int64
Income           int64
Advertising      int64
Population       int64
Price            int64
ShelveLoc       object
Age              int64
Education        int64
Urban           object
US              object
dtype: object

# DATA PROCESSING

In [9]:
cd['ShelveLoc'].unique(),cd['Urban'].unique(),cd['US'].unique()

(array(['Bad', 'Good', 'Medium'], dtype=object),
 array(['Yes', 'No'], dtype=object),
 array(['Yes', 'No'], dtype=object))

In [10]:
from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()

In [11]:
cd['ShelveLoc']=LE.fit_transform(cd['ShelveLoc'])
cd['Urban']=LE.fit_transform(cd['Urban'])
cd['US']=LE.fit_transform(cd['US'])

In [12]:
cd.dtypes

Sales          float64
CompPrice        int64
Income           int64
Advertising      int64
Population       int64
Price            int64
ShelveLoc        int32
Age              int64
Education        int64
Urban            int32
US               int32
dtype: object

In [13]:
cd

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,0,42,17,1,1
1,11.22,111,48,16,260,83,1,65,10,1,1
2,10.06,113,35,10,269,80,2,59,12,1,1
3,7.40,117,100,4,466,97,2,55,14,1,1
4,4.15,141,64,3,340,128,0,38,13,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,1,33,14,1,1
396,6.14,139,23,3,37,120,2,55,11,0,1
397,7.41,162,26,12,368,159,2,40,18,1,1
398,5.94,100,79,7,284,95,0,50,12,1,1


# CONVERTING SALES INTO CATEGORICAL VARIABLE

In [14]:
cd.insert(11,"SalesVariable",'')
cd

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,SalesVariable
0,9.50,138,73,11,276,120,0,42,17,1,1,
1,11.22,111,48,16,260,83,1,65,10,1,1,
2,10.06,113,35,10,269,80,2,59,12,1,1,
3,7.40,117,100,4,466,97,2,55,14,1,1,
4,4.15,141,64,3,340,128,0,38,13,1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,1,33,14,1,1,
396,6.14,139,23,3,37,120,2,55,11,0,1,
397,7.41,162,26,12,368,159,2,40,18,1,1,
398,5.94,100,79,7,284,95,0,50,12,1,1,


In [15]:
for i in range(0,len(cd['Sales'])):
    if cd['Sales'][i]>=11.0:
        cd['SalesVariable'][i] ='High'
    elif cd['Sales'][i]<=6.0:
        cd['SalesVariable'][i] = 'Low'
    else:
        cd['SalesVariable'][i] = 'Medium'

In [16]:
cd

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,SalesVariable
0,9.50,138,73,11,276,120,0,42,17,1,1,Medium
1,11.22,111,48,16,260,83,1,65,10,1,1,High
2,10.06,113,35,10,269,80,2,59,12,1,1,Medium
3,7.40,117,100,4,466,97,2,55,14,1,1,Medium
4,4.15,141,64,3,340,128,0,38,13,1,0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,1,33,14,1,1,High
396,6.14,139,23,3,37,120,2,55,11,0,1,Medium
397,7.41,162,26,12,368,159,2,40,18,1,1,Medium
398,5.94,100,79,7,284,95,0,50,12,1,1,Low


# MODEL BULDING

In [17]:
X=cd.iloc[:,1:-1]
Y=cd.iloc[:,-1]

In [18]:
X.head(4)

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,138,73,11,276,120,0,42,17,1,1
1,111,48,16,260,83,1,65,10,1,1
2,113,35,10,269,80,2,59,12,1,1
3,117,100,4,466,97,2,55,14,1,1


In [19]:
Y.head()

0    Medium
1      High
2    Medium
3    Medium
4       Low
Name: SalesVariable, dtype: object

# SPLIT INTO TRAIANING AND TESTING DATA

In [20]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=.26,random_state=6)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((296, 10), (104, 10), (296,), (104,))

In [21]:
x_train

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
143,122,88,7,36,159,0,28,17,1,1
27,98,118,0,19,107,2,64,17,1,0
19,129,76,16,58,121,2,69,12,1,1
112,116,99,5,298,125,1,62,12,1,1
239,123,105,0,149,118,0,62,16,1,1
...,...,...,...,...,...,...,...,...,...,...
365,154,30,0,122,162,2,57,17,0,0
106,102,33,0,217,139,2,70,18,0,0
227,113,64,10,68,101,2,57,16,1,1
201,138,83,0,139,134,2,54,18,1,0


In [22]:
x_test

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
137,128,42,0,436,118,2,80,11,1,0
138,125,103,12,371,109,2,44,10,1,1
13,115,28,11,29,86,1,53,18,1,1
95,134,25,10,237,148,2,59,13,1,1
175,115,89,0,38,122,2,25,12,1,0
...,...,...,...,...,...,...,...,...,...,...
244,130,30,0,391,100,2,26,18,1,0
375,132,46,4,206,124,2,73,11,1,0
282,150,96,0,80,154,1,61,11,1,0
397,162,26,12,368,159,2,40,18,1,1


In [23]:
y_train

143       Low
27        Low
19     Medium
112    Medium
239       Low
        ...  
365    Medium
106       Low
227    Medium
201       Low
394       Low
Name: SalesVariable, Length: 296, dtype: object

In [24]:
y_test

137    Medium
138    Medium
13     Medium
95        Low
175    Medium
        ...  
244    Medium
375    Medium
282    Medium
397    Medium
276    Medium
Name: SalesVariable, Length: 104, dtype: object

# GRID SEARCH CV TO FIND BEST HYPERPARAMETER

In [25]:
model=RandomForestClassifier()
parameters={
    "n_estimators":[130,140,150,160],
    "max_features":[4,5,6,7]
}

In [None]:



fn=['CompPrice','Income','Advertising','Population','Price','ShelveLoc','Age','Education','Urban','US']
cn=['Low','Medium','High']
fig,axes=plt.subplots(figsize=(4,4),dpi=1000)
tree.plot_tree(model,
              feature_names=fn,
              class_names=cn,
              filled=True)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
Gcd=GridSearchCV(model,parameters,cv=10)

In [None]:
Gcd.fit(x_train,y_train)

In [None]:
Gcd.best_score_

In [None]:
Gcd.best_params_

In [None]:
model2=RandomForestClassifier(max_features=7, n_estimators= 130)

In [None]:
model2.fit(x_train,y_train)

In [None]:
print(classification_report(y_test,model2.predict(x_test)))

In [None]:
model2.feature_importances_