#   Question 1 

#    Random Forest Company Data 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
company_data = pd.read_csv("Company_Data.csv")
company_data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [3]:
company_data.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


###  Preprocessing the dataset

In [4]:
company_data['Sales'] = pd.cut(company_data.Sales, bins=(-1.0,7.49,16.27),
                               labels=['Low','High'])

In [5]:
company_data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,High,138,73,11,276,120,Bad,42,17,Yes,Yes
1,High,111,48,16,260,83,Good,65,10,Yes,Yes
2,High,113,35,10,269,80,Medium,59,12,Yes,Yes
3,Low,117,100,4,466,97,Medium,55,14,Yes,Yes
4,Low,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,High,138,108,17,203,128,Good,33,14,Yes,Yes
396,Low,139,23,3,37,120,Medium,55,11,No,Yes
397,Low,162,26,12,368,159,Medium,40,18,Yes,Yes
398,Low,100,79,7,284,95,Bad,50,12,Yes,Yes


In [6]:
company_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Sales        400 non-null    category
 1   CompPrice    400 non-null    int64   
 2   Income       400 non-null    int64   
 3   Advertising  400 non-null    int64   
 4   Population   400 non-null    int64   
 5   Price        400 non-null    int64   
 6   ShelveLoc    400 non-null    object  
 7   Age          400 non-null    int64   
 8   Education    400 non-null    int64   
 9   Urban        400 non-null    object  
 10  US           400 non-null    object  
dtypes: category(1), int64(7), object(3)
memory usage: 31.9+ KB


In [7]:
label_encoder = preprocessing.LabelEncoder()
company_data["Sales"] = label_encoder.fit_transform(company_data["Sales"])
company_data["ShelveLoc"] = label_encoder.fit_transform(company_data["ShelveLoc"])
company_data["Urban"] = label_encoder.fit_transform(company_data["Urban"])
company_data["US"] = label_encoder.fit_transform(company_data["US"])

In [8]:
company_data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,0,138,73,11,276,120,0,42,17,1,1
1,0,111,48,16,260,83,1,65,10,1,1
2,0,113,35,10,269,80,2,59,12,1,1
3,1,117,100,4,466,97,2,55,14,1,1
4,1,141,64,3,340,128,0,38,13,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,0,138,108,17,203,128,1,33,14,1,1
396,1,139,23,3,37,120,2,55,11,0,1
397,1,162,26,12,368,159,2,40,18,1,1
398,1,100,79,7,284,95,0,50,12,1,1


###  Splitting the dataset

In [9]:
x = company_data.iloc[:,1:11]
y = company_data.iloc[:,0]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=0)

###   Standardizing the dataset

In [11]:
x_train = StandardScaler().fit_transform(X_train)
x_test = StandardScaler().fit_transform(X_test)

###   Building Random Forest 

In [12]:
model = RandomForestClassifier(n_estimators=200, criterion='entropy',random_state=0)
model.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=0)

###  Predicting the model 

In [13]:
y_pred = model.predict(x_test)
y_pred

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 1])

###  Evaluating the model 

In [14]:
confusion_matrix(y_test, y_pred)

array([[50, 15],
       [ 6, 49]], dtype=int64)

In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.825


In [16]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.77      0.83        65
           1       0.77      0.89      0.82        55

    accuracy                           0.82       120
   macro avg       0.83      0.83      0.82       120
weighted avg       0.83      0.82      0.83       120



###  Feature importance 

In [17]:
feature_importances_df = pd.DataFrame(
    {"feature": list(x.columns), "importance": model.feature_importances_}
).sort_values("importance", ascending=False)
feature_importances_df

Unnamed: 0,feature,importance
4,Price,0.252591
0,CompPrice,0.137196
6,Age,0.126934
5,ShelveLoc,0.108095
1,Income,0.096436
2,Advertising,0.095182
3,Population,0.094235
7,Education,0.054923
8,Urban,0.018275
9,US,0.016135


### Using random forest the accuracy of model is improved to 82%