### Q. A cloth manufacturing company wants to know which segments or attributes lead to high sales.
- Approach - Build a random forest with Sales as the target variable (first converted into a categorical variable) and all other variables as the independent variables in the analysis.

In [1]:
# import the required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
# Read the data: load the company sales dataset from a CSV file in the working directory
data = pd.read_csv('Company_Data.csv')
# Preview the first rows to check the column names and value formats
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [3]:
# Underlying information of the data: row count, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
Sales          400 non-null float64
CompPrice      400 non-null int64
Income         400 non-null int64
Advertising    400 non-null int64
Population     400 non-null int64
Price          400 non-null int64
ShelveLoc      400 non-null object
Age            400 non-null int64
Education      400 non-null int64
Urban          400 non-null object
US             400 non-null object
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [4]:
# Label-encode the text-valued columns so the model receives numeric inputs
cols = ['ShelveLoc', 'Urban', 'US']

# Each column is encoded independently and overwritten with its integer codes.
# As the preview of the result shows, ShelveLoc becomes Bad=0, Good=1, Medium=2
# and Yes/No become 1/0, so the ShelveLoc codes are not in quality order.
data[cols] = data[cols].apply(
    lambda column: LabelEncoder().fit_transform(column)
)

data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,0,42,17,1,1
1,11.22,111,48,16,260,83,1,65,10,1,1
2,10.06,113,35,10,269,80,2,59,12,1,1
3,7.4,117,100,4,466,97,2,55,14,1,1
4,4.15,141,64,3,340,128,0,38,13,1,0


In [5]:
# Statistics of the data: count, mean, std, min, quartiles and max for each column
data.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,1.3075,53.3225,13.9,0.705,0.645
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,0.833475,16.200297,2.620528,0.456614,0.479113
min,0.0,77.0,21.0,0.0,10.0,24.0,0.0,25.0,10.0,0.0,0.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,1.0,39.75,12.0,0.0,0.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,2.0,54.5,14.0,1.0,1.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,2.0,66.0,16.0,1.0,1.0
max,16.27,175.0,120.0,29.0,509.0,191.0,2.0,80.0,18.0,1.0,1.0


In [6]:
# Converting the sales column into a categorical column by splitting it at the
# 50th percentile (median) of Sales: values at or above the median are 'High',
# values below it are 'Low'.
# The threshold is computed from the data instead of being hard-coded; for this
# dataset it equals 7.49, the 50% value reported by data.describe().
sales_median = data['Sales'].median()

# Vectorised comparison over the whole column instead of a row-by-row Python loop
data['Sales_cat'] = np.where(data['Sales'] >= sales_median, 'High', 'Low')
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,Sales_cat
0,9.5,138,73,11,276,120,0,42,17,1,1,High
1,11.22,111,48,16,260,83,1,65,10,1,1,High
2,10.06,113,35,10,269,80,2,59,12,1,1,High
3,7.4,117,100,4,466,97,2,55,14,1,1,Low
4,4.15,141,64,3,340,128,0,38,13,1,0,Low


In [7]:
# Build the input matrix X: every column except the target, which is removed in
# both its numeric form (Sales) and its categorical form (Sales_cat)
X = data.drop(columns=['Sales', 'Sales_cat'])
X.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,138,73,11,276,120,0,42,17,1,1
1,111,48,16,260,83,1,65,10,1,1
2,113,35,10,269,80,2,59,12,1,1
3,117,100,4,466,97,2,55,14,1,1
4,141,64,3,340,128,0,38,13,1,0


In [8]:
# Dividing the data into output y: the High/Low sales label is the target
y = data['Sales_cat']
y.head()

0    High
1    High
2    High
3     Low
4     Low
Name: Sales_cat, dtype: object

In [9]:
# Unique values in the target (only 'High' and 'Low' are expected)
y.unique()

array(['High', 'Low'], dtype=object)

In [10]:
# Count of each target class, to check how balanced the two classes are
y.value_counts()

High    201
Low     199
Name: Sales_cat, dtype: int64

In [11]:
# Hold out 20% of the rows as a test set and train on the rest;
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)

## Building a Random Forest Classifier model

In [12]:
# Random Forest classifier: 100 trees, entropy as the split criterion,
# max_features=3, and a fixed random_state so the fit is repeatable
model = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features=3,
    random_state=1,
)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=3, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [13]:
# Features importance: one score per input column of the fitted forest
model.feature_importances_

array([0.113096  , 0.1042161 , 0.10292617, 0.09448795, 0.24111572,
       0.11451506, 0.1420503 , 0.05784478, 0.01124836, 0.01849956])

In [14]:
# Features score and feature names into a table, most important first.
# The names are taken from the training frame itself rather than a hand-typed
# list, so the labels cannot drift out of sync with the columns the model saw.
fn = list(X_train.columns)
F_RF = pd.Series(model.feature_importances_, index=fn).sort_values(ascending=False)
F_RF

Price          0.241116
Age            0.142050
ShelveLoc      0.114515
CompPrice      0.113096
Income         0.104216
Advertising    0.102926
Population     0.094488
Education      0.057845
US             0.018500
Urban          0.011248
dtype: float64

### Predictions:

In [15]:
# Predictions based on the trained model for the held-out test rows,
# followed by a count of how many rows were predicted as each class
pred = model.predict(X_test)
pd.Series(pred).value_counts()

High    41
Low     39
dtype: int64

In [16]:
# Confusion matrix (a cross-tabulation, not cross-validation):
# rows are the actual classes in y_test, columns are the predicted classes in pred
pd.crosstab(y_test,pred)

col_0,High,Low
Sales_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
High,32,9
Low,9,30


In [17]:
# Accuracy: share of test rows whose predicted class equals the actual class
(pred == y_test).mean()

0.775

### Selecting the best features of the created model using the feature selection algorithm SelectFromModel

In [18]:
# Creating a feature selector around the Random Forest and fitting it on the training data
# NOTE(review): no threshold is passed, so the library default decides which features are kept — confirm that is intended
model_sel = SelectFromModel(model)
model_sel.fit(X_train,y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True,
                                                 class_weight=None,
                                                 criterion='entropy',
                                                 max_depth=None, max_features=3,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=False,
                                                 random_state=1, verbose=0,
                                                 warm_st

In [19]:
# Useful features: boolean mask with one entry per input column (True = kept by the selector)
model_sel.get_support()

array([ True,  True,  True, False,  True,  True,  True, False, False,
       False])

In [20]:
# Names of the columns kept by the selector, and how many of them there are
selected_feat = X_train.columns[model_sel.get_support()]
len(selected_feat)

6

In [21]:
# Print the names of the features kept by the feature selection model
print(selected_feat)

Index(['CompPrice', 'Income', 'Advertising', 'Price', 'ShelveLoc', 'Age'], dtype='object')


## Conclusion:

### The features most important for predicting high sales, according to the Random Forest and the feature selection algorithm, are as follows
- CompPrice
- Income
- Advertising
- Price
- ShelveLoc
- Age