In [65]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import cross_val_predict

In [2]:
df = pd.read_csv('G:\MangeshDataScience\Practice\WorkEx\MarketSegmentation/store.csv')

In [3]:
df.head()

Unnamed: 0,reps,product,qty,revenue,region
0,Chitra,Galaxy,2,155.1,West
1,Vijay,Jet,2,39.3,North
2,Mala,Beacon,3,74.25,West
3,Suman,Alpen,3,100.98,North
4,Rachna,Orbit,2,44.98,North


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   reps     10000 non-null  object 
 1   product  10000 non-null  object 
 2   qty      10000 non-null  int64  
 3   revenue  10000 non-null  float64
 4   region   10000 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 390.8+ KB


### Detecting outliers

In [5]:
df.describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.85, 0.90, 0.99])

Unnamed: 0,qty,revenue
count,10000.0,10000.0
mean,3.3874,90.570221
std,4.318308,127.803928
min,1.0,18.43
1%,1.0,19.35
5%,1.0,21.0
10%,1.0,22.95
25%,2.0,39.3
50%,2.0,58.425
75%,3.0,75.0


In [6]:
# There seem to be no outliers in the numerical columns

### Splitting into dependent and independent values

In [7]:
X = df.drop(['revenue'], axis = 1)
y = df['revenue']

In [8]:
X.head()

Unnamed: 0,reps,product,qty,region
0,Chitra,Galaxy,2,West
1,Vijay,Jet,2,North
2,Mala,Beacon,3,West
3,Suman,Alpen,3,North
4,Rachna,Orbit,2,North


### Converting into categorical variables

In [12]:
X.columns.unique()

Index(['reps', 'product', 'qty', 'region'], dtype='object')

In [13]:
X.reps.value_counts()

Seet       353
Rachna     318
Santosh    318
Aash       315
Vish       308
          ... 
Ram         84
Jay         84
Anusha      82
Sesh        81
Rohini      79
Name: reps, Length: 72, dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder
df_col = list(X.columns)
for i in range(len(df_col)):
    X[df_col[i]] = LabelEncoder().fit_transform(X[df_col[i]])

In [17]:
X.head()

Unnamed: 0,reps,product,qty,region
0,12,3,1,3
1,70,5,1,1
2,28,2,2,3
3,60,1,2,1
4,41,8,1,1


In [20]:
for i in X.columns:
    print(X[i].unique(), end = " ")

[12 70 28 60 41  0 10 62  7 56 48  5  4 17 49 30 52 42 45 38 37 50 46 57
 58  8 64 47 14 39 26 63  3 61 29 34 53 55 66 36  6 16 71 25 20  9 15  2
 51 22 35 54 68 67 43 11 21 44 65 19 33 40 23 69  1 13 27 31 18 59 32 24] [ 3  5  2  1  8 11  6  7  0  9 10  4] [ 1  2  0  3  8 23 14 10 17  6 24 19 20 18 15 11  7  4 13 21  9 16 12 22
  5] [3 1 0 2] 

### Splitting into train and test datasets

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [22]:
print("Shape of Training Data",X_train.shape)
print("Shape of Testing Data",X_test.shape)
print("Response Rate in Training Data",y_train.mean())
print("Response Rate in Testing Data",y_test.mean())

Shape of Training Data (7500, 4)
Shape of Testing Data (2500, 4)
Response Rate in Training Data 91.19812533333324
Response Rate in Testing Data 88.68650800000032


In [35]:
def print_score(reg, X_train, y_train, X_test, y_test, train = True):
    if train:
        pred = reg.predict(X_train)
        print('Train Result: ')
        print(f'Training Accuracy for {reg}: {r2_score(y_train, pred)*100:.2f}%')
        print('RMSE for Training Data: ',sqrt(mean_squared_error(y_train, pred)))
    elif train == False:
        pred = reg.predict(X_test)
        print('\nTest Result: ')
        print(f'Testing Accuracy for {reg} : {r2_score(y_test, pred)*100:.2f}%')
        print('RMSE for Testing Data: ',sqrt(mean_squared_error(y_test, pred)))

In [36]:
reg = LinearRegression()
reg.fit(X_train, y_train)
print_score(reg, X_train, y_train, X_test, y_test, train = True)
print_score(reg, X_train, y_train, X_test, y_test, train = False)

Train Result: 
Training Accuracy for LinearRegression(): 79.36%
RMSE for Training Data:  58.18337375227301

Test Result: 
Testing Accuracy for LinearRegression() : 85.08%
RMSE for Testing Data:  49.020134458606364


In [66]:
def print_score(reg, X_train, y_train, X_test, y_test, train = True):
    if train:
        pred = reg.predict(X_train)
        print(f'Train Result for {reg}: ')
        scores = [r2_score(y_train, pred)*100, sqrt(mean_squared_error(y_train, pred))]
        table_data = {'Scores':scores}
        column_names= ['Accuracy Score', 'Mean Squared Error']
        RegressionReport = pd.DataFrame(data= table_data, index = column_names )
        print('Regression Report: ', RegressionReport)
    elif train == False:
        pred = reg.predict(X_test)
        print(f'\nTest Result for {reg}: ')
        scores = [r2_score(y_test, pred)*100, sqrt(mean_squared_error(y_test, pred))]
        table_data = {'Scores':scores}
        column_names= ['Accuracy Score', 'Mean Squared Error']
        RegressionReport = pd.DataFrame(data= table_data, index = column_names)
        print('Regression Report: ', RegressionReport)

### 1. Linear Regression

In [67]:
print_score(reg, X_train, y_train, X_test, y_test, train = True)
print_score(reg, X_train, y_train, X_test, y_test, train = False)

Train Result for LinearRegression(): 
Regression Report:                         Scores
Accuracy Score      79.364811
Mean Squared Error  58.183374

Test Result for LinearRegression(): 
Regression Report:                         Scores
Accuracy Score      85.081763
Mean Squared Error  49.020134


### 2. Decision Tree

In [73]:
dec_tree = DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None,)
dec_tree.fit(X_train, y_train)

DecisionTreeRegressor()

In [74]:
print_score(dec_tree, X_train, y_train, X_test, y_test, train = True)
print_score(dec_tree, X_train, y_train, X_test, y_test, train = False)

Train Result for DecisionTreeRegressor(): 
Regression Report:                         Scores
Accuracy Score      99.997408
Mean Squared Error   0.652116

Test Result for DecisionTreeRegressor(): 
Regression Report:                         Scores
Accuracy Score      99.878499
Mean Squared Error   4.423913


### 3. Random Forest

In [78]:
rf = RandomForestRegressor(n_estimators=100, max_depth= 5)
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=5)

In [79]:
print_score(rf, X_train, y_train, X_test, y_test, train= True)
print_score(rf, X_train, y_train, X_test, y_test, train= False)

Train Result for RandomForestRegressor(max_depth=5): 
Regression Report:                         Scores
Accuracy Score      98.232978
Mean Squared Error  17.026118

Test Result for RandomForestRegressor(max_depth=5): 
Regression Report:                         Scores
Accuracy Score      97.909872
Mean Squared Error  18.348569


### 4. Decision Tree with hyperparameters

In [83]:
from sklearn.model_selection import GridSearchCV
params = { 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}


tree_clf = DecisionTreeRegressor(random_state=42)
tree_cv = GridSearchCV(tree_clf, params, n_jobs=-1, verbose=1, cv=3)
tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

tree_clf = DecisionTreeRegressor(**best_params)
tree_clf.fit(X_train, y_train)

Fitting 3 folds for each of 2166 candidates, totalling 6498 fits
Best paramters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 4, 'splitter': 'best'})


DecisionTreeRegressor(max_depth=15, min_samples_split=4)

In [84]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train= True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train= False)

Train Result for DecisionTreeRegressor(max_depth=15, min_samples_split=4): 
Regression Report:                         Scores
Accuracy Score      99.968155
Mean Squared Error   2.285671

Test Result for DecisionTreeRegressor(max_depth=15, min_samples_split=4): 
Regression Report:                         Scores
Accuracy Score      99.883422
Mean Squared Error   4.333356


### 5. Random Forest

In [85]:
n_estimators = [100, 500, 1000, 1500]
max_features = ['auto', 'sqrt']
max_depth = [2,3,4,5,6]
max_depth.append(None)
#min_samples_split = [2, 5, 10]
#min_samples_leaf = [1, 2, 4, 10]


params_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth} #'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf}


rf_clf = RandomForestRegressor(random_state=42)

rf_cv = GridSearchCV(rf_clf, params_grid, cv=3, verbose=2,n_jobs = -1)


rf_cv.fit(X_train, y_train)
best_params = rf_cv.best_params_
print(f"Best parameters: {best_params}")

rf_clf = RandomForestRegressor(**best_params)
rf_clf.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters: {'max_depth': None, 'max_features': 'auto', 'n_estimators': 1000}


RandomForestRegressor(n_estimators=1000)

In [86]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result for RandomForestRegressor(n_estimators=1000): 
Regression Report:                         Scores
Accuracy Score      99.963011
Mean Squared Error   2.463363

Test Result for RandomForestRegressor(n_estimators=1000): 
Regression Report:                         Scores
Accuracy Score      99.869707
Mean Squared Error   4.581160


### 6. Support Vector Machine Regressor

In [91]:
from sklearn.svm import SVR
svm_reg = SVR()
svm_reg.fit(X_train, y_train)

SVR()

In [92]:
print_score(svm_reg, X_train, y_train, X_test, y_test, train=True)
print_score(svm_reg, X_train, y_train, X_test, y_test, train=False)

Train Result for SVR(): 
Regression Report:                         Scores
Accuracy Score      46.193896
Mean Squared Error  93.952956

Test Result for SVR(): 
Regression Report:                         Scores
Accuracy Score      47.824981
Mean Squared Error  91.674186


### 7. AdaBoost Classifier

In [93]:
adb = AdaBoostRegressor()
adb.fit(X_train, y_train)

AdaBoostRegressor()

In [94]:
print_score(adb, X_train, y_train, X_test, y_test, train=True)
print_score(adb, X_train, y_train, X_test, y_test, train=False)

Train Result for AdaBoostRegressor(): 
Regression Report:                         Scores
Accuracy Score      90.921731
Mean Squared Error  38.591904

Test Result for AdaBoostRegressor(): 
Regression Report:                         Scores
Accuracy Score      91.179036
Mean Squared Error  37.694135


### 8. Gradient Boosting

In [95]:
gbx = GradientBoostingRegressor()
gbx.fit(X_train, y_train)

GradientBoostingRegressor()

In [96]:
print_score(gbx, X_train, y_train, X_test, y_test, train=True)
print_score(gbx, X_train, y_train, X_test, y_test, train=False)

Train Result for GradientBoostingRegressor(): 
Regression Report:                         Scores
Accuracy Score      99.438139
Mean Squared Error   9.600837

Test Result for GradientBoostingRegressor(): 
Regression Report:                         Scores
Accuracy Score      99.364391
Mean Squared Error  10.118374
