### Import packages

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries used below are hidden as well.
warnings.filterwarnings("ignore")

### Read data

In [3]:
# Load the review dataset from CSV (path is relative to the working directory).
df = pd.read_csv('classifier_output_index.csv')
# Bare expression on the last line: renders the frame as the cell output.
df

Unnamed: 0,Review,Total_star,Value,Size,Comfort_Drive,Interior,Appearance_Exterior,Power_Performance,Safety,Mpg_Efficiency,Maintanence,Index_num
0,"I recently traded in my 2017 Honda HR-V in ""Ba...",5,1,1,0,1,1,0,0,1,0,0
1,Recently purchased a Taos in the base trim (S)...,5,0,0,1,0,0,0,0,1,0,2
2,This car feels premium and looks handsome. It...,5,1,0,1,0,1,0,0,0,0,3
3,"Bought the White SEL, love everything about it...",1,0,0,0,0,1,0,1,0,-1,4
4,"The FWD Taos S is a sporty, fun drive. It look...",5,1,1,1,1,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
18444,My wife and I recently retired and wanted a ne...,5,1,0,1,1,0,0,0,0,1,27642
18445,Just bought one yesterday in the SX trim and i...,5,0,0,0,0,0,0,0,0,0,27643
18446,It is great. Enough room for everyone plus car...,5,0,1,0,0,0,0,0,0,0,27644
18447,Other than UVO not available in my state is th...,3,0,0,0,0,0,0,0,0,0,27645


### Clean data

In [4]:
# Inspect the dtype of every column in the loaded frame.
df.dtypes

Review                 object
Total_star              int64
Value                   int64
Size                    int64
Comfort_Drive           int64
Interior                int64
Appearance_Exterior     int64
Power_Performance       int64
Safety                  int64
Mpg_Efficiency          int64
Maintanence             int64
Index_num               int64
dtype: object

In [5]:
# Count how many rows fall on each Total_star value (the column used as the response below).
df["Total_star"].value_counts()

5    10756
4     3159
1     1658
3     1639
2     1237
Name: Total_star, dtype: int64

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Total_star,Value,Size,Comfort_Drive,Interior,Appearance_Exterior,Power_Performance,Safety,Mpg_Efficiency,Maintanence,Index_num
count,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0
mean,4.090466,0.097404,0.104396,0.359965,0.180606,0.095507,0.210472,0.060057,0.167977,-0.122012,14372.884709
std,1.319388,0.308871,0.331961,0.485617,0.454107,0.298496,0.417249,0.263764,0.381889,0.40836,7978.595799
min,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7571.0
50%,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14569.0
75%,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21112.0
max,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,27646.0


### Predictors and Response

In [7]:
# Response: the Total_star column.
y = df['Total_star']

# Predictors: the nine columns between Total_star and Index_num, selected by
# name rather than by position so a change in the CSV column order cannot
# silently shift the feature set. 'Maintanence' is spelled as in the data file.
feature_columns = [
    'Value',
    'Size',
    'Comfort_Drive',
    'Interior',
    'Appearance_Exterior',
    'Power_Performance',
    'Safety',
    'Mpg_Efficiency',
    'Maintanence',
]
X = df[feature_columns]

In [8]:
# Confirm the dtype of the response series.
y.dtypes

dtype('int64')

In [9]:
# Confirm which columns ended up in the predictor frame, and their dtypes.
X.dtypes

Value                  int64
Size                   int64
Comfort_Drive          int64
Interior               int64
Appearance_Exterior    int64
Power_Performance      int64
Safety                 int64
Mpg_Efficiency         int64
Maintanence            int64
dtype: object

In [10]:
# Hold out 30% of the rows for testing (train_size=0.7); random_state=1 fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7, random_state=1)

### Regression models

##### Linear Regression

In [11]:
def linear_Regression(X,y):
    """Fit a LinearRegression model and report its fit on the data it was given.

    Parameters
    ----------
    X : predictors handed to ``LinearRegression.fit``.
    y : response values.

    Returns
    -------
    The fitted ``LinearRegression`` estimator.

    Side effects
    ------------
    Prints the model's ``score`` and the mean absolute error, both evaluated
    on the same ``X``/``y`` used for fitting (the output labels call this the
    "train set").
    """
    regressor = LinearRegression().fit(X, y)

    # Both metrics are in-sample: they are computed on the fitting data itself.
    train_r2 = regressor.score(X, y)
    train_mae = mean_absolute_error(y, regressor.predict(X))

    print("R square for train set: ", train_r2)
    print("MAE for train set: ", train_mae)
    return regressor

##### Decision Tree

In [12]:
def DecisionTree_Regressor(X,y):
    """Grid-search the depth of a DecisionTreeRegressor and report the in-sample fit.

    Parameters
    ----------
    X : predictors handed to ``GridSearchCV.fit``.
    y : response values.

    Returns
    -------
    The fitted ``GridSearchCV`` object (callers read ``best_estimator_`` and
    call ``score`` on it).

    Side effects
    ------------
    Prints the selected parameters, then the search object's ``score`` and the
    mean absolute error, both evaluated on the same ``X``/``y`` used for
    fitting.
    """
    # Candidate depths 3..29, compared with 5-fold cross-validation.
    depth_grid = {'max_depth': list(range(3, 30))}
    search = GridSearchCV(
        DecisionTreeRegressor(random_state=0),
        param_grid=depth_grid,
        cv=5,
    )
    search.fit(X, y)
    print(search.best_params_)

    train_r2 = search.score(X, y)
    train_mae = mean_absolute_error(y, search.predict(X))
    print("R square for train set: ", train_r2)
    print("MAE for train set: ", train_mae)
    return search

##### Random Forest

In [13]:
def Random_Forest(X,y):
    """Grid-search a RandomForestRegressor and report the in-sample fit.

    Parameters
    ----------
    X : predictors handed to ``GridSearchCV.fit`` (DataFrame or array).
    y : response values.

    Returns
    -------
    The fitted ``GridSearchCV`` object (callers read ``best_estimator_`` and
    call ``score`` on it).

    Side effects
    ------------
    Prints the selected parameters, then the search object's ``score`` and the
    mean absolute error, both evaluated on the same ``X``/``y`` used for
    fitting.
    """
    model = RandomForestRegressor(random_state=0)
    params = {
        'n_estimators': [75, 100, 125],
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
        # regressor it meant "use all features", which is spelled 1.0.
        'max_features': [1.0, 'sqrt'],
        'max_depth': list(range(3, 10)),
    }
    grid = GridSearchCV(model, param_grid=params, cv=5)

    # Fit on X as given (not X.values) so the feature names seen at fit time
    # match the DataFrames later passed to .score()/.predict(), and so plain
    # arrays are accepted too.
    grid.fit(X, y)
    print(grid.best_params_)

    y_pred = grid.predict(X)
    R2 = grid.score(X, y)
    MAE = mean_absolute_error(y, y_pred)
    print("R square for train set: ", R2)
    print("MAE for train set: ", MAE)
    return grid

##### GBDT

In [14]:
def GBDT(X,y):
    """Fit a GradientBoostingRegressor and report its fit on the data it was given.

    Parameters
    ----------
    X : predictors handed to ``GradientBoostingRegressor.fit``.
    y : response values.

    Returns
    -------
    The fitted ``GradientBoostingRegressor`` (only ``random_state=0`` is set;
    no hyper-parameter search is performed here).

    Side effects
    ------------
    Prints the model's ``score`` and the mean absolute error, both evaluated
    on the same ``X``/``y`` used for fitting.
    """
    booster = GradientBoostingRegressor(random_state=0)
    booster.fit(X, y)

    # In-sample metrics: computed on the fitting data itself.
    train_r2 = booster.score(X, y)
    train_mae = mean_absolute_error(y, booster.predict(X))

    print("R square for train set: ", train_r2)
    print("MAE for train set: ", train_mae)
    return booster

In [15]:
# Collects one test-set score per model, appended by the result cells below in order.
# NOTE(review): appending is not idempotent — re-running a result cell without
# re-running this one adds a duplicate entry.
R_squares = []

# Result for Linear Regression

In [16]:
# Fit on the training split; the helper prints the train-set metrics.
lr = linear_Regression(X_train,y_train)
print("-" * 50)

# Score the held-out split once and reuse it for both the report and the summary list.
lr_test_r2 = lr.score(X_test, y_test)
print('R squared for test set: ', lr_test_r2)
R_squares.append(lr_test_r2)

# Separate model fitted on the full data set, used only to report coefficients.
lr_all = LinearRegression().fit(X, y)
print("Coefficients: ", lr_all.coef_)
print("Intercepts: ", lr_all.intercept_)

R square for train set:  0.24952165959622907
MAE for train set:  0.9018986411854876
--------------------------------------------------
R squared for test set:  0.22515771105163795
Coefficients:  [0.27341201 0.16133098 0.51853215 0.16826508 0.13120592 0.16332377
 0.2511444  0.23124762 1.13040901]
Intercepts:  3.8670388203063926


# Result for Decision Tree

In [17]:
# Grid-search on the training split; the helper prints best params and train-set metrics.
dt = DecisionTree_Regressor(X_train,y_train)
print("-" * 50)

# Score the held-out split once and reuse it for both the report and the summary list.
dt_test_r2 = dt.score(X_test, y_test)
print('R squared for test set: ', dt_test_r2)

# Importances come from the estimator selected by the grid search.
dt_fi = dt.best_estimator_.feature_importances_
print('Feature Importance: ', dt_fi)

R_squares.append(dt_test_r2)

{'max_depth': 8}
R square for train set:  0.30232997271318407
MAE for train set:  0.8381510884764773
--------------------------------------------------
R squared for test set:  0.2501921678411497
Feature Importance:  [0.02557482 0.01789653 0.1708768  0.0316319  0.00908722 0.02716872
 0.00706336 0.0308719  0.67982873]


# Result for Random Forest

In [18]:
# Grid-search on the training split; the helper prints best params and train-set metrics.
rf = Random_Forest(X_train,y_train)
print("-" * 50)

# Score the held-out split once and reuse it for both the report and the summary list.
rf_test_r2 = rf.score(X_test, y_test)
print('R squared for test set: ', rf_test_r2)

# Importances come from the estimator selected by the grid search.
rf_fi = rf.best_estimator_.feature_importances_
print('Feature Importance: ', rf_fi)

R_squares.append(rf_test_r2)

{'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 125}
R square for train set:  0.3052328734950568
MAE for train set:  0.8468293128083894
--------------------------------------------------
R squared for test set:  0.26144002065731364
Feature Importance:  [0.03067045 0.03201138 0.21171438 0.04059471 0.01534402 0.04076603
 0.01142127 0.0340631  0.58341465]


# Result for Gradient Boosting

In [19]:
# Fit on the training split; the helper prints the train-set metrics.
gbdt = GBDT(X_train,y_train)
print("-" * 50)

# Score the held-out split once and reuse it for both the report and the summary list.
gbdt_test_r2 = gbdt.score(X_test, y_test)
print('R squared for test set: ', gbdt_test_r2)

gbdt_fi = gbdt.feature_importances_
print('Feature Importance: ', gbdt_fi)

R_squares.append(gbdt_test_r2)

R square for train set:  0.3018961477143771
MAE for train set:  0.845488517162971
--------------------------------------------------
R squared for test set:  0.2650119760990052
Feature Importance:  [0.02453717 0.01871398 0.17536796 0.02805954 0.0116054  0.02652404
 0.0086159  0.02834639 0.6782296 ]


## R squared chart

In [20]:
# One row per model, in the order the result cells appended to R_squares.
model_labels = ['Linear Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting']
df_R = pd.DataFrame({'R^2': R_squares}, index=model_labels)

In [67]:
# Display the test-set R^2 comparison table.
df_R

Unnamed: 0,R^2
Linear Regression,0.225158
Decision Tree,0.250192
Random Forest,0.26144
Gradient Boosting,0.265012


# Feature importance for Decision Tree


In [None]:
# Decision Tree importances scaled by 100, one row per predictor, largest first.
df_importance = (
    pd.DataFrame({'importance': 100 * dt_fi}, index=X.columns)
    .sort_values('importance', ascending=False)
)
df_importance

In [None]:
# Bar chart of the Decision Tree importances held in df_importance (values are
# importances scaled by 100, already sorted largest first).
# NOTE(review): df_importance is reused by the other models' sections, so this
# cell must run right after the Decision Tree table cell for the title to match.
plt.bar(df_importance.index, df_importance['importance'], width=0.5)
plt.title('Feature importance: Decision Tree')
plt.xlabel('Feature')
plt.ylabel('Importance (%)')
plt.xticks(rotation=90)
plt.yticks(rotation=90)
# Render explicitly so the cell output is the figure, not the yticks() return value.
plt.show()

# Feature importance for Random Forest

In [None]:
# Random Forest importances scaled by 100, one row per predictor, largest first.
df_importance = (
    pd.DataFrame({'importance': 100 * rf_fi}, index=X.columns)
    .sort_values('importance', ascending=False)
)
df_importance

In [None]:
# Bar chart of the Random Forest importances held in df_importance (values are
# importances scaled by 100, already sorted largest first).
# NOTE(review): df_importance is reused by the other models' sections, so this
# cell must run right after the Random Forest table cell for the title to match.
plt.bar(df_importance.index, df_importance['importance'], width=0.5)
plt.title('Feature importance: Random Forest')
plt.xlabel('Feature')
plt.ylabel('Importance (%)')
plt.xticks(rotation=90)
plt.yticks(rotation=90)
# Render explicitly so the cell output is the figure, not the yticks() return value.
plt.show()

# Feature Importance for Gradient Boosting

In [None]:
# Gradient Boosting importances scaled by 100, one row per predictor, largest first.
df_importance = (
    pd.DataFrame({'importance': 100 * gbdt_fi}, index=X.columns)
    .sort_values('importance', ascending=False)
)
df_importance

In [None]:
# Bar chart of the Gradient Boosting importances held in df_importance (values
# are importances scaled by 100, already sorted largest first).
# NOTE(review): df_importance is reused by the other models' sections, so this
# cell must run right after the Gradient Boosting table cell for the title to match.
plt.bar(df_importance.index, df_importance['importance'], width=0.5)
plt.title('Feature importance: Gradient Boosting')
plt.xlabel('Feature')
plt.ylabel('Importance (%)')
plt.xticks(rotation=90)
plt.yticks(rotation=90)
# Render explicitly so the cell output is the figure, not the yticks() return value.
plt.show()

### Validation using the ratings on Edmunds

In [None]:
# Load the second dataset used for the validation section (path is relative to the working directory).
df1 = pd.read_csv("data_cleaned.csv")

In [None]:
# Predictors: columns at positions 8 through 14 of df1 (seven columns), selected by position.
# NOTE(review): the column layout of data_cleaned.csv is not shown in this notebook —
# confirm that these positions are the intended rating columns.
# This rebinds the X and y used by the earlier sections (e.g. the X.columns lookups
# in the feature-importance cells), so those cells give different results if re-run afterwards.
X = df1.iloc[:,8:15]
y = df1["Total_star"]

In [None]:
# Count how many rows fall on each Total_star value in the validation data.
y.value_counts()

In [None]:
# Summary statistics for the validation predictors.
X.describe()

In [None]:
# Run the max_depth grid search on the validation data; the helper prints the best
# parameters plus R^2 and MAE evaluated on this same data (no hold-out split here).
# The returned GridSearchCV object is not stored, only shown as the cell output.
DecisionTree_Regressor(X,y)