In [None]:
import warnings

import numpy as np
import pandas as pd

# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Statsmodels: formula API and the RMSE helper
import statsmodels.formula.api as smf
from statsmodels.tools.eval_measures import rmse

# scikit-learn: metrics, model selection / data splitting, scaling, model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

#configuration settings
%matplotlib inline'
sns.set(color_codes=True)
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
sns.set_context('talk')
params={
    'legend.fontsize':'x-large',
    'figure.figsize' :(30,10),
    'axes.labelsize' : 'x-large'
    'axes.titlesize' : 'x-large'
    'xtick.labelsize' : 'x-large'
    'ytick.labelsize' : 'x-large'
}
plt.rcParams.update(params)

In [None]:
# Load the supermarket data into a DataFrame named supermarket_data.
# TODO(review): pd.read_csv() needs a file path (calling it with no argument
# raises TypeError). The real location of the CSV is not shown in this
# notebook, so the value below is a placeholder -- point it at the actual file.
SUPERMARKET_CSV_PATH = "supermarket_data.csv"

supermarket_data = pd.read_csv(SUPERMARKET_CSV_PATH)
supermarket_data.head(5)

To illustrate Decision Tree Regression we need only two variables:

1. SHOP_HOUR
2. SPEND

In [None]:
# Narrow the frame down to the two columns used in the rest of the notebook.
selected_columns = ["SHOP_HOUR", "SPEND"]
supermarket_data = supermarket_data[selected_columns]

# Preview the first five rows of the reduced frame.
supermarket_data.head()

In [None]:
# Feature matrix: every column except the last one, kept two-dimensional.
x = supermarket_data.iloc[:, :-1].values

# Target vector: the last column only. Slicing with [:, :-1] here (as for x)
# would make the target an exact copy of the features.
y = supermarket_data.iloc[:, -1].values

In [None]:
# Divide the dataset into training and testing sets.
# x / y hold the *training* partition from here on; the names are kept because
# the modelling cells below refer to them.
x, x_test, y, y_test = train_test_split(
    supermarket_data[["SHOP_HOUR"]],  # predictor, kept as a one-column DataFrame
    supermarket_data["SPEND"],        # target (must not be the predictor column again)
    test_size=0.33,
    random_state=42,                  # fixed seed so the split is reproducible
)


In [None]:
# Baseline decision tree with hand-picked limits on its size.
dtm = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=5,
    max_leaf_nodes=10,
)

# Fit on the training partition only; the test partition must never be used
# for fitting, otherwise the test score is meaningless.
dtm.fit(x, y)

# Score the same fitted model on each partition so the two numbers are comparable.
print("R-Squared on train dataset={}".format(dtm.score(x, y)))
print("R-Squared on test dataset ={}".format(dtm.score(x_test, y_test)))

### _Hyper-parameter tuning with GridSearchCV_

In [None]:
# Hyper-parameter grid explored by the search.
# The criterion names "mse" / "mae" were deprecated in scikit-learn 1.0 and
# removed in 1.2; "squared_error" / "absolute_error" are their replacements.
param_grid = {
    "criterion": ["squared_error", "absolute_error"],
    "min_samples_split": [10, 20, 40],
    "max_depth": [2, 6, 8],
    "min_samples_leaf": [20, 40, 100],
    "max_leaf_nodes": [5, 20, 100],
}

# 5-fold cross-validated exhaustive search over the grid, fitted on the
# training partition.
grid_cv_dtm = GridSearchCV(dtm, param_grid, cv=5)

grid_cv_dtm.fit(x, y)

In [None]:
# Report the best mean cross-validated score found by the grid search and the
# parameter combination that produced it. The attribute is `best_score_`.
print("R-Squared::{}".format(grid_cv_dtm.best_score_))
print("Best Hyperparameters::\n{}".format(grid_cv_dtm.best_params_))

In [None]:
# Tabulate the grid-search results: one row per parameter combination, with
# `mean_test_score` and `param_*` columns.
cv_results_df = pd.DataFrame(grid_cv_dtm.cv_results_)

# Mean CV score against tree depth, one line per max_leaf_nodes setting.
fig, ax = plt.subplots()
sns.pointplot(
    data=cv_results_df[["mean_test_score",
                        "param_max_leaf_nodes",
                        "param_max_depth"]],
    y="mean_test_score",
    x="param_max_depth",
    hue="param_max_leaf_nodes",
    ax=ax,
)
ax.set(title="Effect of depth and Leaf Nodes on Model Performance");

In [None]:
# Check the tuned model on the training partition with 10-fold cross-validation.
best_estimator = grid_cv_dtm.best_estimator_

# Default scoring for a regressor is R-squared.
r2_scores = cross_val_score(best_estimator, x, y, cv=10)

# scikit-learn scorers follow "higher is better", so MSE is exposed as
# 'neg_mean_squared_error'; negate the result to get the actual MSE values.
mse_scores = -cross_val_score(
    best_estimator, x, y, cv=10, scoring="neg_mean_squared_error"
)

print("avg R-squared :: {:.3f}".format(np.mean(r2_scores)))
print("MSE :: {:.3f}".format(np.mean(mse_scores)))


In [None]:
# Final evaluation of the tuned model on the held-out test partition.
best_dtm_model = grid_cv_dtm.best_estimator_

y_pred = best_dtm_model.predict(x_test)

# Stored under its own name so it neither shadows nor gets confused with the
# imported `r2_score` function (formatting the function itself with {:.3f}
# raises TypeError).
test_r2 = best_dtm_model.score(x_test, y_test)

print("R-Squared:{:.3f}".format(test_r2))
print("MSE: %.2f" % metrics.mean_squared_error(y_test, y_pred))