In [1]:
#Imported the default modules
import matplotlib.pyplot as py
%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the "Online Retail.xlsx" workbook into a DataFrame; the path is relative
# to the notebook's working directory.
onlineRetail = pd.read_excel("Online Retail.xlsx")
onlineRetail.head(10) # preview: only the first 10 records are displayed

In [None]:
onlineRetail.shape # (number of observations, number of columns) of the loaded data

In [None]:
onlineRetail.columns # column names of the loaded data

In [None]:
# Remove duplicated rows. The result is bound to a new name, so the loaded
# frame `onlineRetail` stays available unchanged.
onlineRetailNoDup = onlineRetail.drop_duplicates()
onlineRetailNoDup # display the de-duplicated frame

In [None]:
onlineRetailNoDup.shape # de-duplication reduced the number of records from 541909 to 536641

In [None]:
# Drop every row that contains at least one missing value (dropna's default
# behaviour). The column-wise variant, axis='columns', is NOT used here.
onlineRetailNAs= onlineRetailNoDup.dropna()

In [None]:
onlineRetailNAs.shape # dropping rows with missing values reduced the records from 536641 to 401604

In [None]:
onlineRetailNAs.describe() # summary statistics of the cleaned frame

In [None]:
# Pairwise correlation of the numeric columns only. They are selected
# explicitly because, from pandas 2.0 on, DataFrame.corr() no longer skips
# non-numeric columns silently and raises if any are present.
onlineRetailNAs.select_dtypes(include="number").corr()

In [None]:
onlineRetailNAs.dtypes # data type of each column in the cleaned frame

In [None]:
# Correlation matrix of the numeric columns. They are selected explicitly so
# that non-numeric columns cannot make DataFrame.corr() raise on pandas >= 2.0,
# where they are no longer dropped automatically.
relation = onlineRetailNAs.select_dtypes(include="number").corr()
# Row labels of the matrix, i.e. the names of the columns that were correlated.
relation_index = relation.index

In [None]:
# Heatmap of the correlations between the columns named in relation_index,
# with every coefficient written into its cell.
correlated_columns = onlineRetailNAs[relation_index]
sns.heatmap(correlated_columns.corr(), annot=True)

In [None]:
# Separate the predictors from the target.
X_vars = ["UnitPrice", "CustomerID"]  # columns used as model input
y_vars = ["Quantity"]                 # column used as model output

# Both selections use a list of labels, so X and y are DataFrames
# (y is a one-column frame, not a Series).
X = onlineRetailNAs.loc[:, X_vars]
y = onlineRetailNAs.loc[:, y_vars]
print(X.shape)

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
 #Checked the feature importance using ETR
# Seed the forest so the importances are reproducible from run to run; the
# train/test split is seeded with the same value.
reg = ExtraTreesRegressor(random_state=42)
# The target is passed as a 1-D Series (single column picked by name).
reg.fit(X_train, y_train["Quantity"])
# One importance value per input column, in the column order of X_train.
reg.feature_importances_

In [None]:
# Label each importance with its feature name, keep at most the 5 largest,
# and draw them as horizontal bars.
feat_importances = pd.Series(data=reg.feature_importances_, index=X_train.columns)
top_importances = feat_importances.nlargest(5)
top_importances.plot.barh()
py.show()

In [None]:
# First experiment: a plain linear regression fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [None]:
lr_model.coef_, lr_model.intercept_ # fitted coefficients and intercept of the linear model

In [None]:
# Prediction: apply the fitted linear model to the held-out test inputs
y_pred_test = lr_model.predict(X_test)
y_pred_test.shape # shape of the prediction array

In [None]:
#Performance evaluation:
# Residuals of the linear model on the test split. y_test is a one-column
# DataFrame, so `error` is a DataFrame as well.
error = y_test - y_pred_test

#Mean Square Error
# Reduce over the raw values so the metric is a plain float. Calling np.mean on
# the DataFrame itself returns a per-column Series on older pandas versions,
# which would print with index/dtype clutter instead of a single number.
MSE = float(np.mean(np.asarray(error) ** 2))

#Root Mean Square Error
RMSE = MSE ** 0.5
print("Mean Square Error: ",MSE, "\n\nRoot Mean Square Error: ",RMSE)

In [None]:
lr_model.score(X_test,y_test) # R^2 of the linear model on the test split

In [None]:
#The R^2 value is calculated using the .score method. A higher value indicates better
#model performance, but this model has a very low R^2 value. So the next step is to
#fine-tune the models' hyperparameters using Grid Search.

In [None]:
# Ridge regression tuned with GridSearchCV: candidate values for the
# regularisation strength alpha.
ridge_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}

ridge = Ridge()
# 5-fold cross-validation is used here; 4, 5, 8 and 9 folds were experimented with.
ridge_cv_reg = GridSearchCV(estimator=ridge, param_grid=ridge_grid, cv=5)
ridge_cv_reg.fit(X_train, y_train)

In [None]:
# Summary of the Ridge grid search.
ridge_best = ridge_cv_reg.best_estimator_

# Mean cross-validated score of the best alpha
print('Mean validaton score is:', ridge_cv_reg.best_score_)

# R2 of the best Ridge model on the test set
print('Unbiased Performance of best model:', ridge_cv_reg.score(X_test, y_test))

# Best hyperparameter found by the search
print('Optimal Hyperparam:', ridge_cv_reg.best_params_)

# Parameter estimates (coefficients, intercept) of the best model
ridge_best.coef_, ridge_best.intercept_

In [None]:
#LASSO Regression
# The Lasso search space gets its own name instead of being stored in
# `ridge_grid`, so the Ridge grid is not overwritten by an unrelated set of
# candidates and each grid's name says which model it belongs to.
lasso_grid = {'alpha':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10,100,1000,10000]}
lasso = Lasso()
# 5-fold cross-validated search over the candidate alpha values.
lasso_cv_reg = GridSearchCV(lasso, lasso_grid, cv=5)
lasso_cv_reg.fit(X_train, y_train)

In [None]:
# Summary of the Lasso grid search.
lasso_best = lasso_cv_reg.best_estimator_

# Mean cross-validated score of the best alpha
print('Mean validaton score is:', lasso_cv_reg.best_score_)

# R2 of the best Lasso model on the test set
print('Unbiased Performance of best model:', lasso_cv_reg.score(X_test, y_test))

# Best hyperparameter found by the search
print('Optimal Hyperparam:', lasso_cv_reg.best_params_)

# Parameter estimates (coefficients, intercept) of the best model
lasso_best.coef_, lasso_best.intercept_

In [None]:
lasso_cv_reg.predict(X_test) # predictions of the best Lasso model for the test inputs

In [None]:
# The regularised linear models did not improve the score, so a
# DecisionTreeRegressor is explored next.
# The tree is seeded so the fitted model, and therefore its score, is
# reproducible from run to run; the train/test split uses the same seed.
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

In [None]:
# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label names the metric correctly.
print("R^2 on test set is:", dt_reg.score(X_test, y_test))

In [None]:
# Predictions of the decision tree for the test inputs
pred=dt_reg.predict(X_test)
pred # display the predicted values

In [None]:
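#CONCLUSION - DecisionTreeRegressor has performed better than the other models, i.e. the
#Lasso & Ridge models. Its test score (R^2, from .score) has improved to 0.00533 (rounded),
#so it is the best of the models tried here for predicting future data. Note that an R^2
#this close to zero still means the model explains very little of the variation in Quantity.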