# <center> Linear Regression, Decision Tree, Random Forest, Missing Data, KNN Imputer

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
from plotnine import *
%matplotlib inline
import seaborn as sn
from sklearn.model_selection import train_test_split
import statsmodels.api as sm 
from sklearn import metrics 
import math
from sklearn.tree import DecisionTreeRegressor # since the dependent variable is continous else we use "DecisionTreeClassifier"
from sklearn.ensemble import RandomForestRegressor  # import the class RandomForestRegressor

# Read Data

In [None]:
# Load the complete e-commerce customer table.
# The path is a raw string: in a normal literal "\d" and "\E" are not valid
# escape sequences and raise a warning on recent Python versions.
commerce = pd.read_csv(r"D:\data\Ecommerce.csv")
commerce.head()

# Explore Data

In [None]:
# (number of rows, number of columns) of the loaded table.
commerce.shape

In [None]:
# Preview the first rows (same call as at the end of the loading cell).
commerce.head()

In [None]:
# Column labels of the loaded table.
commerce.columns

# Delete First 3 Variables

In [None]:
# Keep everything from the 4th column onward (positional slice).
# NOTE: the result is assigned back to `commerce`, so running this cell a
# second time removes three more columns.
commerce = commerce.iloc[:,3:]
commerce.head()

In [None]:
# Summary statistics of the remaining columns.
commerce.describe()

# Identifying Missing Values

In [None]:
# Number of missing values in each column.
commerce.isnull().sum()

In [None]:
# Data type of each column.
commerce.dtypes

# Histogram of Yearly Amount Spent

In [None]:
# Distribution of the target variable, drawn with 50 semi-transparent blue bins.
(
    ggplot(commerce, aes(x='Yearly Amount Spent'))
    + geom_histogram(bins=50, alpha=0.5, fill="blue")
    + theme_minimal()
)

# Correlation Matrix 

In [None]:
# Grid of pairwise plots over the columns of `commerce`.
sn.pairplot(commerce)

In [None]:
# Pairwise correlation between the columns; displayed as the cell output.
CorrMatrix = commerce.corr()
CorrMatrix

# Splitting Data 

In [None]:
# Target: yearly spend per customer.
Y = commerce['Yearly Amount Spent']

# Predictors: the four behavioural measures, plus the intercept column that
# statsmodels' OLS needs to be given explicitly.
X = sm.add_constant(
    commerce[['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
)

In [None]:
# 70 % training / 30 % test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Number of rows in the training split.
len(X_train)

# Linear Regression 

In [None]:
# Ordinary least squares on the training split; the summary table is the cell output.
lm_model = sm.OLS(endog=Y_train, exog=X_train).fit()
lm_model.summary()

# Linear Regression Evaluation 

In [None]:
# Held-out performance of the OLS fit: RMSE (in the units of Y_test) and R^2.
Y_test_lm = lm_model.predict(X_test)
MSE = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_test_lm)
lm_RMSE = math.sqrt(MSE)
print(
    "The Root Mean Squared Error of Linear Regression is :",
    lm_RMSE,
)
print(
    "The Coefficient of Determination : ",
    metrics.r2_score(y_true=Y_test, y_pred=Y_test_lm),
)


# Delete Column Constant From X

In [None]:
# The tree-based models do not use the intercept column that sm.add_constant
# added (it is named "const"). Dropping it by name instead of by position keeps
# this cell safe to re-run: a positional slice would strip a real feature the
# second time. errors="ignore" makes the drop a no-op when the column is gone.
X_train = X_train.drop(columns="const", errors="ignore")
X_test = X_test.drop(columns="const", errors="ignore")

# Only the last expression of a cell is rendered, so a single preview is kept.
X_test.head()

# Decision Tree Model 

In [None]:
# Regression tree with default hyper-parameters, seeded so the fit is repeatable.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X=X_train, y=Y_train)

# Decision Tree Evaluation 

In [None]:
# Held-out performance of the decision tree: RMSE (in the units of Y_test) and R^2.
Y_test_tree = tree_model.predict(X_test)
tree_MSE = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_test_tree)
tree_RMSE = math.sqrt(tree_MSE)
print(
    "The Root Mean Squared Error of Decision Tree is :",
    tree_RMSE,
)
print(
    "The Coefficient of Determination : ",
    metrics.r2_score(y_true=Y_test, y_pred=Y_test_tree),
)

In [None]:
# Pair each observed test target with the decision tree's prediction for it.
data = dict(Y_test=Y_test, Y_test_tree=Y_test_tree)
tree_data = pd.DataFrame(data)

In [None]:
# Predicted vs. observed yearly spend for the decision tree.
# The aesthetics are mapped by column name so the points are read from
# `tree_data`, the frame handed to ggplot, rather than from outside objects.
T = ggplot(tree_data , aes(x = 'Y_test' , y = 'Y_test_tree')) + geom_point(color = "red" , size = 0.3)
T + labs(x = 'Yearly Amount Spent' , y = 'Yearly Amount Spent predicted' , title = 'Decision Tree Regressor') + theme_minimal()

# Random Forest Model 

In [None]:
# Forest of 1000 regression trees, seeded so the fit is repeatable.
rf_model = RandomForestRegressor(random_state=42, n_estimators=1000)
rf_model.fit(X=X_train, y=Y_train)

# Random Forest Evaluation 

In [None]:
# Held-out performance of the random forest: RMSE (in the units of Y_test) and R^2.
Y_test_rf = rf_model.predict(X_test)
rf_MSE = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_test_rf)
rf_RMSE = math.sqrt(rf_MSE)
print(
    "The Root Mean Squared Error of Rnadom Forest is :",
    rf_RMSE,
)
print(
    "The Coefficient of Determination : ",
    metrics.r2_score(y_true=Y_test, y_pred=Y_test_rf),
)

In [None]:
# Pair each observed test target with the random forest regressor's prediction for it.
data = dict(Y_test=Y_test, Y_test_rf=Y_test_rf)
rf_data = pd.DataFrame(data)

In [None]:
# Predicted vs. observed yearly spend for the random forest.
# The aesthetics are mapped by column name so the points are read from
# `rf_data`; the title names the model actually fitted (a regressor).
p = ggplot(rf_data , aes(x = 'Y_test' , y = 'Y_test_rf')) + geom_point(color = "red" , size = 0.3)
p + labs(x = 'Yearly Amount Spent' , y = 'Yearly Amount Spent predicted' , title = 'Random Forest Regressor') + theme_minimal()

# Read Ecommerce Data With Missing Values 

In [None]:
# Load the variant of the data set that contains missing values.
# Raw string: "\d" and "\E" are not valid escape sequences in a normal literal.
comm_nan = pd.read_csv(r"D:\data\Ecommerce_with_NaN.csv")

In [None]:
# Preview the first rows of the table with missing values.
comm_nan.head()

# Identifying Missing Values

In [None]:
# Per-column flag:
#   True  = the column contains at least one missing value
#   False = the column has no missing values
comm_nan.isna().any()

In [None]:
# Number of missing values in each column.
comm_nan.isnull().sum()

# Remove Some Variables 

In [None]:
# Discard the three identifier/text columns; only the remaining columns are kept.
comm_nan = comm_nan.drop(columns=['Email', 'Address', 'Avatar'])

# KNN Imputer

https://www.analyticsvidhya.com/blog/2020/07/knnimputer-a-robust-way-to-impute-missing-values-using-scikit-learn/

The KNN Imputer is a distance-based imputation method, so it requires us to normalize our data first. Otherwise, the different scales of the variables would distort the distances and lead the KNN Imputer to generate biased replacements for the missing values. For simplicity, we will use Scikit-Learn’s MinMaxScaler, which scales each variable to values between 0 and 1.

In [None]:
from sklearn.preprocessing import MinMaxScaler  # rescales each column to the [0, 1] range

# Fit the scaler on every column of the frame (the target included) and rebuild
# a DataFrame that carries the original column labels.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(comm_nan)
comm_nan = pd.DataFrame(scaled_values, columns=comm_nan.columns)

In [None]:
# Preview the rescaled table.
comm_nan.head()

Now that our dataset has been normalized, we can move on to the KNN imputation. Let’s import the imputer from Scikit-Learn’s impute module and apply it to our data. In this example, we set the parameter ‘n_neighbors’ to 5, so each missing value will be replaced by the mean value of its 5 nearest neighbors, measured by Euclidean distance.

In [None]:
from sklearn.impute import KNNImputer

# Fill every missing cell with the mean of that column over the 5 nearest rows;
# distances are computed on the min-max scaled values produced by `scaler`.
imputer = KNNImputer(n_neighbors = 5)
comm_nan_imputed = imputer.fit_transform(comm_nan)

# Undo the scaling so every column, including the target 'Yearly Amount Spent',
# is back in its original units. Otherwise the RMSE of the models fitted on this
# frame would be on a 0-1 scale and could not be compared with the other sections.
# NOTE: run this cell once, directly after the scaling cell; re-running it alone
# would apply the inverse transform to data that is already unscaled.
comm_nan = pd.DataFrame(scaler.inverse_transform(comm_nan_imputed) , columns = comm_nan.columns )

In [None]:
# Check again, per column, whether any missing value remains.
comm_nan.isna().any()

In [None]:
# Per-column count of missing values; every entry is expected to be 0 after imputation.
comm_nan.isnull().sum()

In [None]:
# Preview the imputed table.
comm_nan.head()

# Splitting Data 

In [None]:
# Target taken from the KNN-imputed frame.
Y = comm_nan['Yearly Amount Spent']

# Same four predictors as before, plus the explicit intercept column for OLS.
X = sm.add_constant(
    comm_nan[['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
)

In [None]:
# 70 % training / 30 % test split of the imputed data, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# (rows, columns) of the training predictors.
X_train.shape

# Linear Regression With KNN Imputation

In [None]:
# Ordinary least squares on the KNN-imputed training split; the summary is the cell output.
lm_nan = sm.OLS(endog=Y_train, exog=X_train).fit()
lm_nan.summary()

# Linear Regression With KNN Imputation Evaluation 

In [None]:
# Held-out performance of the OLS fit on imputed data: RMSE (in the units of Y_test) and R^2.
Y_test_lm_nan = lm_nan.predict(X_test)
lm_nan_MSE = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_test_lm_nan)
lm_nan_RMSE = math.sqrt(lm_nan_MSE)
print(
    "The Root Mean Squared Error of Linear Regression with KNN imputyation is :",
    lm_nan_RMSE,
)
print(
    "The Coefficient of Determination with KNN imputyation is: ",
    metrics.r2_score(y_true=Y_test, y_pred=Y_test_lm_nan),
)

# Remove Constant Column From X_train and X_test

In [None]:
# Remove the intercept column added by sm.add_constant (named "const") before
# fitting the tree-based models. Dropping by name keeps the cell safe to re-run;
# a positional slice would strip a real feature on the second execution.
X_train = X_train.drop(columns="const", errors="ignore")
X_test = X_test.drop(columns="const", errors="ignore")

# Decision Tree With KNN Imputation

In [None]:
# Regression tree with default hyper-parameters on the KNN-imputed training split.
# random_state is pinned to the seed used by the other tree and forest models in
# this notebook so that the reported scores are reproducible from run to run.
tree_nan = DecisionTreeRegressor(random_state = 42)
tree_nan.fit(X_train , Y_train)

# Decision Tree With KNN Imputation Evaluation 

In [None]:
# Held-out performance of the decision tree on imputed data: RMSE (in the units of Y_test) and R^2.
Y_test_tree_nan = tree_nan.predict(X_test)
tree_nan_MSE = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_test_tree_nan)
tree_nan_RMSE = math.sqrt(tree_nan_MSE)
print(
    "The Root Mean Squared Error of Decision Tree with KNN imputation is :",
    tree_nan_RMSE,
)
print(
    "The Coefficient of Determination of Decision Tree with KNN imputation is:",
    metrics.r2_score(y_true=Y_test, y_pred=Y_test_tree_nan),
)

# Random Forest With KNN Imputation 

In [None]:
# Forest of 1000 regression trees on the KNN-imputed training split, seeded for repeatability.
rf_nan = RandomForestRegressor(random_state=42, n_estimators=1000)
rf_nan.fit(X=X_train, y=Y_train)

# Random Forest With KNN Imputation Evaluation 

In [None]:
# Held-out performance of the random forest on imputed data: RMSE (in the units of Y_test) and R^2.
Y_test_rf_nan = rf_nan.predict(X_test)
rf_nan_MSE = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_test_rf_nan)
rf_nan_RMSE = math.sqrt(rf_nan_MSE)
print(
    "The Root Mean Squared Error of Rnadom Forest with KNN imputation is :",
    rf_nan_RMSE,
)
print(
    "The Coefficient of Determination with KNN imputation is : ",
    metrics.r2_score(y_true=Y_test, y_pred=Y_test_rf_nan),
)

# Read Ecommerce Data With Missing Values

In [None]:
# Reload the data set with missing values for the row-deletion experiment.
# Only `commerce_nan` is bound here: the former chained assignment also rebound
# `comm_nan`, silently replacing the KNN-imputed frame with the raw data.
# Raw string: "\d" and "\E" are not valid escape sequences in a normal literal.
commerce_nan = pd.read_csv(r"D:\data\Ecommerce_with_NaN.csv")

# Identifying Missing Values

In [None]:
# Per-column flag: True when the column contains at least one missing value.
commerce_nan.isna().any()

In [None]:
# Number of missing values in each column.
commerce_nan.isnull().sum()

# Remove Rows With Missing Values

In [None]:
# Complete-case analysis: discard the rows that contain a missing value.
commerce_nan = commerce_nan.dropna()

In [None]:
# (rows, columns) left after the rows with missing values were dropped.
commerce_nan.shape

# Remove First 3 Columns

In [None]:
# Discard the three identifier/text columns; only the remaining columns are kept.
commerce_nan = commerce_nan.drop(columns=['Email', 'Address', 'Avatar'])

In [None]:
# Column labels that remain after the drop.
commerce_nan.columns

# Splitting Data 

In [None]:
# Target taken from the complete-case frame.
Y = commerce_nan['Yearly Amount Spent']

# Same four predictors as before, plus the explicit intercept column for OLS.
X = sm.add_constant(
    commerce_nan[['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
)

In [None]:
# Preview the design matrix, including the added constant column.
X.head()

In [None]:
# 70 % training / 30 % test split of the complete-case data, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# (rows, columns) of the training predictors.
X_train.shape

# Linear Regression After Removing Rows With Missing Values 

In [None]:
# Ordinary least squares on the complete-case training split; the summary is the cell output.
lm_nan_remove = sm.OLS(endog=Y_train, exog=X_train).fit()
lm_nan_remove.summary()

# Linear Regression After Removing Rows With Missing Values: Evaluation

In [None]:
# Held-out performance of the OLS model fitted after the rows with missing
# values were removed: RMSE (in the units of Y_test) and R^2.
# The labels now say "after removing": the previous wording ("with Missing Value
# Cells", "with with") described the opposite of what this experiment does.
Y_test_lm_nan_remove =lm_nan_remove.predict(X_test)
lm_nan_remove_MSE = metrics.mean_squared_error(Y_test , Y_test_lm_nan_remove)
lm_nan_remove_RMSE = math.sqrt(lm_nan_remove_MSE)
print("The Root Mean Squared Error of Linear Regression after removing Missing Value Cells is :" , lm_nan_remove_RMSE)
print("The Coefficient of Determination after removing Missing Value Cells is: ",metrics.r2_score(Y_test , Y_test_lm_nan_remove))

# Remove Constant Column From X_train and X_test

In [None]:
# Remove the intercept column added by sm.add_constant (named "const") before
# fitting the tree-based models. Dropping by name keeps the cell safe to re-run;
# a positional slice would strip a real feature on the second execution.
X_train = X_train.drop(columns="const", errors="ignore")
X_test = X_test.drop(columns="const", errors="ignore")

# Decision Tree After Removing Rows With Missing Values

In [None]:
# Regression tree with default hyper-parameters on the complete-case training split.
# random_state is pinned to the seed used by the other tree and forest models in
# this notebook so that the reported scores are reproducible from run to run.
tree_nan_remove = DecisionTreeRegressor(random_state = 42)
tree_nan_remove.fit(X_train , Y_train)

# Decision Tree After Removing Rows With Missing Values: Evaluation

In [None]:
# Held-out performance of the decision tree on complete-case data: RMSE (in the units of Y_test) and R^2.
Y_test_tree_nan_remove = tree_nan_remove.predict(X_test)
tree_nan_remove_MSE = metrics.mean_squared_error(y_true=Y_test, y_pred=Y_test_tree_nan_remove)
tree_nan_remove_RMSE = math.sqrt(tree_nan_remove_MSE)
print(
    "The Root Mean Squared Error of Decision Tree after removing empty cells is :",
    tree_nan_remove_RMSE,
)
print(
    "The Coefficient of Determination of Decision Tree after removing empty cells is:",
    metrics.r2_score(y_true=Y_test, y_pred=Y_test_tree_nan_remove),
)

# Random Forest After Removing Rows With Missing Values

In [None]:
# Forest of 1000 regression trees on the complete-case training split, seeded for repeatability.
rf_nan_remove = RandomForestRegressor(random_state=42, n_estimators=1000)
rf_nan_remove.fit(X=X_train, y=Y_train)

In [None]:
# Held-out performance of the random forest fitted after the rows with missing
# values were removed: RMSE (in the units of Y_test) and R^2.
Y_test_rf_nan_remove = rf_nan_remove.predict(X_test)
rf_nan_remove_MSE = metrics.mean_squared_error(Y_test , Y_test_rf_nan_remove)
rf_nan_remove_RMSE = math.sqrt(rf_nan_remove_MSE)
# Report this experiment's own RMSE; the cell previously printed rf_nan_RMSE,
# which is the score of the KNN-imputation experiment.
print("The Root Mean Squared Error of Rnadom Forest after removing the missing values is :" , rf_nan_remove_RMSE)
print("The Coefficient of Determination after removing the missing values is : ",metrics.r2_score(Y_test,Y_test_rf_nan_remove))