In [None]:
# Step1: Import packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
import seaborn as sns
# Notebook-wide plotting configuration.
# Apply seaborn's styling, with its color_codes option switched on.
sns.set(color_codes = True)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Step2:  Load our data
# Read the customer CSV and shorten the verbose headers to the identifiers
# used by later cells. Chaining .rename() avoids in-place mutation of the frame.
df = pd.read_csv('./data/Mall_Customers.csv').rename(
    columns={
        'CustomerID': 'id',
        'Spending Score (1-100)': 'score',
        'Annual Income (k$)': 'income',
    }
)
# Only the last expression of a cell is rendered, so a bare df.head() followed
# by df.tail() would silently drop the head. Preview the first and last five
# rows together in a single table instead.
pd.concat([df.head(), df.tail()])

In [None]:
# Step3: Feature Engineering - transforming variables as appropriate for inputs to Machine Learning Algorithm
# One-hot encode the categorical Gender column: one indicator column per category.
gender_column = df['Gender']
gender_onhot = pd.get_dummies(gender_column)
# Peek at the last rows of the encoded frame.
gender_onhot.tail()

In [None]:
# Assemble the model inputs (aka X): Age and income next to the gender
# indicator columns, matched row-by-row on the shared index.
numeric_inputs = df[['Age', 'income']]
X = numeric_inputs.merge(gender_onhot, left_index=True, right_index=True)
X.head()

In [None]:
# Target variable: the renamed spending-score column.
Y = df.loc[:, 'score']
Y.head()

In [None]:
# Standardize the input columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row of X.
scaler = StandardScaler()
X_std = scaler.fit(X).transform(X)

In [None]:
# Step3 (cont.): Split data in train & test set
# Split the *unscaled* features first, then fit the scaler on the training
# rows only. Fitting StandardScaler on the full dataset lets the held-out rows
# influence the mean/variance applied to the training data (data leakage),
# which makes the test score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=40)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)  # standardized with training statistics
X_test = scaler.transform(X_test)    # same transform applied to the unseen rows
print('Shape of Training Xs:{}'.format(X_train.shape))
print('Shape of Test Xs:{}'.format(X_test.shape))

In [None]:
# Step4: Build Linear Regression Analysis Model
# Linear model fitted by stochastic gradient descent with an L2 penalty and a
# constant step size (eta0). random_state pins the data shuffling done during
# training so the coefficients and score are reproducible after a kernel restart.
learner = SGDRegressor(
    max_iter=1000,
    penalty="l2",
    eta0=0.0001,
    learning_rate="constant",
    random_state=40,
)

learner.fit(X_train, y_train)  # train on the training split
y_predicted = learner.predict(X_test)  # predictions for the held-out rows
score = learner.score(X_test, y_test)  # R^2 of the predictions on the test split

In [None]:
# Show the test-set score, then the raw predictions, one per line.
print(score, y_predicted, sep='\n')

In [None]:
# Step6: Diagnostic analysis
from sklearn.metrics import mean_squared_error, r2_score
# intercept_ is a length-1 array; index it so "%.2f" formats a plain scalar
# (implicit conversion of a 1-D array to float is deprecated in recent NumPy).
print("Intercept is at: %.2f" % learner.intercept_[0])
# The coefficients
print('Coefficients: \n', learner.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_predicted))
# Coefficient of determination (R^2): 1 is perfect prediction
print('Variance score: %.4f' % r2_score(y_test, y_predicted))

In [None]:
# Step5: Check Accuracy of Model
# Put actual and predicted scores side by side. y_test supplies the row index;
# y_predicted (the predictions for X_test) is attached positionally.
df_new = pd.DataFrame({"true_score": y_test, "predicted_score": y_predicted})
# Preview the comparison so the cell actually shows its result.
df_new.head()