In [1]:
# Step1: Import packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(color_codes = True)
%matplotlib inline

In [2]:
# Step2:  Load our data
df = pd.read_csv('./data/Mall_Customers.csv')
df.rename(columns={'CustomerID':'id','Spending Score (1-100)':'score','Annual Income (k$)':'income'},inplace=True)
df.head() # Visualize first 5 rows of data
df.tail()

Unnamed: 0,id,Gender,Age,income,score
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18
199,200,Male,30,137,83


In [3]:
# Step3: Feature Engineering - transforming variables as appropriate for inputs to Machine Learning Algorithm
# transforming categorical variable Gender using One hot encodding
gender_onhot = pd.get_dummies(df['Gender'])
gender_onhot.tail()

Unnamed: 0,Female,Male
195,1,0
196,1,0
197,0,1
198,0,1
199,0,1


In [4]:
# Create input dataset aka X
X = pd.merge(df[['Age','income']], gender_onhot, left_index=True, right_index=True)
X.head()

Unnamed: 0,Age,income,Female,Male
0,19,15,0,1
1,21,15,0,1
2,20,16,1,0
3,23,16,1,0
4,31,17,1,0


In [5]:
# Create target variable
Y = df['score']
Y.head()

0    39
1    81
2     6
3    77
4    40
Name: score, dtype: int64

In [6]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [7]:
X_std[0:5]

array([[-1.42456879, -1.73899919, -1.12815215,  1.12815215],
       [-1.28103541, -1.73899919, -1.12815215,  1.12815215],
       [-1.3528021 , -1.70082976,  0.88640526, -0.88640526],
       [-1.13750203, -1.70082976,  0.88640526, -0.88640526],
       [-0.56336851, -1.66266033,  0.88640526, -0.88640526]])

In [8]:
# Step3: Split data in train & test set
X_train, X_test, y_train, y_test = train_test_split(X_std,Y,test_size=0.20,random_state = 40)
print('Shape of Training Xs:{}'.format(X_train.shape))
print('Shape of Test Xs:{}'.format(X_test.shape))

Shape of Training Xs:(160, 4)
Shape of Test Xs:(40, 4)


In [9]:
# Step4: Build Linear Regression Analysis Model
learner = SGDRegressor(max_iter=1000,penalty= "l2",eta0=0.0001,
                      learning_rate = "constant")#initializing linear regression model

learner.fit(X_train,y_train); #training the linear regression model
y_predicted = learner.predict(X_test)
score=learner.score(X_test,y_test);#testing the linear regression model

In [10]:
print(score)
print(y_predicted)

0.13174905289114425
[46.1924418  58.36391241 37.34494244 59.68307473 58.19493276 41.9760458
 43.69101625 59.96366054 34.22489035 56.72684113 53.66177891 57.7292456
 56.69467915 45.82470415 44.39556211 41.22052016 56.08006268 51.46499959
 59.24430695 54.62122372 49.41322094 48.70990761 43.17712684 50.16596904
 44.66936031 55.66535531 39.45047842 61.04519671 51.6300506  44.54325888
 59.22016511 43.55010708 53.70473856 53.80947569 51.07844412 48.08603848
 48.85359432 62.22745993 33.7082234  55.50562833]


In [11]:
# Step6: Diagnostic analysis
from sklearn.metrics import mean_squared_error, r2_score
print("Intercept is at: %.2f"%(learner.intercept_))
# The coefficients
print('Coefficients: \n', learner.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_predicted))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.4f' % r2_score(y_test, y_predicted))

Intercept is at: 49.89
Coefficients: 
 [-7.9096131   1.23055871  0.5375498  -0.5375498 ]
Mean squared error: 499.37
Variance score: 0.1317


In [12]:
# Step5: Check Accuracy of Model
df_new = pd.DataFrame({"true_score":y_test,"predicted_score":y_predicted})
# df_new

In [None]:
Y_hat = 49.91 - 7.9*Age + 1.23*Income + 0.53* Female - 0.53*M