In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [13]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='logistic_r.csv')

In [14]:
# Preview the first five rows to check the column names and value ranges.
data.head(n=5)

Unnamed: 0,user_id,gender,age,salary,puchased
0,151890,male,22,20000,0
1,157927,female,46,50000,0
2,163964,female,47,25000,0
3,170001,female,44,70000,1
4,176038,male,45,32000,0


In [15]:
# To predict whether a user will purchase the product, the model uses only the
# age and salary columns as inputs; user_id and gender are deliberately left out.
#
# Columns are selected by name rather than by position, so the selection cannot
# silently pick the wrong columns if the CSV layout changes (for example, if an
# extra index column is written to the file).

FEATURE_COLUMNS = ['age', 'salary']
TARGET_COLUMN = 'puchased'  # spelled this way in the dataset's header (sic)

# input: 2-D array, one row per user and one column per feature
x = data[FEATURE_COLUMNS].to_numpy()

# output: 1-D array of 0/1 labels (presumably 1 = the user purchased)
y = data[TARGET_COLUMN].to_numpy()

In [6]:
# Hold out 25% of the rows as a test set for measuring performance and train the
# model on the remaining 75%. random_state is fixed so that the same rows land
# in each split on every run.

from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [7]:
# Feature scaling: age (tens) and salary (tens of thousands) live on very
# different scales, so both are standardized to zero mean and unit variance.
# For logistic regression this helps the solver converge and keeps the
# regularization penalty from treating the two features unequally.
#
# The scaler is fitted on the training split only and then reused for the test
# split, so no test-set statistics leak into training.
#
# NOTE: this cell overwrites xtrain/xtest; re-run the split cell before
# re-running this one so the scaler is always fitted on the raw values.

from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler().fit(xtrain)
xtrain = sc_x.transform(xtrain)
xtest = sc_x.transform(xtest)

# Show the first ten scaled training rows as a sanity check.
print(xtrain[:10])

[[-0.82351334  1.43436951]
 [ 1.07322384 -1.39619238]
 [-0.26564946 -0.49630276]
 [-0.26564946 -0.19361261]
 [ 1.63108771 -0.11180447]
 [ 0.18064164  0.89443574]
 [-1.26980444 -0.13634691]
 [ 0.06906886 -0.31632484]
 [-0.37722224  0.53447989]
 [-1.04665889  0.17452405]]


In [8]:
# After standardization each feature has zero mean and unit variance on the
# training split. The values are NOT confined to the -1 to 1 range (the sample
# printed by the scaling cell contains e.g. 1.63 and -1.40), but both features
# are now on a comparable scale, so neither dominates purely because of its units.

# Train the logistic regression model on the scaled training data.
# Per the scikit-learn docs, random_state only affects the 'sag', 'saga' and
# 'liblinear' solvers; it is kept here so results stay reproducible if the
# solver is changed.

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)
classifier.fit(X=xtrain, y=ytrain)

LogisticRegression(random_state=0)

In [9]:
# Predict a class label for every row of the (already scaled) test set.
y_pred = classifier.predict(X=xtest)

In [10]:
# Evaluate the model on the held-out test set with a confusion matrix.
# In scikit-learn's layout, rows are the true classes and columns are the
# predicted classes, so the diagonal holds the correctly classified counts.

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=ytest, y_pred=y_pred)

print("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[2 2]
 [0 3]]


In [11]:
# Accuracy: the fraction of test samples whose predicted label matches the
# true label.

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=ytest, y_pred=y_pred)
print("Accuracy : ", accuracy)

Accuracy :  0.7142857142857143
