# Logistic Regression


## Importing the libraries

In [1]:
# numpy , matplotlib.pyplot , pandas , seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Importing the dataset

In [2]:
# Name of the dataset is "Social_Network_Ads.csv"
dataset = pd.read_csv('Social_Network_Ads.csv')

In [3]:
# Q) How many data points and features ?
print(dataset.shape)

(400, 5)


In [4]:
# Q) What are the column names in our dataset ?
print(dataset.columns)

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')


In [5]:
# Q) How many datapoints for each class are present ?
dataset["Purchased"].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [6]:
# First five rows of the dataset
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [7]:
# Dataset Stats / Description
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


## Plot of Age vs EstimatedSalary (Independent Variables)

In [8]:
%matplotlib
sns.set_style("whitegrid");
sns.FacetGrid(dataset, hue="Purchased",size=4) \
    .map(plt.scatter, "Age","EstimatedSalary") \
    .add_legend();
plt.show()

Using matplotlib backend: Qt5Agg


## Matrix of features X and Vector of Independent Variable

In [9]:
# Matrix of features will contain Age and Estimated salary (Independent Variables)
X = dataset.iloc[:, [2, 3]].values
# Vector of Predictions will contain the Dependent variables
y = dataset.iloc[:, 4].values


## Splitting the dataset into the Training set and Test set

A convenient way to randomly partition this dataset into a separate test and training dataset is to use the 
train_test_split function

In [10]:
# X_train  = > Matrix of Features in the training set
# X_test   = > Matrix of Features in the test set
# y_train  = > Matrix of Features in the training set
# X_train  = > Matrix of Features in the test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Feature Scaling

  Most of the times, your dataset will contain features highly varying in magnitudes, units and range. But 
  since, most of the machine learning algorithms use Eucledian distance between two data points in their 
  computations, this is a problem.

  If left alone, these algorithms only take in the magnitude of features neglecting the units. The results 
  would vary greatly between different units, 5kg and 5000gms. The features with high magnitudes will weigh
  in a lot more in the distance calculations than features with low magnitudes.

  To suppress this effect, we need to bring all features to the same level of magnitudes. This can be
  achieved by scaling.
<img src="scale.png">

In [11]:
#Use StandardScaler Class for Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)



## Fitting Logistic Regression to the Training set

In [12]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Predicting the Test set results

In [13]:
#Store the test set results in a vector of predictions "y_pred"
y_pred = classifier.predict(X_test)

# Evaluating the Model's Performance

## 1) False Positives and False negatives
<img src="error.png">
### a) False Positives or Type I Errors :
  
A false positive is an error when the model incorrectly predicts the positive class  ie., the model
predicts a positive outcome whereas in reality the outcome is negative.
  
### b) False Negatives or Type II Errors :
 A false negative is an error when the model incorrectly predicts the negative class  ie., the model
 predicts a negative outcome whereas in reality the outcome is positive.

## 2) Reading a Confusion Matrix
 
The confusion matrix lays out the performance of a learning algorithm. It is simply a square matrix 
that reports the counts of the true positive , true negative , false positive and false negative 
predictions of a classifier as shown in the figure :
<img src="c.png">

## Making the Confusion Matrix

In [14]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[65  3]
 [ 8 24]]


## Calculate Accuracy rate and Error rate :


In [15]:
#Print the Accuracy and Error rate of prediction
total=cm[0][0] + cm [1][1] + cm [0][1] + cm[0][1]
correct= cm[0][0] + cm [1][1]

print("Accuracy Rate = " ,correct/total)
print("Error Rate = " ,(total-correct)/total)

Accuracy Rate =  0.9368421052631579
Error Rate =  0.06315789473684211


## Visualising the Training set results

In [16]:
plt.close()
%matplotlib
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

Using matplotlib backend: Qt5Agg


## Visualising the Test set results

In [35]:
%matplotlib
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

Using matplotlib backend: Qt5Agg
