In [1]:
# Implementing Logistic Regression

In [2]:
# Source Title:
#    Logistic Regression in Python Using Scikit-learn
#
# Source URL: 
#    https://heartbeat.fritz.ai/logistic-regression-in-python-using-scikit-learn-d34e882eebb1

# Data Download:
#    https://raw.githubusercontent.com/dhirajk100/LogReg01/master/Student-Pass-Fail-Data.csv

In [3]:
# Import required Libraries

import pandas as pd
import numpy as np
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
# Load the data

# The dataset is a CSV file on the local file system, so pandas' read_csv
# is used to load it into a DataFrame. The path is relative to the
# notebook's working directory. The file can be downloaded from:
#    https://raw.githubusercontent.com/dhirajk100/LogReg01/master/Student-Pass-Fail-Data.csv
csv_path = 'data/Student-Pass-Fail-Data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Self_Study_Daily,Tution_Monthly,Pass_Or_Fail
0,7,27,1
1,2,43,0
2,7,26,1
3,8,29,1
4,3,42,0


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1000, 3)

In [6]:
# What are the data features?

# Note that the loaded data has two features, namely: 
#  * Self_Study_Daily
#  * Tution_Monthly  (spelled this way in the CSV; it means "tuition")

# Self_Study_Daily indicates how many hours the student 
#                  studies daily at home (x1)
#
# Tution_Monthly indicates how many hours per month the 
#                student is taking private tutor classes (x2)

In [7]:
# Label?

# We have one label in the dataset, named Pass_Or_Fail. 
# This label has two values: either 1 or 0. 
# A value of 1 indicates pass and a value of 0 indicates fail.

In [8]:
# Separate the predictors from the target.
# x holds the feature columns (x1, x2): every column except the label.
# y holds the label column, Pass_Or_Fail.
label_column = 'Pass_Or_Fail'
x = df.drop(columns=[label_column])
y = df[label_column]

In [9]:
# Dimensions of the feature frame as (rows, columns).
x.shape

(1000, 2)

In [10]:
# Dimensions of the label Series; it is one-dimensional, so the shape is (rows,).
y.shape

(1000,)

In [11]:
# Preview the first rows of the feature frame (the label column is gone).
x.head()

Unnamed: 0,Self_Study_Daily,Tution_Monthly
0,7,27
1,2,43
2,7,26
3,8,29
4,3,42


In [12]:
# Preview the first labels; they share the row index of the features.
y.head()

0    1
1    0
2    1
3    1
4    0
Name: Pass_Or_Fail, dtype: int64

In [13]:
# Split data into Training and Test

# Hold out part of the data for evaluation. test_size=0.25 spells out
# scikit-learn's default split: 25% of the rows go to the test subset
# and the remaining 75% to the training subset. The fixed random_state
# makes the shuffle reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=4,
)

In [14]:
# Number of rows in the training features.
len(x_train)

750

In [15]:
# Number of rows in the test features.
len(x_test)

250

In [16]:
# Number of training labels; should match the number of training feature rows.
len(y_train)

750

In [17]:
# Number of test labels; should match the number of test feature rows.
len(y_test)

250

In [18]:
# Build Logistic Regression Model

# Instantiate the (not yet fitted) classifier from the LogisticRegression
# class imported at the top of the notebook, selecting the 'lbfgs' solver.
logistic_regression = LogisticRegression(solver='lbfgs')

In [19]:
# Train the Model

# Once the model is defined, we can fit it to our data. 
# We’re going to use the model’s fit method to train it on the training set. 
# Note that the fit method takes two arguments here: 
#           x_train (the features) and y_train (the labels)

In [20]:
# Fit the classifier on the training features and labels.
# As the last expression in the cell, the value returned by fit is displayed below.
logistic_regression.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# Predict

# Once model training is complete, 
# it's time to make predictions with the model. 
# For this, we call the model's predict method 
# and pass it the held-out x_test feature values. 
# The predicted labels are stored in the y_pred variable.
y_pred = logistic_regression.predict(x_test)

In [22]:
# Finding Accuracy

# Evaluate the model on the held-out data. accuracy_score is a function
# in the sklearn.metrics module; it compares the true test labels with
# the predicted ones and returns the fraction predicted correctly.
# That fraction is then scaled to a percentage for display.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_percentage = accuracy * 100
accuracy_percentage

96.8

In [23]:
# What does this mean?
# We found that the accuracy of the model on the test set is 96.8%. 
# By accuracy, we mean the number of correct predictions 
# divided by the total number of predictions.

In [24]:
# Predicting Pass or Fail

# Now that we’ve tested our model, 
# we want to predict the pass-or-fail outcome 
# for a few of our friends. For this, we need to pass, 
# as input to the model, the feature values of each friend, 
# starting with First_Friend. 

# Our First_Friend has Self_Study_Daily and Tution_Monthly values of 4 and 38, respectively.

In [25]:
# Predict whether First_Friend will pass (1) or fail (0).
# First_Friend's feature values are 4 (Self_Study_Daily) and
# 38 (Tution_Monthly), in the same column order as x.
# The model was fitted on a DataFrame, so the query row is also built as a
# one-row DataFrame with the same columns as x. Passing a bare NumPy array
# drops the feature names, which scikit-learn >= 1.0 reports with an
# "X does not have valid feature names" warning.
first_friend_features = pd.DataFrame([[4, 38]], columns=x.columns)
First_Friend = logistic_regression.predict(first_friend_features)

In [26]:
# Show the predicted label for First_Friend (0 => failed, 1 => passed).
First_Friend

array([0])

In [27]:
# Predict whether Second_Friend will pass (1) or fail (0).
# Second_Friend's feature values are 8 and 29, given in the same column
# order as x (Self_Study_Daily first, then Tution_Monthly).
# The query row is built as a one-row DataFrame with the same columns as x,
# so its feature names match the data the model was fitted on. A bare NumPy
# array triggers an "X does not have valid feature names" warning on
# scikit-learn >= 1.0.
second_friend_features = pd.DataFrame([[8, 29]], columns=x.columns)
Second_Friend = logistic_regression.predict(second_friend_features)

In [28]:
# Show the predicted label for Second_Friend (0 => failed, 1 => passed).
Second_Friend

array([1])

In [29]:
# 0 => Failed
# 1 => Passed