Peter Ling 
Logistic Regression Model


In this project, I implemented a logistic regression algorithm to predict the output of a data point given its parameters.

In [197]:
import csv
import numpy as np
import pandas as pd
import math
# Step size for gradient ascent.
# NOTE(review): the training cells visible in this notebook pass 0.0001 as a
# literal instead of reading this constant -- confirm whether it is still needed.
LEARNING_RATE = 0.0001

# Path of the simple test set.
# NOTE(review): the visible cell that loads this file hard-codes the same path
# rather than reading this variable.
testing_file_location = "./simple-test.csv"

The sigmoid function is used as the squashing function in my algorithm.

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of a scalar.

    Args:
        x: A real scalar (Python float/int or a NumPy scalar).

    Returns:
        A float in the closed interval [0, 1].
    """
    if x >= 0:
        # exp(-x) is at most 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) grows without bound and math.exp raises
    # OverflowError once -x exceeds roughly 709. The algebraically equal form
    # e^x / (1 + e^x) only ever exponentiates a negative number.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

def dot(a, b):
    """Return the dot product of two equal-length sequences of numbers.

    Args:
        a: First sequence (anything supporting len() and iteration).
        b: Second sequence, of the same length as ``a``.

    Returns:
        The sum of the element-wise products; 0 for two empty sequences.

    Raises:
        ValueError: If the two sequences have different lengths. Failing here
            is clearer than handing a sentinel string back to numeric callers.
    """
    if len(a) != len(b):
        raise ValueError(
            f"dot() needs equal-length inputs, got {len(a)} and {len(b)}"
        )

    # Uses the built-in sum(); a notebook-level variable named `sum` would
    # shadow it and break this function.
    return sum(x * y for x, y in zip(a, b))

I used gradient ascent to train my parameters. This function takes in a dataframe of the training data, whose columns are the parameters and whose rows are the individual training data points. The last column is the output of each training data point, which is either 0 or 1. It also takes in the number of training steps the gradient ascent should run for, along with an appropriate learning rate.

In [243]:
def gradient_ascent(training_df, training_steps, learning_rate):
    """Fit logistic-regression weights by batch gradient ascent.

    Args:
        training_df: DataFrame in which every column except the last is a
            feature and the last column is the label. No intercept column is
            added here; the caller must supply one if it wants a bias term.
        training_steps: Number of full passes over the training rows.
        learning_rate: Step size applied to the summed gradient after each pass.

    Returns:
        A 1-D float64 NumPy array with one weight per feature column. All
        zeros when ``training_steps`` is 0.
    """
    n_params = training_df.shape[1] - 1

    # Pull the data out of pandas once. Indexing a row Series with an integer
    # position (df.iloc[row][col]) is deprecated for label-indexed Series and
    # is very slow when repeated for every element on every training step.
    features = training_df.iloc[:, :n_params].to_numpy(dtype=np.float64)
    labels = training_df.iloc[:, n_params].to_numpy(dtype=np.float64)

    thetas = np.zeros(n_params, np.float64)

    for _ in range(training_steps):
        gradients = np.zeros(n_params)

        # Accumulate row by row so the summation order, and therefore the
        # floating-point result, matches a straightforward per-example loop.
        for x_row, y in zip(features, labels):
            error = y - sigmoid(np.matmul(x_row, thetas))
            gradients = np.add(gradients, x_row * error)

        # Ascent: step in the direction of the gradient.
        thetas = np.add(thetas, learning_rate * gradients)

    return thetas
        

The get_predictions function looks at all of the parameters for a given example and makes a prediction using the thetas we found in the previous step.

In [253]:
def get_predictions(thetas, testing_df):
    """Classify every row of ``testing_df`` as 0 or 1 using fitted weights.

    Args:
        thetas: Sequence of weights, one per feature column.
        testing_df: DataFrame in which every column except the last is a
            feature. The last column is ignored by this function.

    Returns:
        A list of ints with one entry per row: 1 when the sigmoid of the
        weighted sum is strictly greater than 0.5, otherwise 0.
    """
    n_params = len(testing_df.columns) - 1

    # Extract the feature matrix once instead of indexing each cell through
    # df.iloc[row][col], which relies on deprecated positional Series lookup.
    features = testing_df.iloc[:, :n_params].to_numpy(dtype=float)

    predictions = []
    for x_vector in features:
        probability = sigmoid(dot(thetas, x_vector))
        # A probability of exactly 0.5 is classified as 0.
        predictions.append(1 if probability > 0.5 else 0)

    return predictions
    
    

I would use the check_validity function to see how accurate the predictions were by comparing my predicted outputs with the true outputs. 

In [261]:
def check_validity(predictions, testing_df):
    """Print per-class and overall accuracy of ``predictions``.

    Args:
        predictions: Sequence of predicted labels, indexed in the same order
            as the rows of ``testing_df``.
        testing_df: DataFrame whose last column holds the true labels. Any
            label that is not equal to 0 is counted as class 1.

    Returns:
        None. The report is written to standard output. For an empty test set
        the accuracy is reported as ``nan`` instead of dividing by zero.
    """
    n_tests = len(testing_df)
    n_params = len(testing_df.columns) - 1

    # Read the label column once; looking labels up via df.iloc[row][n_params]
    # relies on deprecated positional indexing of a label-indexed Series.
    labels = testing_df.iloc[:, n_params].to_numpy() if n_tests else []

    n_y0_tests = 0
    n_y1_tests = 0
    correct_predictions = 0
    correct_0_predictions = 0
    correct_1_predictions = 0

    for test in range(n_tests):
        actual = labels[test]
        predicted = predictions[test]

        if actual == 0:
            n_y0_tests += 1
        else:
            n_y1_tests += 1

        if predicted == actual:
            correct_predictions += 1
            if predicted == 0:
                correct_0_predictions += 1
            else:
                correct_1_predictions += 1

    # Accuracy is undefined when there is nothing to score.
    model_accuracy = correct_predictions / n_tests if n_tests else float("nan")

    print("Using Logistic Regression")

    print(f"Class 0: tested {n_y0_tests}, correctly classified {correct_0_predictions}")
    print(f"Class 1: tested {n_y1_tests}, correctly classified {correct_1_predictions}")
    print(f"Overall: tested {n_tests}, correctly classified {correct_predictions}")
    print(f"Accuracy: {model_accuracy}")
    
    

In [269]:
# Load the simple training set and prepend a constant column 'x0' of ones,
# so the first weight learned by gradient_ascent multiplies a constant 1.
simple_train_df = pd.read_csv('./simple-train.csv')
s = [1] * len(simple_train_df)
simple_train_df.insert(0, 'x0', s)
# Number of gradient-ascent passes used when this data set is trained.
training_steps = 10000

In [270]:
# Fit the weights on the simple training set; the learning rate is passed
# here as a literal.
simple_thetas = gradient_ascent(simple_train_df, training_steps, .0001)

In [271]:
# Show the learned weights; the first entry belongs to the inserted 'x0' column.
print(simple_thetas)

[-0.14577434  0.82004294 -0.06660849]


In [272]:
# Load the simple test set and give it the same leading 'x0' column of ones
# that the training frame received, so the columns line up with the weights.
simple_test_df = pd.read_csv('./simple-test.csv')
s = [1] * len(simple_test_df)
simple_test_df.insert(0, 'x0', s)
# NOTE(review): get_predictions and check_validity do not take a step count,
# and training_steps is reassigned before the next visible training call.
training_steps = 10000

In [273]:
# Predict a 0/1 label for every row of the simple test set.
simple_predictions = get_predictions(simple_thetas, simple_test_df)

In [274]:
# Print the accuracy report for the simple data set.
check_validity(simple_predictions, simple_test_df)

Using Logistic Regression
Class 0: tested 2, correctly classified 2
Class 1: tested 2, correctly classified 2
Overall: tested 4, correctly classified 4
Accuracy: 1.0


In [276]:
# Load the Netflix training set, remove the 'Demographic' column, and prepend
# the constant 'x0' column of ones.
# NOTE(review): the reason 'Demographic' is dropped is not stated in the
# notebook -- presumably it is not a usable feature; confirm.
netflix_train_df = pd.read_csv('./netflix-train.csv')
del netflix_train_df['Demographic']
s = [1] * len(netflix_train_df)
netflix_train_df.insert(0, 'x0', s)
# Number of gradient-ascent passes for the Netflix model.
training_steps = 3000


In [278]:
# Fit the weights on the Netflix training set with a literal learning rate.
netflix_thetas = gradient_ascent(netflix_train_df, training_steps, 0.0001)

In [279]:
# Show the learned weights; the first entry belongs to the inserted 'x0' column.
print(netflix_thetas)

[-1.50497638  0.23104825 -0.0028295  -0.13271726 -0.09906566  0.2528209
  0.00276221 -0.02652852  0.21079665 -0.01290325 -0.0928241   0.08403367
  0.04436495  0.16957462 -0.07855384 -0.02945679 -0.02897179 -0.01414843
  0.22249834  1.86511166]


In [280]:
# Load the Netflix test set and apply the same preparation as the training
# frame: drop 'Demographic' and prepend the constant 'x0' column.
netflix_test_df = pd.read_csv('./netflix-test.csv')
del netflix_test_df['Demographic']
s = [1] * len(netflix_test_df)
netflix_test_df.insert(0, 'x0', s)

In [282]:
# Predict a 0/1 label for every row of the Netflix test set.
netflix_predictions = get_predictions(netflix_thetas, netflix_test_df)

In [283]:
# Print the accuracy report for the Netflix data set.
check_validity(netflix_predictions, netflix_test_df)

Using Logistic Regression
Class 0: tested 248, correctly classified 150
Class 1: tested 252, correctly classified 189
Overall: tested 500, correctly classified 339
Accuracy: 0.678


In [297]:


n_rows = len(netflix_train_df)
n_params = len(netflix_train_df.columns) - 1

# Read the label column once; df.iloc[row][n_params] relies on deprecated
# positional indexing of a label-indexed Series.
netflix_train_labels = netflix_train_df.iloc[:, n_params].to_numpy()

# With every weight at 0 the model outputs sigmoid(0) = 0.5 for each row, so
# each example contributes log(0.5) whichever class it belongs to.
# The accumulator is deliberately NOT called `sum`: rebinding that name at
# notebook scope shadows the built-in that dot() calls, which breaks every
# later get_predictions() call on a top-to-bottom run.
baseline_log_likelihood = 0

for row in range(n_rows):
    y = netflix_train_labels[row]
    baseline_log_likelihood += (y * np.log(0.5)) + ((1 - y) * (np.log(0.5)))

print(f"Log Likelyhood when all paramters are 0: {baseline_log_likelihood}")
    

    
    
    

Log Likelyhood when all paramters are 0: -3119.1623125199


In [298]:


n_rows = len(netflix_train_df)
n_params = len(netflix_train_df.columns) - 1

# Extract features and labels once; indexing each cell via df.iloc[row][col]
# relies on deprecated positional Series lookup and is slow.
netflix_train_features = netflix_train_df.iloc[:, :n_params].to_numpy(dtype=np.float64)
netflix_train_labels = netflix_train_df.iloc[:, n_params].to_numpy()

# The accumulator is deliberately NOT called `sum`: rebinding that name at
# notebook scope shadows the built-in that dot() calls, which breaks every
# later get_predictions() call on a top-to-bottom run.
trained_log_likelihood = 0

for row in range(n_rows):
    y = netflix_train_labels[row]
    x_numpy_array = netflix_train_features[row]

    # Model probability that this row belongs to class 1.
    sig_theta_x = sigmoid(np.matmul(x_numpy_array, netflix_thetas.T))

    trained_log_likelihood += (y * np.log(sig_theta_x)) + ((1 - y) * np.log(1 - sig_theta_x))

print(f"Log Likelyhood after training: {trained_log_likelihood}")
    
    
    


Log Likelyhood after training: -2601.775260916126


In [285]:
# Load the ancestry training set and prepend the constant 'x0' column of ones.
ancestry_train_df = pd.read_csv('./ancestry-train.csv')
s = [1] * len(ancestry_train_df)
ancestry_train_df.insert(0, 'x0', s)
# NOTE(review): this value is replaced by 10000 in the cell that actually
# trains the ancestry model, so 100 is never used by the visible code.
training_steps = 100

In [287]:
# Load the ancestry test set and prepend the same constant 'x0' column that
# the training frame received.
ancestry_test_df = pd.read_csv('./ancestry-test.csv')
s = [1] * len(ancestry_test_df)
ancestry_test_df.insert(0, 'x0', s)

In [291]:
# Set the step count for the ancestry model, then fit its weights with a
# literal learning rate.
training_steps = 10000
ancestry_thetas = gradient_ascent(ancestry_train_df, training_steps, 0.0001)

In [292]:
# Predict a 0/1 label for every row of the ancestry test set.
ancestry_predictions = get_predictions(ancestry_thetas, ancestry_test_df)

In [293]:
# Print the accuracy report for the ancestry data set.
check_validity(ancestry_predictions, ancestry_test_df)

Using Logistic Regression
Class 0: tested 109, correctly classified 98
Class 1: tested 75, correctly classified 56
Overall: tested 184, correctly classified 154
Accuracy: 0.8369565217391305
