<a href="https://colab.research.google.com/github/peterakdemir1/cs301/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''!mkdir ~/.kaggle
!mv ~/../content/kaggle.json ~/.kaggle/
!kaggle competitions download avazu-ctr-prediction
!unzip avazu-ctr-prediction.zip
!gzip -d train.gz
!gzip -d test.gz '''

# The commands above (kept inside a string so they do not run) were used to download the dataset,
# unzip the archive containing the train, test, and sampleSubmission files, and decompress the train and test sets

# These libraries were necessary for this program
import pandas as pd 
import numpy as np
import random
import math

# Column names of the train set, listed in the order they are passed to read_csv
labels = pd.Series([
  'id', 'click', 'hour', 'C1', 'banner_pos',
  'site_id', 'site_domain', 'site_category',
  'app_id', 'app_domain', 'app_category',
  'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
  'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
])



In [None]:
# Sigmoid function: maps a real-valued score x to a value between 0 and 1 (used to bound y-hat).
# The result is always a float. The two-branch form below only ever calls exp() on a
# non-positive argument, so very negative scores (e.g. < -40000) underflow smoothly to 0.0
# instead of overflowing, and no hard cutoff is needed.
def logistic(x):
  if x >= 0:
    # exp(-x) <= 1 here, so the textbook form is safe
    return 1.0 / (1 + math.exp(-x))
  # For negative x, rewrite 1/(1+e^-x) as e^x/(1+e^x) so exp() cannot overflow
  scaled = math.exp(x)
  return scaled / (1 + scaled)

# Element-wise sum of two vectors. The result has one entry per element of v1;
# v2 is read at the same positions.
def add(v1, v2):
  return [left + v2[pos] for pos, left in enumerate(v1)]

# Adds up a list of equal-length vectors position by position and returns the
# resulting vector. The output length is taken from the first vector in the list.
def vec_sum(vec):
  width = len(vec[0])
  return [sum(row[col] for row in vec) for col in range(width)]

# Scales a vector: returns a new list in which every element of vec is multiplied by n
def vec_mult(n, vec):
  return [n * component for component in vec]

# Cross-entropy loss for a single row of data.
#   x: feature values of the row, y: its label, w: current weights.
# Returns -(y*log(yhat) + (1-y)*log(1-yhat)), where yhat = logistic(w . x).
def get_single_loss(x, y, w):
  yhat = logistic(np.dot(np.transpose(w),x)) # This is the prediction y hat - the model wTx plugged into the sigmoid function
  # log(0) is undefined, so a saturated prediction (exactly 0 or 1) is nudged just inside (0, 1).
  # Clipping instead of returning a fixed value keeps the label in play: a saturated prediction
  # costs almost nothing when it agrees with y and a lot when it does not.
  eps = 1e-15
  yhat = min(max(yhat, eps), 1 - eps)
  return -(y*math.log(yhat) + (1-y)*math.log(1-yhat)) # This is the entropy equation

# Total loss of the data set: the sum of get_single_loss over every row.
#   xs: DataFrame of features (one row per sample), ys: labels in the same row order, w: weights.
def get_loss(xs, ys, w):
  targets = np.asarray(ys) # positional access, so the label always matches xs.iloc[row]
  # Iterate over the rows (len(xs)); the length of a single row would be the number of columns
  return sum(get_single_loss(xs.iloc[row], targets[row], w) for row in range(len(xs)))

# Partial derivative of the single-row loss with respect to the i-th weight:
# -(y - logistic(x . w)) * x[i]
def get_partial(x, y, w, i):
  residual = y - logistic(np.dot(x, w)) # label minus prediction
  return -residual * x[i]

# Gradient of the single-row loss: a list with one partial derivative per weight,
# where entry i is -(y - logistic(x . w)) * x[i].
def get_single_gradient(x, y, w):
  # The prediction error is the same for every weight, so compute the dot product
  # and the sigmoid once instead of once per partial derivative.
  error = y - logistic(np.dot(x, w))
  return [-error * x[i] for i in range(len(w))]

# Gradient for the whole batch: the per-row gradients summed position by position.
#   xs: DataFrame of features (one row per sample), ys: labels in the same row order, w: weights.
# Returns a list with one entry per weight.
def get_gradient(xs, ys, w):
  targets = np.asarray(ys) # positional access, so the label always matches xs.iloc[row]
  # Iterate over the rows (len(xs)); the length of a single row would be the number of columns
  gradient = [get_single_gradient(xs.iloc[row], targets[row], w) for row in range(len(xs))]
  if not gradient:
    # An empty batch contributes nothing; vec_sum cannot handle an empty list
    return [0.0] * len(w)
  return vec_sum(gradient)

# Applies the update rule Wk+1 = Wk - learningRate * gradient and returns the new weights
def get_gradient_step(w, gradient, learningRate):
  return add(w, vec_mult(-learningRate, gradient)) # w + (-learningRate * gradient)

# Performs ONE gradient descent step on the given batch and returns the updated weights.
# The caller repeats this per batch to train the model.
#   x_train: DataFrame of features, y_train: labels, w: current weights, learningRate: step size.
def gd(x_train, y_train, w, learningRate):
  gradient = get_gradient(x_train, y_train, w) # Gets the gradient
  # The loss is not needed for the update, so it is not evaluated here;
  # call get_loss(x_train, y_train, w) separately to monitor training.
  return get_gradient_step(w, gradient, learningRate) # Updates the weights

In [None]:
from ast import Num # This was imported automatically when I ran the function
from pandas.errors import EmptyDataError # This is necessary for when the dataset reaches the end
batch=1000 # Number of rows read from the train file per gradient step
epochs=1 # Only 1 epoch, not even fully finished through the dataset
LR = 0.001 # Learning rate of 0.001

# Initialize random weights, one per feature column that remains after remLabels is dropped
w = [random.random() for _ in range(13)]

# Skip factor to iterate through the dataset without reading the same batch
# (starts at 1 so that the first line of the file is skipped)
skip=1

# Columns that are not used as features
remLabels = ['hour','id', 'click', 'C14','C15','C16','C17','C18','C19','C20','C21'] # labels to remove

# I was only able to run 300 batches of the dataset because it was taking extremely long to run through
# It took 20 minutes to run through them so it would have taken 2666 minutes to run through it all, which is nearly 2 days.

for batch_number in range(300):
  try: # Try because if the data runs through to the end it will throw an EmptyDataError
    data = pd.read_csv("train", header=None, skiprows=skip, nrows=batch, names=labels) # Read the data
  except EmptyDataError:
    print("end of file at step:", skip)
    break
  if data.empty: # nothing left to train on
    print("end of file at step:", skip)
    break
  skip+=batch # Update skip factor by batch size so next batch reads from where last batch ended

  # Convert the string variables to summed ascii values.
  # The whole column is replaced in one assignment; writing element by element through
  # data[col][row] is chained assignment, which pandas warns about and may apply to a copy.
  for column in labels:
    if isinstance(data[column].iloc[0], str): # only convert columns that contain strings
      data[column] = data[column].map(lambda value: sum(ord(ch) for ch in str(value)))

  xs = data.drop(columns = remLabels) # train set initialized to the clean dataset
  ys = data['click'] # y data of clicks

  # change the columns from the label names to indices so rows can be indexed like the weight list
  xs.columns = list(range(xs.shape[1]))

  w = gd(xs, ys, w, LR) # perform gd to update the weights with this batch


In [None]:
# This is the final set of weights after 300 batches

[-5.463121693560493,
 0.7265327965373043,
 1.1621109782153227,
 -9.472196154830863,
 0.18800261171659854,
 -2.808695100078121,
 -2.8482561075307737,
 -1.7014204376688737,
 -3.253334030862665,
 -0.6900279764727759,
 -3.938816242920015,
 0.2549966904925998,
 0.31037848847670757]

In [None]:
skip = 1 # starts at 1 so that the first line of the file is skipped
batch = 1000
predictions = [] # holds all the yhats from every batch

# NOTE(review): this assumes the test file has the same columns as the train file minus 'click'
# (there are no labels to predict from) - confirm against the file's first line.
# Passing all 24 train names to a file with 23 columns would pair every value after 'id'
# with the wrong name, so the features would no longer line up with the trained weights.
testLabels = [label for label in labels if label != 'click']
remLabels = ['hour','id','C14','C15','C16','C17','C18','C19','C20','C21'] # labels to remove

for i in range(30): # Only 30 batches of the test set because of time constraint, also the same ratio of data from training and testing that was provided
  try: # reading past the end of the file raises EmptyDataError
    data = pd.read_csv("test", header=None, skiprows=skip, nrows=batch, names=testLabels)
  except EmptyDataError:
    print("end of file at step:", skip)
    break
  if data.empty: # nothing left to predict
    break
  skip+=batch

  # Same preprocessing as training: string columns become summed ascii values.
  # The whole column is replaced in one assignment to avoid chained assignment on a copy.
  for column in testLabels:
    if isinstance(data[column].iloc[0], str):
      data[column] = data[column].map(lambda value: sum(ord(ch) for ch in str(value)))

  test = data.drop(columns = remLabels)
  test.columns = list(range(test.shape[1]))

  # put model with test data and trained weights through sigmoid to get the predictions;
  # len(test) rather than batch, because the last batch of the file can be shorter
  yhats = [logistic(np.dot(np.transpose(w), test.iloc[a])) for a in range(len(test))]
  predictions.extend(yhats)

  print(yhats) # prints out the predictions of the current batch






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 