In [None]:
from multiprocessing import Process, Manager
import pandas as pd
from scipy.stats import logistic
import numpy as np
import time
import argparse
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Start the run timer.
# time.clock() was deprecated in Python 3.3 and removed in 3.8, so calling it
# unconditionally crashes on current interpreters. Fall back to
# time.perf_counter() when it is missing. The legacy clock is still preferred
# where it exists so that every timing read in this script uses the same
# clock on a given interpreter.
start_time = (getattr(time, "clock", None) or time.perf_counter)()

#Initialize argument parser
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--filename", help="input file name")
parser.add_argument("-l", "--learning_rate", type=float, help="learning rate")
parser.add_argument("-i", "--iterations", type=int, help="number of iteration")
args = parser.parse_args()

# A file name is mandatory; an empty string counts as missing.
# NOTE(review): the CSV path used for loading further down is hard-coded, so
# this value is validated but not otherwise consumed -- confirm the intent.
if not args.filename:
    print('Please specify a file name')
    print('Execute "python logistic.py -h" for help')
    # Raise SystemExit directly: the exit() helper is injected by the `site`
    # module (not guaranteed to exist) and reported success (status 0) even
    # though this is an error path.
    raise SystemExit(1)
filename = args.filename

# Compare against None rather than relying on truthiness, so an explicit
# "-l 0" is honoured instead of being silently replaced by the default.
if args.learning_rate is not None:
    learning_rate = args.learning_rate
else:
    learning_rate = 1
    print('Using default learning rate', learning_rate)

# Same reasoning as above: "-i 0" legitimately means "do not train".
if args.iterations is not None:
    iterations = args.iterations
else:
    iterations = 50
    print('Using default iterations', iterations)

def compute_sum(parameters, result):
    """Compute the gradient-descent step for a subset of the parameters.

    Reads the module-level ``data_array``, ``theta``, ``learning_rate`` and
    ``number_of_examples``. Each row of ``data_array`` is treated as
    ``[x_0, ..., x_k, label]``: everything but the last column is fed to
    ``hypothesis`` and the last column is the target.

    Args:
        parameters: iterable of parameter (column) indices this call is
            responsible for. May be empty.
        result: mutable mapping; ``result[index]`` is set to the step that
            the caller subtracts from ``theta[index]``.
    """
    parameters = list(parameters)
    if not parameters:
        # Nothing assigned to this worker; skip the pass over the data.
        return

    # The prediction error of a sample does not depend on which parameter is
    # being updated, so compute it once for all rows instead of once per
    # (parameter, sample) pair.
    features = data_array[:, :-1]
    labels = data_array[:, -1]
    errors = hypothesis(features.T, theta) - labels

    for parameter in parameters:
        summition = np.dot(errors, data_array[:, parameter])
        if parameter != 0:
            # Index 0 (the bias column) is excluded from this extra term.
            # NOTE(review): if this is meant as L2 regularisation, the term
            # would normally be *added* to the gradient; subtracting it
            # pushes weights away from zero. Left unchanged -- confirm.
            summition -= theta[parameter]/number_of_examples
        result[parameter] = learning_rate * summition / number_of_examples


def sigmoid(value):
    """Return the logistic sigmoid of ``value``.

    Evaluated as the CDF of SciPy's standard logistic distribution, so
    ``value`` may be a scalar or an array-like (applied element-wise).
    """
    squashed = logistic.cdf(value)
    return squashed

def hypothesis(X, theta):
    """Return the model output: the sigmoid of the dot product of theta and X.

    Args:
        X: feature values, passed as the second operand of ``np.dot``.
        theta: parameter vector, passed as the first operand of ``np.dot``.
    """
    return sigmoid(np.dot(theta, X))

def predict_proba(X,theta):
    """Return the sigmoid of ``theta[0] + inner(theta[1:], X)``.

    Unlike ``hypothesis``, the intercept ``theta[0]`` is added explicitly,
    so ``X`` is combined only with the remaining entries ``theta[1:]``.
    """
    intercept = theta[0]
    weights = theta[1:]
    linear_score = np.inner(weights, X) + intercept
    return sigmoid(linear_score)

# Read the training data from the CSV file.
# NOTE(review): the path is hard-coded; the `filename` taken from the command
# line is not used here -- confirm which of the two is intended.
data_csv = pd.read_csv("/content/titanic_train.csv")

# Number of training examples (rows).
number_of_examples = data_csv.shape[0]

# Number of columns in the file, measured BEFORE the bias column is inserted.
# theta gets this many entries; compute_sum feeds every column except the last
# one to the model, so after the insert below the lengths line up.
number_of_parameters = data_csv.shape[1]

# Prepend a column of ones so the intercept is handled like any other weight.
ones = np.ones(number_of_examples)
data_csv.insert(0, 'theta0', ones)

# Start from an all-zero parameter vector.
theta = np.zeros(number_of_parameters)

# Plain ndarray view of the data for the gradient workers.
# compute_sum treats the LAST column as the label.
# NOTE(review): assumes 'Survived' is the last column of the CSV -- confirm.
data_array = data_csv.to_numpy()

# Split the parameter indices into (at most) four work chunks.
parameters_split = np.array_split(list(range(len(theta))), 4)
print(parameters_split)
manager = Manager()
results = manager.dict()

# NOTE(review): compute_sum reads theta/data_array as module globals, so the
# workers only see the current theta when the process start method copies the
# parent's state (fork). There is no `if __name__ == "__main__":` guard, so
# under the spawn start method each worker would re-import and re-run this
# whole script -- confirm the target platform.
for i in range(iterations):
    # array_split pads with empty chunks when there are fewer than four
    # parameters; do not pay for a process that has nothing to do.
    processes = [
        Process(target=compute_sum, args=(chunk, results))
        for chunk in parameters_split
        if len(chunk) > 0
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # A worker that died leaves a stale (or missing) entry in `results`;
    # fail loudly instead of silently applying an outdated gradient.
    failed_codes = [p.exitcode for p in processes if p.exitcode != 0]
    if failed_codes:
        raise RuntimeError(
            'gradient worker(s) failed with exit code(s) ' + str(failed_codes)
        )

    theta_diff = np.array([results[j] for j in range(len(theta))])
    theta = theta - theta_diff

for index, coefficient in enumerate(theta):
    print('coefficient for theta ' + str(index), coefficient)



# Baseline: fit scikit-learn's LogisticRegression on a 70/30 split.
# NOTE(review): the report below scores this scikit-learn model, not the theta
# trained above, and data_csv still contains the inserted 'theta0' column.
X_train, X_test, y_train, y_test = train_test_split(data_csv.drop('Survived',axis=1), 
                                                    data_csv['Survived'], test_size=0.30, 
                                                    random_state=101)
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print(classification_report(y_test,predictions))

# time.clock() was removed in Python 3.8; use it only where it still exists
# and fall back to time.perf_counter() otherwise, so the elapsed time is
# measured with the same clock that produced start_time.
print((getattr(time, "clock", None) or time.perf_counter)() - start_time, "seconds")


Using default learning rate 1
Using default iterations 50
[array([0]), array([1]), array([2]), array([], dtype=int64)]


  if sys.path[0] == '':


coefficient for theta 0 -7.075944248742161
coefficient for theta 1 4.5710279013783115
coefficient for theta 2 -21.757066472203274
              precision    recall  f1-score   support

           0       0.66      0.85      0.74       154
           1       0.67      0.41      0.51       114

    accuracy                           0.66       268
   macro avg       0.67      0.63      0.63       268
weighted avg       0.67      0.66      0.65       268

1.6605899999999991 seconds


