In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [2]:
from math import log, exp
class MyLogisticRegression:
    def __init__(self, learning_rate=0, num_of_iterations=0, reg_factor=0, penalty='l2'):
        self.learning_rate = learning_rate
        self.num_of_iterations = num_of_iterations
        self.min_cost=0
        self.M=0
        self.n=0
        self.coef_= np.zeros(self.n)
        self.intercept_= 0
        self.reg_factor= reg_factor
        self.penalty=penalty
    
    def __hypothesis(self, X, m):
        return 1/(1+exp(-1.0*(m.transpose().dot(X))))
    
    def __cost(self, X, Y, m):
        total_cost=0  #Total cost
        for i in range(self.M):
            x=X[i,:]
            y=Y[i]
            hxi= self.__hypothesis(x,m)
            if(hxi==0 and y==1):
                total_cost+= 10**9
                continue
            elif hxi==1 and y==0:
                total_cost+=10**9
                continue
            elif hxi==0 and y==0:
                continue
            elif hxi==1 and y==1:
                continue
            total_cost+=  (-y*log(hxi))-(1-y)*log(1-hxi)    #Total Cost
        for i in range(len(m)):
            if self.penalty=="l2":
                total_cost+= self.reg_factor*(m[i]**2)
            elif self.penalty=='l1':
                total_cost+= self.reg_factor*(abs(m[i]))
        return total_cost
    
    def __step_gradient(self, X, Y, m):
        slope=np.zeros(X.shape[1])   #slope of tangent  
        for i in range(self.M):
            x=X[i,:]
            y=Y[i]
            slope+=(-1/self.M)*(y-self.__hypothesis(x, m))*x   #slope of tangent
        new_m=m-self.learning_rate*slope
        return new_m
    
    def __gradient_descent(self, points):
        try:
            X=points[0]            #Input vector
            self.M=X.shape[0]      #Total number of datapoints in X
            self.n=X.shape[1]      #Total number of features in X
            Y=points[1]       #Output
            X_n=np.append(X,np.ones(self.M).reshape(-1,1),axis=1)     #n+1 column is 1
            m=np.zeros(self.n+1)   #here m is slope of best fit line
            for i in range(self.num_of_iterations):
                m = self.__step_gradient(X_n, Y, m)
                if i%10==0:
                    print(f"Cost after {i}th iterations is: {self.__cost(X_n, Y, m)}.")
            self.min_cost=self.__cost(X_n, Y, m)
            return m[:-1], m[-1]
        except Exception as e:
            print(e)
            return np.zeros(self.n),0
    
    def fit(self, X, Y):
        self.coef_, self.intercept_ = self.__gradient_descent((X,Y))
        
    def predict(self, X_test):
        val= np.zeros(X_test.shape[0])
        add= X_test*self.coef_
        for j in range(X_test.shape[0]):
            for i in range(X_test.shape[1]):
                val[j]+=add[j][i]
        y_pred= val + self.intercept_
        for i in range(len(y_pred)):
            if y_pred[i]>0.5:
                y_pred[i]= 1
            else:
                y_pred[i]=0
        return y_pred
    
    def score(self, Y_truth, Y_pred):
        from collections import Counter
        dict_Y_truth= dict(Counter(Y_truth))
        dict_Y_pred= dict(Counter(Y_pred))
        accurate=0
        for i in dict_Y_truth:
            if i in dict_Y_pred:
                accurate+=min(dict_Y_truth[i],dict_Y_pred[i])
        coeff= accurate/self.M
        return coeff

In [3]:
titanic_train=pd.read_csv("training_titanic_x_y_train.csv")
titanic_test=pd.read_csv("test_titanic_x_test.csv")

In [4]:
df_train=titanic_train.copy()
df_test=titanic_test.copy()

In [5]:
df_test.head(5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


In [6]:
df_test.describe(include="all")

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,223.0,223,223,178.0,223.0,223.0,223,223.0,50,222
unique,,223,2,,,,212,,49,3
top,,"de Pelsmaeker, Mr. Alfons",male,,,,CA 2144,,B58 B60,S
freq,,1,150,,,,3,,2,160
mean,2.345291,,,29.694775,0.506726,0.304933,,32.622551,,
std,0.850047,,,15.398053,1.1697,0.634108,,61.062047,,
min,1.0,,,0.42,0.0,0.0,,0.0,,
25%,2.0,,,19.25,0.0,0.0,,7.8792,,
50%,3.0,,,27.0,0.0,0.0,,12.475,,
75%,3.0,,,37.75,1.0,0.0,,30.0354,,


In [7]:
df_train.drop(["Name","Cabin","Ticket"],axis=1,inplace=True)
df_test.drop(["Name","Cabin","Ticket"],axis=1,inplace=True)

In [8]:
df_train.describe(include='all')

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
count,668.0,668,536.0,668.0,668.0,668.0,667,668.0
unique,,2,,,,,3,
top,,male,,,,,S,
freq,,427,,,,,484,
mean,2.296407,,29.70056,0.528443,0.407186,32.064552,,0.402695
std,0.831638,,14.240257,1.080327,0.854695,45.320835,,0.490808
min,1.0,,0.67,0.0,0.0,0.0,,0.0
25%,2.0,,21.0,0.0,0.0,7.925,,0.0
50%,3.0,,29.0,0.0,0.0,14.75,,0.0
75%,3.0,,38.25,1.0,0.0,31.275,,1.0


In [9]:
def f(g):
    if g=="male":
        return 0
    else:
        return 1
def f1(g):
    if(g=="S"):
        return 0
    if(g=="C"):
        return 1
    if(g=="Q"):
        return 2

In [10]:
df_train.Age.fillna(df_train.Age.mean(),inplace=True)
df_train["Sex"]=df_train.Sex.apply(f)
df_train.Embarked.fillna("Q",inplace=True)
df_train["Embarked"] = df_train.Embarked.apply(f1)

df_test.Age.fillna(df_test.Age.mean(),inplace=True)
df_test["Sex"]=df_test.Sex.apply(f)
df_test.Embarked.fillna("Q",inplace=True)
df_test["Embarked"] = df_test.Embarked.apply(f1)

In [11]:
df_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,1,29.0,1,0,26.0,0,1
1,3,0,29.70056,0,0,8.05,0,0
2,2,0,39.0,0,0,26.0,0,0
3,3,1,29.0,0,4,21.075,0,0
4,3,0,25.0,0,0,7.05,0,0


In [12]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,0,8.0,1,1,36.75,0
1,1,1,49.0,0,0,25.9292,0
2,3,0,29.694775,0,0,7.7375,2
3,2,1,24.0,2,1,27.0,0
4,1,0,36.0,0,0,26.2875,0


In [13]:
df_train.describe(include="all")

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
count,668.0,668.0,668.0,668.0,668.0,668.0,668.0,668.0
mean,2.296407,0.360778,29.70056,0.528443,0.407186,32.064552,0.351796,0.402695
std,0.831638,0.480586,12.753571,1.080327,0.854695,45.320835,0.617496,0.490808
min,1.0,0.0,0.67,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,23.0,0.0,0.0,7.925,0.0,0.0
50%,3.0,0.0,29.70056,0.0,0.0,14.75,0.0,0.0
75%,3.0,1.0,35.0,1.0,0.0,31.275,1.0,1.0
max,3.0,1.0,80.0,8.0,6.0,512.3292,2.0,1.0


In [14]:
X= df_train.drop("Survived", axis=1)
X_train= X.values
Y_train = df_train.Survived
X_test = df_test.values

# Logistic Regression

In [15]:
learning_rate=0.001
num_iterations=2000
clf1= MyLogisticRegression(learning_rate, num_iterations)

In [16]:
clf1.fit(X_train, Y_train)

Cost after 0th iterations is: 452.57514474602675.
Cost after 10th iterations is: 426.1146851096128.
Cost after 20th iterations is: 424.0567408027569.
Cost after 30th iterations is: 423.67198458817325.
Cost after 40th iterations is: 423.4614278155816.
Cost after 50th iterations is: 423.272159047288.
Cost after 60th iterations is: 423.08638336315244.
Cost after 70th iterations is: 422.9019308187987.
Cost after 80th iterations is: 422.7185254286828.
Cost after 90th iterations is: 422.5361245352978.
Cost after 100th iterations is: 422.35471443963894.
Cost after 110th iterations is: 422.1742851078912.
Cost after 120th iterations is: 421.99482703761856.
Cost after 130th iterations is: 421.816330869898.
Cost after 140th iterations is: 421.63878734123574.
Cost after 150th iterations is: 421.46218727729575.
Cost after 160th iterations is: 421.28652159169405.
Cost after 170th iterations is: 421.1117812853722.
Cost after 180th iterations is: 420.9379574460206.
Cost after 190th iterations is: 420.

Cost after 1590th iterations is: 402.55082032812015.
Cost after 1600th iterations is: 402.4487927466657.
Cost after 1610th iterations is: 402.3470228697772.
Cost after 1620th iterations is: 402.2455087821271.
Cost after 1630th iterations is: 402.1442485890632.
Cost after 1640th iterations is: 402.04324041637744.
Cost after 1650th iterations is: 401.94248241006983.
Cost after 1660th iterations is: 401.84197273612654.
Cost after 1670th iterations is: 401.7417095802886.
Cost after 1680th iterations is: 401.6416911478328.
Cost after 1690th iterations is: 401.5419156633451.
Cost after 1700th iterations is: 401.4423813705074.
Cost after 1710th iterations is: 401.34308653188015.
Cost after 1720th iterations is: 401.24402942868414.
Cost after 1730th iterations is: 401.14520836059535.
Cost after 1740th iterations is: 401.0466216455304.
Cost after 1750th iterations is: 400.94826761944296.
Cost after 1760th iterations is: 400.8501446361155.
Cost after 1770th iterations is: 400.75225106696286.
Cos

In [38]:
clf1.coef_, clf1.intercept_

(array([-0.14911568,  0.20696401, -0.0183334 , -0.08692162, -0.00270161,
         0.01333052,  0.05179308]), -0.0005973472402324196)

In [36]:
y_test_pred= clf1.predict(X_test)
y_train_pred= clf1.predict(X_train)

In [37]:
print("Train Score: ", score(Y_train, y_train_pred))

Train Score:  0.6811377245508982


# Sklearn Logistic Regression

In [20]:
clf2 = LogisticRegression(C=1.0)

In [21]:
clf2.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
clf2.coef_

array([[-0.95095323,  2.53688304, -0.03343033, -0.2865677 , -0.13233378,
         0.00272943,  0.24559222]])

In [23]:
clf2.intercept_

array([1.77884482])

In [24]:
y_test_pred_skl= clf2.predict(X_test)
y_train_pred_skl= clf2.predict(X_train)

In [25]:
print("Train Score: ", clf2.score(X_train, Y_train))

Train Score:  0.7919161676646707


In [40]:
#Number of correct predictions with respect to sklearn logistic regression
np.sum(y_test_pred==y_test_pred_skl)/len(y_test_pred)

0.7174887892376681

# Saving Predictions to CSV file

In [26]:
np.savetxt("prediction_titanic.csv",y_train_pred_skl,delimiter=",",fmt="%.5f")