### Importing the Libraries

In [1]:
import numpy as np  # For creating nd arrays
import pandas as pd # For creating dataframes
import math         # For using various mathematical functions
from sklearn.model_selection import train_test_split # For random splitting of data
from sklearn.metrics import confusion_matrix # For checking the true predictions and false predictions
from sklearn import datasets # Checking the class on the breast cancer dataset
from sklearn import linear_model # For using inbuilt Logistic Regression

### Opening the Dataset

In [2]:
train_set = pd.read_csv('titanic_train.csv', skipinitialspace = True, encoding = 'utf-8')
train_set

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


In [3]:
test_set = pd.read_csv('titanic_test.csv', skipinitialspace = True, encoding = 'utf-8')
test_set

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.7500,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0000,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S
...,...,...,...,...,...,...,...,...,...,...
218,3,"Lindqvist, Mr. Eino William",male,20.0,1,0,STON/O 2. 3101285,7.9250,,S
219,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.5500,B38,S
220,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9000,C65,C
221,3,"Holm, Mr. John Fredrik Alexander",male,43.0,0,0,C 7075,6.4500,,S


### Cleaning the Data as per requirement

In [4]:
# Function changing the values of the columns 
def changeSex(sex):
    if sex == 'male':
        return 0
    else:
        return 1
    
def changeEmbarked(embarked):
    if embarked == 'S':
        return 0
    elif embarked == 'Q':
        return 1
    else:
        return 2

## Cleaning the Training Set
train_set.Age.fillna(0,inplace = True)
train_set.Pclass.fillna(train_set.Pclass.value_counts().index[0],inplace = True)
age = train_set.Age.values
pclass = train_set.Pclass.values
for index in range(len(age)):
    if age[index] == 0:
        age[index] = math.ceil(train_set.Age[train_set.Pclass == pclass[index]].mean())
train_set['Age'] = np.array(age, dtype = int)
train_set.Sex = train_set.Sex.apply(changeSex)
train_set.Embarked = train_set.Embarked.apply(changeEmbarked)
del(train_set['Name'])
del(train_set['Ticket'])
del(train_set['Fare'])
del(train_set['Cabin'])

## Cleaning the Test Set
test_set.Age.fillna(0,inplace = True)
test_set.Pclass.fillna(test_set.Pclass.value_counts().index[0],inplace = True)
age = test_set.Age.values
pclass = test_set.Pclass.values
for index in range(len(age)):
    if age[index] == 0:
        age[index] = math.ceil(test_set.Age[test_set.Pclass == pclass[index]].mean())
test_set['Age'] = np.array(age, dtype = int)
test_set.Sex = test_set.Sex.apply(changeSex)
test_set.Embarked = test_set.Embarked.apply(changeEmbarked)
del(test_set['Name'])
del(test_set['Ticket'])
del(test_set['Fare'])
del(test_set['Cabin'])
survived = train_set['Survived']
del(train_set['Survived'])
# Adding 1 to end of the set
train_set['constant'] = 1
train_set['Survived'] = survived
test_set['constant'] = 1

In [5]:
train_set

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,constant,Survived
0,2,1,29,1,0,0,1,1
1,3,0,19,0,0,0,1,0
2,2,0,39,0,0,0,1,0
3,3,1,29,0,4,0,1,0
4,3,0,25,0,0,0,1,0
...,...,...,...,...,...,...,...,...
663,2,1,17,0,0,0,1,1
664,3,0,25,0,0,1,1,0
665,3,0,32,0,0,0,1,1
666,3,1,22,0,0,0,1,0


In [6]:
test_set

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,constant
0,2,0,8,1,1,0,1
1,1,1,49,0,0,0,1
2,3,0,17,0,0,1,1
3,2,1,24,2,1,0,1
4,1,0,36,0,0,0,1
...,...,...,...,...,...,...,...
218,3,0,20,1,0,0,1
219,1,0,45,0,0,0,1
220,1,1,17,1,0,2,1
221,3,0,43,0,0,0,1


### Creating class for Logistic Regression

In [7]:
class LogisticRegression:
    # Initialising all the variables of the class in the constructor
    def __init__(self):
        print('Algo Initiated')
        self.coeff = None
        self.intercept = None
        self.__constants = None
        self.__m = None
        self.__n = None

    # Sigmoid function for predicting the values
    def __sigmoidFunction(self,x):
        temp = -1*((self.__constants*x).sum())
        exp = math.exp(temp)
        y = 1/(1+exp)
        return y
    
    # Finding the cost for different values of m
    def __cost(self,x,y):
        costArr = np.zeros(self.__m)
        for index in range(self.__m):
            sigmoidValue = self.__sigmoidFunction(x[index])
            costArr[index] = -((y[index]*math.log(sigmoidValue))+((1-y[index])*(math.log(1-sigmoidValue))))
        return costArr.mean()
    
    # Finding the derivative of cost with the respect to each feature in the dataset
    # and subtracting slope with constants
    def __step_gradient_descent(self,x,y,alpha):
        derivatives = np.zeros(self.__n)
        for col in range(self.__n):
            for row in range(self.__m):
                derivatives[col] += (-1/self.__m)*(x[row,col]*(y[row]-self.__sigmoidFunction(x[row])))
        self.__constants = self.__constants-alpha*derivatives
    
    # Gradient Descent for Logistic Regression
    def __gradient_descent(self,x,y,alpha,iterations):
        for count in range(iterations):
            self.__step_gradient_descent(x,y,alpha)
    
    # Fit function for fitting the data into the algorithm
    def fit(self,x,y,alpha,iterations):
        self.__m = x.shape[0]
        self.__n = x.shape[1]
        self.__constants = np.zeros(self.__n)
        self.__gradient_descent(x,y,alpha,iterations)
        self.coeff = self.__constants[:-1]
        self.intercept = self.__constants[-1]
    
    # Predict function to predict the values
    def predict(self,x):
        y_pred = np.zeros(x.shape[0])
        for index in range(x.shape[0]):
            sigmoidValue = self.__sigmoidFunction(x[index])
            if sigmoidValue < 0.5:
                y_pred[index] = 0
            else:
                y_pred[index] = 1
        return y_pred
    
    # Score function to find accuracy in the predicted values
    def score(self,x,y):
        count = 0
        y_pred = self.predict(x)
        for index in range(x.shape[0]):
            if y[index] == y_pred[index]:
                count += 1
        return count/x.shape[0]

### Testing the Algorithm

In [8]:
algo = LogisticRegression()
x = train_set.iloc[:,:-1].values
y = train_set.iloc[:,-1].values
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 1)
algo.fit(x_train,y_train,0.008,2000)

Algo Initiated


In [9]:
algo.score(x_train,y_train)

0.780439121756487

In [10]:
algo.score(x_test,y_test)

0.8562874251497006

In [11]:
y_train_pred = algo.predict(x_train)
y_test_pred = algo.predict(x_test)

In [12]:
print(confusion_matrix(y_train,y_train_pred))

[[268  29]
 [ 81 123]]


In [13]:
print(confusion_matrix(y_test,y_test_pred))

[[94  8]
 [16 49]]


### Comparing with Inbuilt Algorithm

In [14]:
inbuild_algo = linear_model.LogisticRegression()
inbuild_algo.fit(x_train,y_train)

LogisticRegression()

In [15]:
inbuild_algo.score(x_train,y_train)

0.7924151696606786

In [16]:
inbuild_algo.score(x_test,y_test)

0.8263473053892215

In [17]:
y_train_inbuild_predict = inbuild_algo.predict(x_train)
print(confusion_matrix(y_train,y_train_inbuild_predict))

[[260  37]
 [ 67 137]]


In [18]:
y_test_inbuild_predict = inbuild_algo.predict(x_test)
print(confusion_matrix(y_test,y_test_inbuild_predict))

[[85 17]
 [12 53]]


### Predicting for test set

In [19]:
algo1 = LogisticRegression()
algo1.fit(x,y,0.008,2000)

Algo Initiated


In [20]:
algo1.score(x,y)

0.7874251497005988

In [21]:
y_pred = algo1.predict(test_set.values)
np.savetxt('ans.csv',y_pred,delimiter = ',')