In [45]:
import numpy as np  
import pandas as pd 

In [46]:
# Load the training CSV (relative path); this frame is kept as the raw reference copy.
data_org= pd.read_csv('train.csv')

In [47]:
def pipeline(data):
    """Clean and numerically encode a raw passenger frame for modelling.

    Steps, in order:
      1. Drop the ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId`` columns.
      2. Encode ``Sex`` as male=1, female=0.
      3. Encode ``Embarked`` as S=1, C=2, Q=3, then fill missing entries with
         the most frequent encoded value.
      4. Fill missing ``Age`` and ``Fare`` with their respective column means.

    The input frame is NOT modified; a new frame is returned. All imputation
    statistics (mode / means) are computed from the frame that is passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Name``, ``Ticket``, ``Cabin``,
        ``PassengerId``, ``Sex``, ``Embarked``, ``Age`` and ``Fare``.
        Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns and with the encoded /
        imputed columns described above.

    Raises
    ------
    KeyError
        If one of the columns to drop or encode is missing.
    ValueError
        If ``Embarked`` has no recognised value at all (no mode to fill with).
    """
    # drop() without inplace returns a new frame, so the caller's data is untouched.
    cleaned = data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

    # map() avoids the silent-downcasting deprecation of replace(); labels that
    # are not in the mapping become NaN.
    cleaned['Sex'] = cleaned['Sex'].map({'male': 1, 'female': 0})
    cleaned['Embarked'] = cleaned['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})

    # Assign fillna() results back instead of chained inplace calls, which do
    # not update the parent frame under pandas Copy-on-Write.
    most_common_port = cleaned['Embarked'].value_counts().idxmax()
    cleaned['Embarked'] = cleaned['Embarked'].fillna(most_common_port)
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].mean())
    cleaned['Fare'] = cleaned['Fare'].fillna(cleaned['Fare'].mean())
    return cleaned

In [48]:
# Work on a deep copy so the raw frame data_org stays available unmodified.
data=data_org.copy(deep=True)

In [49]:
# Run the cleaning / encoding pipeline on the working copy and keep its result.
data=pipeline(data)

In [50]:
# Column summary of the raw frame, for comparison with the processed frame.
data_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [51]:
# Column summary after the pipeline: dropped columns are gone and no nulls remain.
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    float64
dtypes: float64(3), int64(5)
memory usage: 55.8 KB


In [52]:
from sklearn.preprocessing import MinMaxScaler

In [53]:
# Separate the target from the features, then rescale every feature column to [0, 1].
# The fitted scaler is kept so the same scaling can be reused on other data.
y = data['Survived'].values
X = data.drop(columns=['Survived']).values
scaler = MinMaxScaler().fit(X)
X_normalized = scaler.transform(X)

In [54]:
# Sanity check: (number of rows, number of feature columns) of the feature matrix.
X.shape

(891, 7)

In [55]:
class LogisticRegressionScratch:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to each gradient update.
    num_iterations : int, default 1000
        Number of gradient-descent passes over the full training set.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` is called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` is called.
    cost_history : list of float
        Cross-entropy cost recorded at every iteration of the last ``fit``.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
        self.cost_history = []

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)), element-wise.

        Evaluated as exp(-log(1 + exp(-z))) via ``np.logaddexp`` so that very
        negative ``z`` does not overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0, -z))

    def initialize_parameters(self, num_features):
        """Reset the weights to a zero vector of length ``num_features`` and the bias to 0."""
        self.weights = np.zeros(num_features)
        self.bias = 0

    def compute_cost(self, y, y_pred):
        """Return the mean binary cross-entropy between labels and probabilities.

        Probabilities are clipped away from exactly 0 and 1 so that the
        logarithms stay finite when the sigmoid saturates.
        """
        m = len(y)
        eps = 1e-15
        p = np.clip(y_pred, eps, 1 - eps)
        cost = -1/m * np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
        return cost

    def gradient_descent(self, X, y, y_pred):
        """Apply one gradient step to ``weights`` and ``bias``.

        Parameters
        ----------
        X : array of shape (m, num_features)
        y : array of shape (m,) with the true 0/1 labels
        y_pred : array of shape (m,) with the current predicted probabilities

        Returns
        -------
        tuple
            The updated ``(weights, bias)``, which are also stored on the instance.
        """
        m = len(y)
        dw = 1/m * np.dot(X.T, (y_pred - y))
        db = 1/m * np.sum(y_pred - y)
        self.weights -= self.learning_rate * dw
        self.bias -= self.learning_rate * db
        return self.weights, self.bias

    def fit(self, X, y):
        """Train on feature matrix ``X`` (m, num_features) and 0/1 labels ``y``.

        Parameters are re-initialised on every call, and the per-iteration
        cost is stored in ``cost_history``.
        """
        # Coerce to float arrays; ravel() stops an (m, 1) label column from
        # silently broadcasting against the (m,) predictions.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        _, num_features = X.shape
        self.initialize_parameters(num_features)
        self.cost_history = []

        for _ in range(self.num_iterations):
            z = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(z)

            self.cost_history.append(self.compute_cost(y, y_pred))
            self.gradient_descent(X, y, y_pred)

    def predict(self, X):
        """Return a 0/1 integer array: 1 where the predicted probability exceeds 0.5.

        ``X`` must use the same feature columns and the same scaling as the
        data passed to ``fit``.
        """
        X = np.asarray(X, dtype=float)
        y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)
        return (y_pred > 0.5).astype(int)

In [56]:
# Build the from-scratch classifier with step size 0.1 and 1000 gradient-descent iterations.
model=LogisticRegressionScratch(learning_rate=0.1, num_iterations=1000)

In [57]:
# Train on the MinMax-scaled training features and the Survived labels.
model.fit(X_normalized, y)

In [58]:
# Load the test CSV (relative path); kept as the raw copy used later for the output file.
test_data_org= pd.read_csv('test.csv')

In [59]:
# Work on a deep copy so the raw test frame stays available unmodified.
test_data=test_data_org.copy(deep=True)

In [60]:
# Apply the same cleaning / encoding pipeline that was used on the training data.
test_data=pipeline(test_data)

In [61]:
# The model was trained on MinMax-scaled features, so the test features must go
# through the same, already fitted scaler before prediction (transform only, no refit).
# Assumes test_data has the same feature columns, in the same order, as the training matrix X.
pred=model.predict(scaler.transform(test_data.values))

In [62]:
# Sanity check: one prediction per test row.
pred.shape

(418,)

In [63]:

# Build a new frame from the raw test data with the predictions attached as a
# 'Survived' column; every original test column is kept and test_data_org is untouched.
# NOTE(review): if this file is meant as a competition submission, it may need
# only PassengerId and Survived - confirm before trimming columns.
output_data = test_data_org.assign(Survived=pred)

# Write the result to output_scr.csv without the index column.
output_data.to_csv('output_scr.csv', index=False)
