## Importing libraries

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

## Loading and reading data

In [34]:
# Read the three CSV files from the current working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
gender_data = pd.read_csv('gender_submission.csv')

# Preview the first two rows of the train and test frames, each under its heading.
print("Train Dataset:", train_data.head(2), sep="\n")
print("\nTest Dataset:", test_data.head(2), sep="\n")

Train Dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   

   Parch     Ticket     Fare Cabin Embarked  
0      0  A/5 21171   7.2500   NaN        S  
1      0   PC 17599  71.2833   C85        C  

Test Dataset:
   PassengerId  Pclass                              Name     Sex   Age  SibSp  \
0          892       3                  Kelly, Mr. James    male  34.5      0   
1          893       3  Wilkes, Mrs. James (Ellen Needs)  female  47.0      1   

   Parch  Ticket    Fare Cabin Embarked  
0      0  330911  7.8292   NaN        Q  
1      0  363272  7.0000   NaN        S  


## Preprocessing and train/validation split

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

def preprocess_data(data, is_train=True):
    """Return a cleaned, model-ready copy of a passenger DataFrame.

    Steps, in order:

    1. Fill missing ``Age`` values with the column median.
    2. Fill missing ``Embarked`` values with the most frequent value.
    3. When ``is_train`` is False and a ``Fare`` column exists, fill missing
       ``Fare`` values with the column median.
    4. Replace ``Sex`` and ``Embarked`` with integer codes assigned in sorted
       order of the distinct values present (first value -> 0, and so on).
    5. Drop the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns.

    NOTE: every fill value and every integer code is derived from the frame
    passed in. Calling this separately on two frames can therefore yield
    different fill values, and different codes if the frames do not contain
    the same set of categories.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``Age``, ``Sex``, ``Embarked``,
        ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``. It is not
        modified.
    is_train : bool, default True
        When False, missing ``Fare`` values are filled as well.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Work on a copy so the caller's frame stays intact and re-running the
    # notebook cell gives the same result every time.
    cleaned = data.copy()

    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].median())

    # mode() returns its values sorted, so ties resolve to the smallest value;
    # it is empty only when the whole column is missing, in which case there
    # is nothing sensible to fill with.
    embarked_modes = cleaned['Embarked'].mode()
    if not embarked_modes.empty:
        cleaned['Embarked'] = cleaned['Embarked'].fillna(embarked_modes.iloc[0])

    if not is_train and 'Fare' in cleaned.columns:
        cleaned['Fare'] = cleaned['Fare'].fillna(cleaned['Fare'].median())

    # Categorical codes follow the sorted category order; a value that is
    # still missing at this point would be encoded as -1.
    for col in ('Sex', 'Embarked'):
        cleaned[col] = cleaned[col].astype('category').cat.codes.astype('int64')

    return cleaned.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
# Clean the training frame, then separate the target column from the features.
train_data_cleaned = preprocess_data(train_data)
y = train_data_cleaned['Survived']
X = train_data_cleaned.drop(columns=['Survived'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.head(), y_train.head()


(     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
 331       1    1  45.5      0      0  28.5000         2
 733       2    1  23.0      0      0  13.0000         2
 382       3    1  32.0      0      0   7.9250         2
 704       3    1  26.0      1      0   7.8542         2
 813       3    0   6.0      4      2  31.2750         2,
 331    0
 733    0
 382    0
 704    0
 813    0
 Name: Survived, dtype: int64)

## Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Fit a logistic-regression classifier on the training split
# (fit() returns the fitted estimator itself).
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Score the held-out validation split.
y_pred = log_reg.predict(X_val)
report = classification_report(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)

accuracy


0.8100558659217877

In [31]:
# `report` is a multi-line string; print it so the table renders with real
# line breaks instead of the escaped one-line repr.
print(report)

'              precision    recall  f1-score   support\n\n           0       0.83      0.86      0.84       105\n           1       0.79      0.74      0.76        74\n\n    accuracy                           0.81       179\n   macro avg       0.81      0.80      0.80       179\nweighted avg       0.81      0.81      0.81       179\n'

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 100-tree random forest on the training split
# (fit() returns the fitted estimator itself).
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out validation split.
y_pred_rf = rf_clf.predict(X_val)
report_rf = classification_report(y_val, y_pred_rf)
accuracy_rf = accuracy_score(y_val, y_pred_rf)
print(accuracy_rf)


0.8212290502793296


In [28]:
# `report_rf` is a multi-line string; print it so the table renders with real
# line breaks instead of the escaped one-line repr.
print(report_rf)

'              precision    recall  f1-score   support\n\n           0       0.83      0.88      0.85       105\n           1       0.81      0.74      0.77        74\n\n    accuracy                           0.82       179\n   macro avg       0.82      0.81      0.81       179\nweighted avg       0.82      0.82      0.82       179\n'