In [70]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


**Load the files and display head**

In [71]:
train= pd.read_csv('/kaggle/input/titanic/train.csv')
test= pd.read_csv('/kaggle/input/titanic/test.csv')

print("Train DataFrame:")
print(train.head())

print("\nTest DataFrame:")
print(test.head())

Train DataFrame:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   

**Feature Extraction**

In [72]:
def extract_features(df):
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 
                                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    return df

train=extract_features(train)
test=extract_features(test)

**Finding Missing Values**

In [73]:
missing_val=train.isnull().sum()
print("Missing Values in Training Data:")
print(missing_val[missing_val>0]) # prints count of missing valies in columns 
missing_val=test.isnull().sum()
print("\nMissing Values in Test Data:")
print(missing_val[missing_val>0]) 

Missing Values in Training Data:
Age         177
Cabin       687
Embarked      2
dtype: int64

Missing Values in Test Data:
Age       86
Fare       1
Cabin    327
dtype: int64


**Handle Missing values**

In [74]:
#for training set
train['Age'].fillna(train['Age'].median(),inplace=True)
train['Embarked'].fillna(train['Embarked'].mode()[0],inplace=True)
train.drop(columns=['Cabin'],inplace=True)
# for test set 
test['Age'].fillna(test['Age'].median(),inplace=True)
test['Fare'].fillna(test['Fare'].median(),inplace=True)
test.drop(columns=['Cabin'],inplace=True)

print(train.isnull().sum())
print(test.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
FamilySize     0
IsAlone        0
Title          0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
FamilySize     0
IsAlone        0
Title          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Embarked'].fillna(train['Embarked'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

**One Hot Encoding**

In [75]:
train= pd.get_dummies(train, columns=['Sex','Embarked'],drop_first=True)
test= pd.get_dummies(test, columns=['Sex','Embarked'],drop_first=True)


**Align train and test sets**

In [76]:
test=test.reindex(columns=train.columns,fill_value=0)
test.drop(columns=['Survived'],inplace=True)

train.drop(columns=['Name','Ticket'],inplace=True)
test.drop(columns=['Name','Ticket'],inplace=True)

In [77]:
train['Sex_male']=train['Sex_male'].astype(int)
train['Embarked_S']=train['Embarked_S'].astype(int)
train['Embarked_Q']=train['Embarked_Q'].astype(int)


test['Sex_male']=test['Sex_male'].astype(int)
test['Embarked_S']=test['Embarked_S'].astype(int)
test['Embarked_Q']=test['Embarked_Q'].astype(int)
print(train.head())
print(test.head())


   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  FamilySize  \
0            1         0       3  22.0      1      0   7.2500           2   
1            2         1       1  38.0      1      0  71.2833           2   
2            3         1       3  26.0      0      0   7.9250           1   
3            4         1       1  35.0      1      0  53.1000           2   
4            5         0       3  35.0      0      0   8.0500           1   

   IsAlone Title  Sex_male  Embarked_Q  Embarked_S  
0        0    Mr         1           0           1  
1        0   Mrs         0           0           0  
2        1  Miss         0           0           1  
3        0   Mrs         0           0           1  
4        1    Mr         1           0           1  
   PassengerId  Pclass   Age  SibSp  Parch     Fare  FamilySize  IsAlone  \
0          892       3  34.5      0      0   7.8292           1        1   
1          893       3  47.0      1      0   7.0000           2    

In [78]:
print(train.dtypes)
print(test.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
FamilySize       int64
IsAlone          int64
Title           object
Sex_male         int64
Embarked_Q       int64
Embarked_S       int64
dtype: object
PassengerId      int64
Pclass           int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
FamilySize       int64
IsAlone          int64
Title           object
Sex_male         int64
Embarked_Q       int64
Embarked_S       int64
dtype: object


In [84]:
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


X=train.drop(columns=['Survived'])
y=train['Survived']

X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42)

In [88]:
print(X_train.columns)

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
       'IsAlone', 'Title', 'Sex_male', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


**preprocessing and pipeline**

In [89]:
numeric_features=['Age','Fare','FamilySize']
numeric_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median'))
])

categorical_features = ['Sex_male', 'Embarked_Q', 'Embarked_S', 'Title']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
         ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',RandomForestClassifier(random_state=42))
])

**Model Building and Training**

In [90]:
# hyperparameter tuning
param_grid={
     'classifier__n_estimators': [100, 200, 300],
     'classifier__max_depth': [None, 10, 20, 30],
     'classifier__min_samples_split': [2, 5, 10],
     'classifier__min_samples_leaf': [1, 2, 4]
}

grid_search= GridSearchCV(pipeline,param_grid,cv=5,scoring='accuracy')
grid_search.fit(X_train,y_train)

**validation and prediction**

In [91]:
model=grid_search.best_estimator_
y_pred=model.predict(X_val)
print(f'Validation Accuracy: {accuracy_score(y_val,y_pred)}')

#cross validation
cv_scores=cross_val_score(model,X,y,cv=5,scoring='accuracy')
print(f'Cross Validation Accuracy: {cv_scores.mean()}')
test['Survived']=model.predict(test)

Validation Accuracy: 0.8268156424581006
Cross Validation Accuracy: 0.821542903772519


**submission File**

In [92]:
submission=test[['PassengerId','Survived']]
submission.to_csv('/kaggle/working/submission.csv',index=False)
print(submission.head())

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
