<a href="https://colab.research.google.com/github/piyush182004/ML_PROJECTS/blob/main/TitanicClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only: attach Google Drive so the CSV paths used below resolve.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:
!pip install pandas numpy scikit-learn




# *Import necessary libraries*

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# ***Load the datasets***

In [None]:
# Define file paths
train_path = '/content/drive/MyDrive/titanic/train.csv'
test_path = '/content/drive/MyDrive/titanic/test.csv'
gender_submission_path = '/content/drive/MyDrive/titanic/gender_submission.csv'

# Load datasets
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
gender_submission = pd.read_csv(gender_submission_path)


def describe_dataset(label, frame):
    """Print a quick overview of a DataFrame.

    Parameters
    ----------
    label : str
        Heading printed before the overview.
    frame : pandas.DataFrame
        Frame to summarise (first rows, column info, descriptive statistics).
    """
    print(label)
    print(frame.head())
    # info() writes its report to stdout and returns None, so it must not be
    # wrapped in print() -- doing so emits a stray "None" line.
    frame.info()
    print(frame.describe())


# Display dataset information
describe_dataset("Train dataset:", train)
describe_dataset("\nTest dataset:", test)
describe_dataset("\nGender submission dataset:", gender_submission)


Train dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   Na

# ***DATA PREPROCESSING***

In [None]:
def prepare_features(frame):
    """Return a cleaned, numerically encoded copy of a raw Titanic frame.

    The input frame is not modified. On the copy:

    * missing ``Age`` and ``Fare`` values are replaced by that frame's median,
    * missing ``Embarked`` values are replaced by that frame's most frequent value,
    * ``Sex`` is encoded as male=0 / female=1,
    * ``Embarked`` is encoded as S=0 / C=1 / Q=2,
    * ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId`` are dropped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw passenger data containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining, encoded columns.

    Raises
    ------
    KeyError
        If an expected column is missing (for example when the frame has
        already been preprocessed).
    """
    prepared = frame.copy()

    # Assign the filled column back explicitly. The chained form
    # `df['col'].fillna(..., inplace=True)` operates on a temporary Series and
    # does not update the frame when pandas Copy-on-Write is active.
    prepared['Age'] = prepared['Age'].fillna(prepared['Age'].median())
    prepared['Fare'] = prepared['Fare'].fillna(prepared['Fare'].median())
    prepared['Embarked'] = prepared['Embarked'].fillna(prepared['Embarked'].mode()[0])

    # Convert categorical variables to numeric
    prepared['Sex'] = prepared['Sex'].map({'male': 0, 'female': 1})
    prepared['Embarked'] = prepared['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Drop irrelevant columns
    return prepared.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])


# Rebind the names only after the helper succeeds, so a failure leaves the
# previously loaded frames untouched.
train = prepare_features(train)
test = prepare_features(test)

# Verify preprocessing
for label, frame in (
    ("\nProcessed train dataset:", train),
    ("\nProcessed test dataset:", test),
):
    print(label)
    print(frame.head())
    # info() prints its own report and returns None; do not wrap it in print().
    frame.info()
    print(frame.describe())



Processed train dataset:
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    0  22.0      1      0   7.2500         0
1         1       1    1  38.0      1      0  71.2833         1
2         1       3    1  26.0      0      0   7.9250         0
3         1       1    1  35.0      1      0  53.1000         0
4         0       3    0  35.0      0      0   8.0500         0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB
None
         Survived      Pclass         Se

In [None]:
# The 'Survived' column is the label; every other column is a model input.
y = train['Survived']
X = train.drop(columns=['Survived'])

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\nFeature selection and data split complete.")



Feature selection and data split complete.


In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in one expression (fit() returns the fitted estimator itself).
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

print("\nModel trained successfully!")



Model trained successfully!


In [None]:
# Score the held-out validation rows with the fitted forest.
y_pred = rf_model.predict(X_val)

# Compute every metric first, then report them together below.
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)
accuracy_percentage = accuracy * 100

# Report: fraction correct, confusion matrix, per-class report, then percentage.
print(f'\nAccuracy: {accuracy:.2f}')
print(f'\nConfusion Matrix:\n{conf_matrix}')
print(f'\nClassification Report:\n{class_report}')
print(f'\nAccuracy Percentage: {accuracy_percentage:.2f}%')



Accuracy: 0.83

Confusion Matrix:
[[92 13]
 [18 56]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       105
           1       0.81      0.76      0.78        74

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179


Accuracy Percentage: 82.68%


In [None]:
# Each prediction must be paired with the PassengerId of the row it was made
# for. Fabricating sequential ids (1..n) would produce a file whose ids do not
# come from test.csv, so the real ids are always recovered.
if 'PassengerId' in test.columns:
    passenger_ids = test['PassengerId']
    # The id is an identifier, not a feature: remove it before predicting.
    test_data = test.drop(columns=['PassengerId'])
else:
    # The id column was removed during preprocessing; re-read just that column
    # from the raw CSV. Preprocessing drops columns but no rows, so the order
    # is expected to match `test` (verified by the length check below).
    passenger_ids = pd.read_csv(test_path, usecols=['PassengerId'])['PassengerId']
    test_data = test

# Make predictions on test data
test_predictions = rf_model.predict(test_data)

# Guard against ids and predictions drifting out of sync.
if len(passenger_ids) != len(test_predictions):
    raise ValueError(
        f"Found {len(passenger_ids)} passenger ids but "
        f"{len(test_predictions)} predictions; cannot build submission."
    )

# Prepare the submission DataFrame (plain arrays avoid any index alignment).
submission = pd.DataFrame({
    'PassengerId': passenger_ids.to_numpy(),
    'Survived': test_predictions,
})

# Save the submission file locally
submission.to_csv('submission.csv', index=False)

print("\nSubmission DataFrame created:")
print(submission.head())



Submission DataFrame created:
   PassengerId  Survived
0            1         0
1            2         0
2            3         0
3            4         1
4            5         0


In [None]:
# Colab-only: hand the saved submission file to the browser for download.
from google.colab import files

submission_filename = 'submission.csv'
files.download(submission_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>