In [76]:
#TITANIC SURVIVAL PREDICTION

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# The plotting API lives in the pyplot submodule; aliasing the top-level
# matplotlib package as `plt` would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt

In [78]:
#Step 1: Load the Dataset

In [79]:
# Read the passenger records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="Titanic-Dataset.csv")

In [80]:
# Plain-text dump of the frame; pandas abbreviates the middle rows and wraps wide columns.
print(data)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [81]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [82]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(data.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [83]:
#Step 2: Data Preprocessing 

In [84]:
#Drop columns that won't help in prediction

In [85]:
# Remove the columns that are not used as model inputs.
data = data.drop(
    columns=['Name', 'PassengerId', 'Ticket', 'Cabin'],
)

In [86]:
#fill missing values for 'Age' and 'Fare' with the median

In [87]:
# Median-impute both numeric columns with a single fit, so the fitted imputer
# keeps the medians of 'Age' and 'Fare' together (imputer.statistics_) instead
# of being re-fitted, and overwritten, once per column.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down; fitting on the training rows only would
# avoid leaking test-set information into preprocessing.
imputer = SimpleImputer(strategy='median')
data[['Age', 'Fare']] = imputer.fit_transform(data[['Age', 'Fare']])

In [88]:
#Fill the missing values for 'Embarked' with most frequent value

In [89]:

# Replace missing embarkation ports with the most frequent port.
data['Embarked'] = data['Embarked'].fillna(
    value=data['Embarked'].mode().iloc[0]
)

In [90]:
# Convert categorical columns (Sex, Embarked) into numerical values using Label Encoding

In [91]:
# Use one encoder per column so each fitted mapping (classes_) stays available
# for decoding later; a single shared encoder is overwritten by the second fit
# and loses the 'Sex' mapping.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
data['Sex'] = sex_encoder.fit_transform(data['Sex'])
data['Embarked'] = embarked_encoder.fit_transform(data['Embarked'])
# Backward compatibility: as before, `label_encoder` ends up being the encoder
# fitted on 'Embarked'.
label_encoder = embarked_encoder

In [92]:
# Preview the first five rows after preprocessing.
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [93]:
#Step 3: Define Features and Target

In [94]:
# Features: every remaining column except the label.
x = data.drop(columns=['Survived'])
# Target: the 'Survived' column.
y = data['Survived']

In [95]:
# Step 3: Define features and target
# NOTE(review): this repeats the previous cell under the name `X`; the
# train/test split that follows uses the lowercase `x`.
# Features: all columns other than the label.
X = data.drop(columns=['Survived'])
# Target: whether the passenger survived or not.
y = data['Survived']

In [96]:
#Step 4: Split the data into training and testing sets

In [97]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [98]:
#Step 5:Build a Random Forest Model

In [99]:
# Random forest of 100 trees with a fixed seed, trained on the training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X=x_train, y=y_train)

In [100]:
#Step 6: Make Predictions and Evaluate the Model

In [101]:
# Predict on the held-out rows and report the share of correct predictions.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy:{accuracy*100:.2f}%')

Accuracy:82.12%


In [102]:
#Step 7: Test the Prediction

In [103]:
# Put the held-out features side by side with the true and predicted labels.
results = x_test.assign(**{
    'Actual Survived': y_test,      # Series, aligned on the row index
    'Predicted Survived': y_pred,   # array returned by model.predict, in row order
})

# Show the first few rows for a quick comparison.
print(results.head())


     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Actual Survived  \
709       3    1  28.0      1      1  15.2458         0                1   
439       2    1  31.0      0      0  10.5000         2                0   
840       3    1  20.0      0      0   7.9250         2                0   
720       2    0   6.0      0      1  33.0000         2                1   
39        3    0  14.0      1      0  11.2417         0                1   

     Predicted Survived  
709                   0  
439                   0  
840                   0  
720                   1  
39                    0  
