In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [37]:
df=pd.read_csv(r'C:\\Users\\omope\\OneDrive\\Documents\\Python Scripts\\Titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


##### Data Cleaning and Inspection

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [39]:
#shape of our dataset
print(f"this dataset contains {df.shape[0]} rows and {df. shape[1]} columns")

this dataset contains 891 rows and 12 columns


In [40]:
#checking for missing data
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

##### 


In [41]:
df.rename(columns={'PassengerId':'ID'},inplace=True)
#checking for duplicate value
print(f"Duplicates : {df['ID'].duplicated().any()}")

Duplicates : False


In [42]:
print(df['Survived'].unique())
print(df["Pclass"].unique())

[0 1]
[3 1 2]


In [43]:
#distribution of passengers in each Class
plt=px.histogram(df,x="Pclass",title="Distribution of passengers by class",color="Pclass")
plt.show()

In [44]:
#distribution of passenger who survived {0 for False and 1 for True}
plt=px.histogram(df,x="Survived",color="Survived")
plt.show()

In [45]:
#distribution of gender
plt=px.pie(df,names="Sex",title="Sex distribution",hole=0.4)
plt.show()

In [46]:
#AGE
df[df["Age"].isnull()]

Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [47]:
#filling null Age values
df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Age']=df['Age'].astype(int)
print(df["Age"].isna().sum())

0



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [48]:
data=df[df['Age']<1].sort_values('Age',ascending=False)
data

Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
78,79,1,2,"Caldwell, Master. Alden Gates",male,0,0,2,248738,29.0,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0,1,2,113781,151.55,C22 C26,S
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0,2,1,2666,19.2583,,C
644,645,1,3,"Baclini, Miss. Eugenie",female,0,2,1,2666,19.2583,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0,1,1,250649,14.5,,S
803,804,1,3,"Thomas, Master. Assad Alexander",male,0,0,1,2625,8.5167,,C
831,832,1,2,"Richards, Master. George Sibley",male,0,1,1,29106,18.75,,S


In [49]:
data.iloc[:7,5]=pd.Series([1,1,3,1,1,3,1])
data

Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
78,79,1,2,"Caldwell, Master. Alden Gates",male,1,0,2,248738,29.0,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,1,1,2,113781,151.55,C22 C26,S
469,470,1,3,"Baclini, Miss. Helene Barbara",female,3,2,1,2666,19.2583,,C
644,645,1,3,"Baclini, Miss. Eugenie",female,1,2,1,2666,19.2583,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,1,1,1,250649,14.5,,S
803,804,1,3,"Thomas, Master. Assad Alexander",male,3,0,1,2625,8.5167,,C
831,832,1,2,"Richards, Master. George Sibley",male,1,1,1,29106,18.75,,S


In [50]:
df.update(data)

In [51]:
#Age Distribution
plt=px.box(df,x='Age',title='Age Distribution')
plt.show()

In [52]:
#siblings and parent column
df['Family']=df['Parch']+df['SibSp']
df.head()

Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,0


In [53]:
#Family Distribution
plt=px.box(df,x='Family',title='Family Distribution')
plt.show()

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        891 non-null    int64  
 1   Survived  891 non-null    int64  
 2   Pclass    891 non-null    int64  
 3   Name      891 non-null    object 
 4   Sex       891 non-null    object 
 5   Age       891 non-null    int64  
 6   SibSp     891 non-null    int64  
 7   Parch     891 non-null    int64  
 8   Ticket    891 non-null    object 
 9   Fare      891 non-null    float64
 10  Cabin     204 non-null    object 
 11  Embarked  889 non-null    object 
 12  Family    891 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 90.6+ KB


In [55]:
#fare distribution
plt=px.box(df,x='Fare',title='Fare Distribution')
plt.show()

In [56]:
# Embackation
print(df['Embarked'].unique())
df[df['Embarked'].isnull()]

['S' 'C' 'Q' nan]


Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
61,62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80.0,B28,,0
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80.0,B28,,0


In [57]:
df['Embarked'].fillna('S',inplace=True)
df[df['Embarked'].isnull()]


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family


In [58]:
#Embarkation distribution
plt=px.pie(df,names='Embarked',title='Embarkation distribution',color="Embarked",hole=0.4)
plt.show()


#### Exploratory Analysis

###

##### relationship between passenger's class,Age and survival rate

In [59]:
plt=px.histogram(df,x='Pclass',color='Survived',title='class distribution of survival',barmode='group')
plt.show()


##### relationship between age and survial

In [60]:
df['Age_group']=df['Age'].apply(lambda x: 'Child' if x<=12 else 'Teeneger' if x<=19 else 'Adult' if x<=50 else 'Elderly')

plt=px.histogram(df,x='Age_group',color='Survived',title='Age distribution of survival',barmode='group')
plt.show()

##### relationship between passengers Gender and suvival

In [61]:
df_male=df[df['Sex']=='male']
df_female=df[df['Sex']=='female']

plt=make_subplots(rows=1,cols=2,subplot_titles=('Male distribution','Female distribution'),horizontal_spacing=0.1)
male=px.histogram(df_male,x='Pclass',color='Survived',barmode='group')
female=px.histogram(df_female,x='Pclass',color='Survived',barmode='group')

for data in male.data:
    plt.add_trace(data,row=1,col=1)
for data in female.data:
    plt.add_trace(data,row=1,col=2)

plt.update_layout(title_text='Sex distribution by survival',showlegend=True)
plt.show()

##### probability of suviving 

In [75]:
def probability(df_male,df_female):
    
    prob_male = df_male.groupby('Pclass').agg(total=('Survived','count'),survived=('Survived','sum')).reset_index()
    prob_female = df_female.groupby('Pclass').agg(total=('Survived','count'),survived=('Survived','sum')).reset_index()
    
    prob_male['probability'] = round((prob_male['survived']/prob_male['total'] * 100),1)
    prob_female['probability'] =round((prob_female['survived']/prob_female['total'] * 100),1)
    
    return prob_male,prob_female
prob_male,prob_female = probability(df_male,df_female)
print(prob_male)
print(prob_female)

   Pclass  total  survived  probability
0       1    122        45         36.9
1       2    108        17         15.7
2       3    347        47         13.5
   Pclass  total  survived  probability
0       1     94        91         96.8
1       2     76        70         92.1
2       3    144        72         50.0


In [85]:
plt= make_subplots(rows=1,cols=2,subplot_titles=('male probability distribution of survival','female probability distribution of survival'),horizontal_spacing= 0.1)

male=go.Bar(x=prob_male['Pclass'],y=prob_male['probability'])

female=go.Bar(x=prob_female['Pclass'],y=prob_female['probability'])

plt.add_trace(male,row=1,col=1)
plt.add_trace(female,row=1,col=2)

plt.update_layout(title_text='Probability distribution',xaxis_title='Class',
    yaxis_title='Probability',
    xaxis2_title='Class',
    yaxis2_title='Probability',
    yaxis=dict(range=[0, 100]),
    showlegend=False
    )
plt.show()