In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the Titanic training data. read_csv already returns a DataFrame, so the
# extra pd.DataFrame(...) wrapper was redundant.
df = pd.read_csv('/content/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Description of the attributes of the Dataset**

* pClass: Passenger Class(1 = 1st; 2 = 2nd; 3 = 3rd)
* survival: Survival (0 = No; 1 = Yes)
* name: Name
* sex: Sex
* age: Age
* sibsp: No. of Siblings/Spouses Aboard
* parch: No. of Parents/Children Aboard
* ticket: Ticket No.
* fare: Passenger Fare (British Pound)
* cabin: Cabin
* embarked: Port of Embarkation: (C = Cherbourg; Q = Queenstown; S = Southampton)


In [7]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(891, 12)



*   There are a total of 891 entries and 12 columns in the dataset


In [8]:
# Number of missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64



*   The Dataset is not complete i.e. it has empty cells in it

In [9]:
# Missing-value count for every column.
x = df.isna().sum()

# Keep only the columns whose missing count exceeds 35% of the row count;
# these are the candidates for removal.
drop_col = x.loc[x.gt(0.35 * len(df))]
drop_col

Cabin    687
dtype: int64

* Cabin is the only column with more than 35% of its cells empty (687 of 891), the highest no. of empty cells in the dataset.

In [10]:
# Labels of the columns selected for removal.
drop_col.index

Index(['Cabin'], dtype='object')

In [11]:
# Remove the sparse columns. The frame is rebound rather than mutated in place,
# and errors='ignore' lets the cell be re-run after the columns are already gone
# instead of raising KeyError.
df = df.drop(columns=drop_col.index, errors='ignore')
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

Dataset after dropping the column with highest no. of null values in it.

In [12]:
# Fill missing numeric values (Age) with the column mean.
# numeric_only=True is required on pandas >= 2.0, where mean() over a frame that
# still contains text columns (Name, Sex, Ticket, Embarked) raises TypeError.
df = df.fillna(df.mean(numeric_only=True))
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

Replacing the remaining null values in the numeric columns (Age) with the column mean

In [13]:
# Summary of the port-of-embarkation column: count, distinct values, mode and its frequency.
df.loc[:, 'Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [14]:
# Fill the missing ports with 'S', the most frequent value reported by describe().
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df['Embarked'] selection is chained assignment, which warns on pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna('S')

Replacing the null values in the Embarked column with 'S', its most frequent value

In [15]:
# Confirm that no column has missing values left.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

No Null Values left

In [16]:
# Pairwise Pearson correlation of the numeric columns.
# numeric_only=True is required on pandas >= 2.0: the frame still holds text
# columns (Name, Sex, Ticket, Embarked) and corr() no longer drops them silently.
df.corr(numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0


*   The fare increases as the travel class gets better: Pclass 1 is the highest class, so Fare and Pclass have a negative correlation (-0.55)

In [17]:
# Combine siblings/spouses and parents/children into one family-size feature,
# then remove the two source columns (rebinding df instead of mutating in place).
df['FamilySize'] = df['SibSp'] + df['Parch']
df = df.drop(columns=['SibSp', 'Parch'])

# numeric_only=True is required on pandas >= 2.0 because the frame still holds
# text columns that corr() would otherwise fail to convert.
df.corr(numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,FamilySize
PassengerId,1.0,-0.005007,-0.035144,0.033207,0.012658,-0.040143
Survived,-0.005007,1.0,-0.338481,-0.069809,0.257307,0.016639
Pclass,-0.035144,-0.338481,1.0,-0.331339,-0.5495,0.065997
Age,0.033207,-0.069809,-0.331339,1.0,0.091566,-0.248512
Fare,0.012658,0.257307,-0.5495,0.091566,1.0,0.217138
FamilySize,-0.040143,0.016639,0.065997,-0.248512,0.217138,1.0


In [18]:
# Alone = 1 for passengers with no relatives aboard, 0 otherwise.
# np.where evaluates the whole column at once, replacing the per-row Python loop
# whose df['FamilySize'][i] label lookups were slow and fail on a non-unique index.
df['Alone'] = np.where(df['FamilySize'] > 0, 0, 1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,FamilySize,Alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,S,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,S,0,1


In [19]:
# Mean survival rate for passengers with family aboard (0) versus travelling alone (1).
df['Survived'].groupby(df['Alone']).mean()

Alone
0    0.505650
1    0.303538
Name: Survived, dtype: float64

*   People who travelled along with Family had higher chance of survival


In [20]:
# Correlation between travelling alone and the fare paid.
df.loc[:, ['Alone', 'Fare']].corr()

Unnamed: 0,Alone,Fare
Alone,1.0,-0.271832
Fare,-0.271832,1.0


In [21]:
# Encode Sex as 0 = male, 1 = female (any value other than 'male' maps to 1).
# The dtype guard makes the cell safe to re-run: without it, a second execution
# compares the already-encoded integers with 'male' and turns every row into 1.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = (df['Sex'] != 'male').astype(int)
df.groupby(['Sex'])['Survived'].mean()

Sex
0    0.188908
1    0.742038
Name: Survived, dtype: float64

*   Females were prioritized over Men

In [22]:
# Mean survival rate for each port of embarkation.
df['Survived'].groupby(df['Embarked']).mean()

Embarked
C    0.553571
Q    0.389610
S    0.339009
Name: Survived, dtype: float64



*   People who boarded from Cherbourg had higher survival rate


### **CONCLUSION**



*   Female Passengers were given more preference than Male
*   People with High Class or Rich People had higher chance of Survival
*   Passengers with their family onboard had higher Survival Rates
*   Passengers boarding the ship from Cherbourg had the highest survival rate as compared to the ones from Queenstown & Southampton




