# Data exploration

In [1]:
import pandas as pd
import seaborn as sns

In [3]:
titanic = sns.load_dataset('titanic')
print(titanic.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


No. of total rows and columns.

In [5]:
print(titanic.shape)

(891, 15)


Column names and their data type

In [9]:
print(titanic.columns)
print(titanic.dtypes)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object


Missing values and in which columns

In [8]:
print(titanic.isnull().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


checking for duplicates.

In [27]:
# print(titanic.duplicated().sum())
unique_passengers = titanic.drop_duplicates()
# print(len(unique_passengers))
# print(unique_passengers['sex'].value_counts())
print(unique_passengers[unique_passengers["survived"]==1].value_counts())

survived  pclass  sex     age   sibsp  parch  fare      embarked  class  who    adult_male  deck  embark_town  alive  alone
1         1       female  14.0  1      2      120.0000  S         First  child  False       B     Southampton  yes    False    1
                          15.0  0      1      211.3375  S         First  child  False       B     Southampton  yes    False    1
                          16.0  0      0      86.5000   S         First  woman  False       B     Southampton  yes    True     1
                                       1      39.4000   S         First  woman  False       D     Southampton  yes    False    1
                                              57.9792   C         First  woman  False       B     Cherbourg    yes    False    1
                                                                                                                              ..
          3       female  4.0   1      1      16.7000   S         Third  child  False       G     Sout

In [13]:
print(titanic.duplicated(keep=False).head(10))

0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
8    False
9    False
dtype: bool


How many passengers survived vs didn't 

In [20]:
# print(titanic[titanic["survived"]==1].count())
print(titanic["survived"].value_counts())
titanic[titanic["survived"]==1].to_csv("survived_titanic.csv",index=False)

survived
0    549
1    342
Name: count, dtype: int64


How many passengers were male or female.

In [None]:
print(titanic['sex'].value_counts())

sex
male      577
female    314
Name: count, dtype: int64


In [36]:
df = pd.read_csv("survived_titanic.csv")

print(df['sex'].value_counts())
# print(len(df.drop_duplicates()))

sex
female    233
male      109
Name: count, dtype: int64


Distribution of ages

In [31]:
print(titanic['age'].describe())

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64


Survival rates based on sex

In [37]:
print(titanic["sex"].value_counts(normalize=True)*100)
print("Survival rates")
print(df['sex'].value_counts(normalize=True)*100)

sex
male      64.758698
female    35.241302
Name: proportion, dtype: float64
Survival rates
sex
female    68.128655
male      31.871345
Name: proportion, dtype: float64


Survival rates based on Pclass

In [39]:
print("Normal Pclass rates")
print(titanic['pclass'].value_counts(normalize=True)*100)
print(df['pclass'].value_counts(normalize=True)*100)

Normal Pclass rates
pclass
3    55.106622
1    24.242424
2    20.650954
Name: proportion, dtype: float64
pclass
1    39.766082
3    34.795322
2    25.438596
Name: proportion, dtype: float64


What was the survival rate for children (<18) vs adults?

In [48]:
# print(f"Children: {titanic[titanic['age']<18].shape[0]}\nAdults: {titanic[titanic['age']>18].shape[0]}")
# print(f"Children: {df[df['age']<18].shape[0]}\nAdults: {df[df['age']>18].shape[0]}")

children = titanic[titanic["age"]<18]
adults = titanic[titanic['age']>=18]

c_rate = children['survived'].mean()*100
a_rate = adults['survived'].mean()*100
print(f"children survival rate: {c_rate}%\nAdults survival rate: {a_rate}%")

children survival rate: 53.98230088495575%
Adults survival rate: 38.10316139767055%


Which port did most passengers embark from?

In [53]:
print(titanic['embarked'].value_counts().head(1))

embarked
S    644
Name: count, dtype: int64


Did family size (siblings/spouses + parents/children) affect survival?

Who was the oldest survivor? Who was the youngest?

In [67]:
# print(titanic['sibsp'].value_counts())
# print(titanic['parch'].value_counts())

# print("Survivers Info:")
# print(df['parch'].value_counts())
# print(df['sibsp'].value_counts())

titanic['family-mem'] = titanic['sibsp'] + df['parch']
print(titanic.groupby('family-mem')['survived'].mean())
print(titanic['family-mem'].value_counts())

family-mem
0.0    0.375796
1.0    0.449438
2.0    0.403846
3.0    0.363636
4.0    0.300000
5.0    0.400000
6.0    0.000000
8.0    0.000000
Name: survived, dtype: float64
family-mem
0.0    157
1.0     89
2.0     52
3.0     22
4.0     10
5.0      5
8.0      4
6.0      3
Name: count, dtype: int64


In [4]:
print(f"Oldest person onboard: {titanic['age'].max()}\nYoungest person onboard: {titanic['age'].min()}")

Oldest person onboard: 80.0
Youngest person onboard: 0.42
