# Data Imputation with Titanic Dataset

Dataset link - https://www.kaggle.com/c/titanic-dataset/data

## Reading Input Data

In [3]:
import pandas as pd
import numpy as np
data = pd.read_csv("data.csv")
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Method #1 (Deleting NULL Values)

### Before deleting the rows 

In [5]:
print(data.isnull().sum())
print(data.shape)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
(891, 12)


### After deleting the rows

In [6]:
data.dropna(inplace=True)
print(data.isnull().sum())
print(data.shape)

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
(183, 12)


# Method #2 (Impute missing values with mean/median)

### Before Imputing the mean/median in the missing values:


In [7]:
print(data["Age"][:20])
print(data.isnull().sum())
print(data.shape)

1      38.0
3      35.0
6      54.0
10      4.0
11     58.0
21     34.0
23     28.0
27     19.0
52     49.0
54     65.0
62     45.0
66     29.0
75     25.0
88     23.0
92     46.0
96     71.0
97     23.0
102    21.0
110    47.0
118    24.0
Name: Age, dtype: float64
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
(183, 12)


### After Imputing the mean in the missing values

In [8]:
data["Age"] = data["Age"].replace(np.NaN, data["Age"].mean())
print(data["Age"][:20])
print(data.isnull().sum())
print(data.shape)

1      38.0
3      35.0
6      54.0
10      4.0
11     58.0
21     34.0
23     28.0
27     19.0
52     49.0
54     65.0
62     45.0
66     29.0
75     25.0
88     23.0
92     46.0
96     71.0
97     23.0
102    21.0
110    47.0
118    24.0
Name: Age, dtype: float64
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
(183, 12)


### After Imputing the median in the missing values:

In [11]:
data["Age"] = data["Age"].replace(np.NaN, data["Age"].median())
print(data["Age"][:20])
print(data.isnull().sum())
print(data.shape)

1      38.0
3      35.0
6      54.0
10      4.0
11     58.0
21     34.0
23     28.0
27     19.0
52     49.0
54     65.0
62     45.0
66     29.0
75     25.0
88     23.0
92     46.0
96     71.0
97     23.0
102    21.0
110    47.0
118    24.0
Name: Age, dtype: float64
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
(183, 12)


# Method #3 (Missing values for categorical values)

In [14]:
data["Age"] = data["Age"].fillna(data["Age"]. value_counts(). index[0])
print(data["Age"][:20])
print(data["Cabin"][:5])
data["Cabin"] = data["Cabin"].fillna(data["Cabin"]. value_counts(). index[0])
print(data["Cabin"][:5])
print(data.isnull().sum())
print(data.shape)

1      38.0
3      35.0
6      54.0
10      4.0
11     58.0
21     34.0
23     28.0
27     19.0
52     49.0
54     65.0
62     45.0
66     29.0
75     25.0
88     23.0
92     46.0
96     71.0
97     23.0
102    21.0
110    47.0
118    24.0
Name: Age, dtype: float64
1      C85
3     C123
6      E46
10      G6
11    C103
Name: Cabin, dtype: object
1      C85
3     C123
6      E46
10      G6
11    C103
Name: Cabin, dtype: object
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
(183, 12)


# Method #4 (Substituting with random Random values)

In [17]:
import random
val =  list(data["Cabin"])
random_value = random.choice(val)
while 1:
    		if str(random_value) == "nan":
        			random_value = random.choice(val)
    		else:
        			break
data["Cabin"] = data["Cabin"].replace(np.NaN,random_value)
print(data["Cabin"][:5])
print(data.isnull().sum())
print(data.shape)

1      C85
3     C123
6      E46
10      G6
11    C103
Name: Cabin, dtype: object
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
(183, 12)
