# Defining Functions for Cleaning and Preprocessing

In [1]:
import pandas as pd
import numpy as np
titanic = pd.read_csv("https://sites.google.com/site/yasinunlu/home/research/new1/Titanic_train.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Defining a preprocessing function to: <br/>
0) Drop "PassengerId" and "Ticket" columns from the dataframe.<br/>
1) Check if there are any rows that have missing entries for all columns.<br/> 
2) Remove such rows from the given dataset.<br/>
3) Convert the data type of "Fare" and "Age" to int.<br/>
4) Rename "SibSp" as "Siblings/Spouses Aboard", and "Parch" as "Parents/Children Aboard".<br/>
5) Under column "Name": replace 'Mlle' with 'Miss', replace 'Ms' with 'Miss' and replace 'Mme'with 'Mrs'.<br/>

In [3]:
def data_preprocessing (titanic):
    
    #dropping columns
    titanic.drop('Ticket', axis = 1, inplace= True)
    titanic.drop('PassengerId', axis = 1, inplace= True)
    
    #removing rows with missing entries
    titanic.dropna(inplace = True)
    
    #filling NaNs and changing data type for age and fare
    titanic["Age"] = titanic["Age"].fillna(0)
    titanic["Age"] = titanic["Age"].astype(int)
                 
    titanic["Fare"] = titanic["Fare"].fillna(0)
    titanic["Fare"] = titanic["Fare"].astype(int)
    
    #renaming columns
    titanic.rename(columns={'SibSp': 'Siblings/Spouses Aboard', 'Parch': 'Parents/Children Aboard' }, inplace = True)
    
    #using applymap with lambda function to replace prefixes
    titanic[['Name']].applymap(lambda x:x.replace("Mlle","Miss"))
    titanic[['Name']].applymap(lambda x:x.replace("Ms","Miss"))
    titanic[['Name']].applymap(lambda x:x.replace("Mme","Mrs"))
    
    return titanic

new_titanic = data_preprocessing(titanic)
new_titanic.head()



Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Cabin,Embarked
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71,C85,C
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53,C123,S
6,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,51,E46,S
10,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,16,G6,S
11,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,26,C103,S


Feature extraction function: <br/>
We are interested in adding new columns which are so-called features.<br/>
Create a new function called "feature_extraction" which does the following updates to the existing dataframe.<br/>
In short, we will fully change the columns "Name", "Embarked", "Sex", "Cabin" and "Age" into numeric.
1) Name column should be of int type.  For those titles in name column, this column should indicate 1 for  "Mr."; 2 for "Miss."; 3 for "Mrs."; 4 for "Master."; 5 for "Rare" and 0 for any others.<br/>
2) Convert "Embarked" column into numeric. Replace "S" with 0; replace "C" with 1 and replace "Q" with 2. For the missing ones (if any), just fill with the most common one.<br/>
3) Convert "Sex" column into numeric.Replace "male" with 0 and replace "female" with 1. For the missing ones (if any), just fill with the most common one.<br/>
4) The column named "Cabin" includes strings with letters and numbers, for example "C85" and the letter refers to the cabin group. This column should indicate integer value of 1 for cabin name starts with letter "A". The rest of the numbers should appear according this dictionary: {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}. Any missing entries should be replaced with 0.<br/>
5) Fill missing values from "Age" column according to the following ad-hoc imputation technique: A random integer withdrawn from the set (mean - standard deviation, mean + standard deviation).<br/>
6) Any missing values from "Fare" column should be replaced with 0.<br/>
7) Update "Fare" column according to the following:<br/>
  if 0 <= 'Fare' < 10, then 'Fare' = 0<br/>
  if 10 <= 'Fare' < 20, then 'Fare' = 1<br/>
  if 20 <= 'Fare' < 30, then 'Fare' = 2<br/>
  if 30 <= 'Fare' < 100, then 'Fare' = 3<br/>
  if 100 <= 'Fare' < 200, then 'Fare' = 4<br/>
  if 200 <= 'Fare' then 'Fare' = 5<br/>
  
8) Update "Age" column according to the following:<br/>
   if 'Age' <= 10, then 'Age' = 0<br/>
   if 'Age' > 10 & 'Age' <= 15 then 'Age' = 1<br/>
   if 'Age' > 15 & 'Age' <= 20 then 'Age' = 2<br/>
   if 'Age' > 20 & 'Age' <= 25 then 'Age' = 3<br/>
   if 'Age' > 25 & 'Age' <= 35 then 'Age' = 4<br/>
   if 'Age' > 35 & 'Age' <= 40 then 'Age' = 5<br/>
   if 'Age' > 40 & 'Age' <= 60 then 'Age' = 6<br/>
   if 'Age' > 60 then 'Age' = 6


In [4]:
import numpy as np
def feature_extraction (new_titanic):
    
    new_titanic['Name'] = new_titanic['Name'].apply(lambda x: '1' if 'Mr. ' in x else x)
    new_titanic['Name'] = new_titanic['Name'].apply(lambda x: '2' if 'Miss' in x else x)
    new_titanic['Name'] = new_titanic['Name'].apply(lambda x: '3' if 'Mrs. ' in x else x)
    new_titanic['Name'] = new_titanic['Name'].apply(lambda x: '4' if 'Master' in x else x)
    new_titanic['Name'] = new_titanic['Name'].apply(lambda x: '5' if 'Rare' in x else x)

    new_titanic['Name'] = new_titanic['Name'].apply(lambda x: '0' if len(x) > 1 else x)
    new_titanic['Name'] = new_titanic['Name'].astype(int)
    
    
    emb_dict = {'S' :0, 'C' :1, 'Q': 2}
    new_titanic['Embarked'] = new_titanic['Embarked'].map(emb_dict, 'ignore')
    new_titanic['Embarked'] = new_titanic['Embarked'].fillna(new_titanic['Embarked'].mode().iloc[0])
    
    
    sex_dict = {'male' :0, 'female' :1}
    new_titanic['Sex'] = new_titanic['Sex'].map(sex_dict, 'ignore')
    new_titanic['Sex'] = new_titanic['Sex'].fillna(new_titanic['Sex'].mode().iloc[0])
    
    
    cab_dict = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

    new_titanic['Cabin'] = new_titanic['Cabin'].astype(str)
    new_titanic['Cabin'] = new_titanic['Cabin'].str[:1]
    new_titanic['Cabin'] = new_titanic['Cabin'].map(cab_dict)

    new_titanic.loc[new_titanic['Cabin'].isnull(), 'Cabin'] = 0 
    new_titanic['Cabin'] = new_titanic['Cabin'].astype(int)
    
    
    min = new_titanic['Age'].mean() - new_titanic['Age'].std()
    max = new_titanic['Age'].mean() + new_titanic['Age'].std()
    ad_age = list(np.random.randint(min, max, size = 1))

    new_titanic.loc[new_titanic['Age'].isnull(), 'Age'] = ad_age
    
    
    new_titanic.loc[new_titanic['Fare'].isnull(), 'Fare'] = 0
    
    
    new_titanic.loc[new_titanic['Fare']<10, 'Fare'] = 0
    new_titanic.loc[(new_titanic['Fare']>=10) & (new_titanic['Fare']<20), 'Fare'] = 1
    new_titanic.loc[(new_titanic['Fare']>=20) & (new_titanic['Fare']<30), 'Fare'] = 2
    new_titanic.loc[(new_titanic['Fare']>=30) & (new_titanic['Fare']<100), 'Fare'] = 3
    new_titanic.loc[(new_titanic['Fare']>=100) & (new_titanic['Fare']<200), 'Fare'] = 4
    new_titanic.loc[(new_titanic['Fare']>=200), 'Fare'] = 1
    
     
    new_titanic.loc[new_titanic['Age']<10, 'Age'] = 0
    new_titanic.loc[(new_titanic['Age']>=10) & (new_titanic['Age']<15), 'Age'] = 1
    new_titanic.loc[(new_titanic['Age']>=15) & (new_titanic['Age']<20), 'Age'] = 2
    new_titanic.loc[(new_titanic['Age']>=20) & (new_titanic['Age']<25), 'Age'] = 3
    new_titanic.loc[(new_titanic['Age']>=25) & (new_titanic['Age']<35), 'Age'] = 4
    new_titanic.loc[(new_titanic['Age']>=35) & (new_titanic['Age']<40), 'Age'] = 5
    new_titanic.loc[(new_titanic['Age']>=40) & (new_titanic['Age']<60), 'Age'] = 6
    new_titanic.loc[(new_titanic['Age']>=60), 'Age'] = 6
   
    return new_titanic

#2) Call your function here
new_titanic2 = feature_extraction(new_titanic)
new_titanic2.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Cabin,Embarked
1,1,1,3,1,5,1,0,3,3,1
3,1,1,3,1,5,1,0,3,3,0
6,0,1,1,0,6,0,0,3,5,0
10,1,3,2,1,0,1,1,1,7,0
11,1,1,2,1,6,0,0,2,3,0
