# Titanic Data

In [1]:
import os
import re

import numpy as np
import pandas as pd

In [2]:
from sklearn import preprocessing 
from sklearn.impute import SimpleImputer, KNNImputer

### Setting the Seed

In [3]:
# Draw the seed used by the seeded train/test split below.
# NOTE(review): the seed itself is random, so every kernel restart yields a
# different value; record it if a run has to be reproduced.
theSeed = np.random.randint(low=100, high=500)

### Read Data

In [4]:
# Directory holding the CSV. The author's local folder stays the default;
# set the TITANIC_DATA_DIR environment variable to run the notebook on
# another machine. os.path.join(..., "") guarantees a trailing separator,
# which matters because the read_csv call joins dataPath and dataFile
# without adding one.
dataPath = os.path.join(
    os.environ.get(
        "TITANIC_DATA_DIR",
        "/Users/nururrahman//Desktop/allDesktop/MacBookDesktop/DataScience/project_Kaggle/titanic/",
    ),
    "",
)
dataFile = "data_titanic.csv"

In [5]:
# Load the raw passenger table from <dataPath><dataFile>.
df = pd.read_csv(f"{dataPath}{dataFile}")

### Process Data

In [6]:
# Name-derived features, one entry per row of df:
#   LastName   - text before the first comma
#   NameLength - number of characters in the full name
#   Title      - first space-delimited token after the first comma
LastName = [full.partition(",")[0].strip() for full in df["Name"]]
NameLength = list(map(len, df["Name"]))
Title = [full.split(",")[1].strip().partition(" ")[0] for full in df["Name"]]

In [7]:
# Distinct titles as extracted.
np.unique(Title)

array(['Capt.', 'Col.', 'Don.', 'Dona.', 'Dr.', 'Jonkheer.', 'Lady.',
       'Major.', 'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.',
       'Ms.', 'Rev.', 'Sir.', 'the'], dtype='<U9')

In [8]:
#ind = np.where("the"==np.array(Title))
#df.iloc[ind[0], ]

In [9]:
#ind = np.where("Jonkheer."==np.array(Title))
#df.iloc[ind[0], ]

#### GroupBy Features to Find Extreme Groups

In [10]:
# Attach the name-derived lists to the frame as new columns.
df = df.assign(
    LastName=np.array(LastName),
    NameLength=np.array(NameLength),
    Title=np.array(Title),
)

In [11]:
#df.groupby('Title').size()

#### Clean Data 

In [12]:
# The first title that was extracted as "the" is relabelled "Countess.".
Title[np.flatnonzero(np.array(Title) == "the")[0]] = "Countess."

In [13]:
# Collapse the military ranks into a single "Army" title (list edited in place).
Title[:] = ["Army" if t in ("Capt.", "Col.", "Major.") else t for t in Title]

In [14]:
# Fold "Ms." and "Mlle." into "Miss." (list edited in place).
Title[:] = ["Miss." if t in ("Ms.", "Mlle.") else t for t in Title]

In [15]:
# Fold "Mme." and "Dona." into "Mrs." (list edited in place).
Title[:] = ["Mrs." if t in ("Mme.", "Dona.") else t for t in Title]

In [16]:
# Fold "Don." into "Mr." (list edited in place).
Title[:] = ["Mr." if t == "Don." else t for t in Title]

In [17]:
# Distinct titles after the merges above.
np.unique(Title)

array(['Army', 'Countess.', 'Dr.', 'Jonkheer.', 'Lady.', 'Master.',
       'Miss.', 'Mr.', 'Mrs.', 'Rev.', 'Sir.'], dtype='<U9')

In [18]:
# Write the normalised titles back into the frame. df['Title'] was filled
# before the clean-up cells edited the Title list, so without this
# assignment the column keeps the raw, un-merged titles.
df['Title'] = np.array(Title)
#df.groupby('Title').size()

In [19]:
#lenCat = df['Sex'].apply(lambda row: len(str(row)) )
#print( lenCat.unique() )

#### Check for Null Values

In [20]:
# Missing-value count per column.
df.isna().sum()

Pclass           0
Survived         0
Name             0
Sex              0
Age            263
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin         1014
Embarked         2
Boat           823
Body          1188
Home.dest      564
LastName         0
NameLength       0
Title            0
dtype: int64

#### Fill Null Values

Categorical Column : 'UNKNOWN'

Numerical Column    : 'Median'

In [21]:
# Keep only the columns that actually contain missing values:
# indexS holds their names, valueS the matching counts.
S = df.isna().sum()
S = S.loc[S > 0]
indexS = list(S.index)
valueS = S.tolist()

In [22]:
# Only the last expression of a cell is rendered, so this cell shows valueS
# (the missing-value counts); the bare indexS line produces no output.
indexS 
valueS

[263, 1, 1014, 2, 823, 1188, 564]

In [23]:
# Fill missing values column by column: object columns get the 'UNKNOWN'
# placeholder, float columns get their median.
# NOTE(review): the median is computed on the whole table, i.e. before the
# train/test split further down - confirm that is acceptable.
for col in indexS:
    if df[col].dtype == 'object':
        # Assign the result back rather than calling fillna(inplace=True) on
        # df[col]: the in-place call works on an intermediate Series, which
        # pandas warns about and which does not update df under Copy-on-Write.
        df[col] = df[col].fillna('UNKNOWN')
    elif df[col].dtype == 'float':
        imputer = SimpleImputer(strategy='median')
        imputed = imputer.fit_transform( df[[col]] )
        df[col] = imputed

In [24]:
# Re-check the missing-value count per column after filling.
df.isna().sum()

Pclass        0
Survived      0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin         0
Embarked      0
Boat          0
Body          0
Home.dest     0
LastName      0
NameLength    0
Title         0
dtype: int64

In [25]:
# Normalise identifier-like text columns value by value: spaces and slashes
# become underscores and periods are removed (LastName: spaces only).
df['Ticket'] = df['Ticket'].map(
    lambda value: str(value).replace(" ", "_").replace(".", "").replace("/", "_")
)
df['Cabin'] = df['Cabin'].map(
    lambda value: str(value).replace(" ", "_").replace(".", "").replace("/", "_")
)
df['Boat'] = df['Boat'].map(
    lambda value: str(value).replace(" ", "_").replace(".", "").replace("/", "_")
)
df['LastName'] = df['LastName'].map(lambda value: str(value).replace(" ", "_"))

In [26]:
# HomeDest is "Home.dest" with spaces removed and commas/slashes turned into
# underscores; home_dest keeps the intermediate, space-free strings.
home_dest = [str(place).replace(" ", "") for place in df["Home.dest"]]
df['HomeDest'] = [place.replace(",", "_").replace("/", "_") for place in home_dest]

#### Drop Unneeded Columns

In [27]:
# Name and Home.dest have been replaced by derived columns; Body is dropped too.
df = df.drop(columns=["Name", "Body", "Home.dest"])

#### Shuffle Records Randomly 

In [28]:
# Shuffle all rows (sampling the full frame without replacement) and renumber
# them 0..n-1. No random_state is passed, so the order differs on every run.
df = df.sample(frac=1, replace=False).reset_index(drop=True)

### Data in Original Format 
#### Do not Convert Object Type

In [29]:
# 75 % of the rows (seeded with theSeed) form the training set; the rows
# whose index labels were not drawn form the test set.
train_df = df.sample(frac=0.75, replace=False, random_state=theSeed)
train_index = train_df.index.tolist()
test_df = df.drop(index=train_index)

In [30]:
# Renumber both parts from 0, discarding the old index labels.
train_df, test_df = (part.reset_index(drop=True) for part in (train_df, test_df))

In [31]:
# (train, test) pair with the object columns left un-encoded.
train_test_original = train_df, test_df

### OrdinalEncoder
#### Convert Object Type to Integer Type 

In [32]:
def convertObjectType(feature, df):
    """
        OrdinalEncoder: Convert Categorical Features of Object Type to Integer Type
        Cabin & Embarked have Problematic Levels

        Every object/string-dtype column is replaced by the integer codes of a
        per-column ``LabelEncoder``; all other columns pass through unchanged.
        The encoded columns are moved to the end of the returned frame.

        NOTE: this notebook later defines a one-hot function with the same
        name; whichever definition was executed last is the one in effect.

        Parameters
        ----------
        feature : sequence of str
            Columns whose values are cast to ``str`` and stripped of spaces
            before encoding (the notebook passes ``["Cabin", "Embarked"]``).
        df : pandas.DataFrame
            Input frame. It is copied, so the caller's object is not modified.

        Returns
        -------
        pandas.DataFrame
            Non-categorical columns followed by the encoded columns, carrying
            the same index as ``df``.
    """
    # Work on a copy: assigning into `df` directly would silently rewrite the
    # caller's frame.
    out = df.copy()
    for name in feature:
        out[name] = out[name].apply(lambda value: str(value).replace(" ", ""))

    # `np.object` no longer exists in current NumPy, so the dtype test is done
    # with pandas' own helpers (which also recognise dedicated string dtypes).
    catIndex = [
        pos for pos, dtype in enumerate(out.dtypes)
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    if not catIndex:
        # Nothing to encode; np.column_stack([]) would raise.
        return out

    codes = [
        preprocessing.LabelEncoder().fit_transform(out.iloc[:, pos])
        for pos in catIndex
    ]

    d1 = out.drop(out.columns[catIndex], axis=1)
    # Reuse the input index so concat lines the rows up even when `df` does
    # not have a default 0..n-1 index.
    d2 = pd.DataFrame(
        np.column_stack(codes), columns=out.columns[catIndex], index=out.index
    )
    return pd.concat([d1, d2], axis=1)

In [33]:
# Columns that need their values stringified and stripped before encoding.
feature = ["Cabin", "Embarked"]
dat = convertObjectType(feature=feature, df=df)

In [34]:
# Split with the same seed as the un-encoded split so the encoded train/test
# sets are drawn the same way; random_state=None gave a different random
# partition for each encoding.
train_df = dat.sample(frac=0.75, replace=False, random_state=theSeed)
train_index = train_df.index.tolist()
test_df = dat[ ~dat.index.isin(train_index) ]

In [35]:
# Renumber both parts from 0, discarding the old index labels.
train_df, test_df = (part.reset_index(drop=True) for part in (train_df, test_df))

In [36]:
# (train, test) pair built from the integer-coded frame.
train_test_ordinal = train_df, test_df

### OneHotEncoder 
#### Convert Object Type to Integer Type 

In [37]:
def convertObjectType(feature, df):
    """
        OneHotEncoder : Convert Categorical Features of Object Type to Integer Type
        Cabin & Embarked have Problematic Levels

        Object/string-dtype columns are one-hot encoded, integer columns are
        imputed with their most frequent value and float columns with their
        median. Columns of any other dtype are not part of the result.

        NOTE: this notebook earlier defines an ordinal function with the same
        name; whichever definition was executed last is the one in effect.

        Parameters
        ----------
        feature : sequence of str
            Columns whose values are cast to ``str`` and stripped of spaces
            before encoding (the notebook passes ``["Cabin", "Embarked"]``).
        df : pandas.DataFrame
            Input frame. It is copied, so the caller's object is not modified.

        Returns
        -------
        pandas.DataFrame
            One-hot columns, then integer columns, then float columns, carrying
            the same index as ``df``.
    """
    # Work on a copy: assigning into `df` directly would silently rewrite the
    # caller's frame.
    work = df.copy()
    for name in feature:
        work[name] = work[name].apply(lambda value: str(value).replace(" ", ""))

    def _is_categorical(dtype):
        # Object columns plus dedicated string dtypes.
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    def _is_numpy_kind(dtype, kinds):
        # NumPy-backed dtype whose kind code is one of `kinds`.
        return isinstance(dtype, np.dtype) and dtype.kind in kinds

    # `np.object`, `np.int` and `np.float` no longer exist in current NumPy.
    # Selecting by dtype kind also picks up every integer/float width instead
    # of only the platform default, so such columns are no longer dropped.
    catIndex   = [pos for pos, dtype in enumerate(work.dtypes) if _is_categorical(dtype)]
    intIndex   = [pos for pos, dtype in enumerate(work.dtypes) if _is_numpy_kind(dtype, "iu")]
    floatIndex = [pos for pos, dtype in enumerate(work.dtypes) if _is_numpy_kind(dtype, "f")]

    # Each group is only processed when it has columns; the estimators raise
    # on empty input. Every part reuses the input index so rows stay aligned.
    parts = []

    if catIndex:
        df_cat = work.iloc[:, catIndex]
        enc = preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore')
        fit = enc.fit( df_cat.values )
        trns = fit.transform( df_cat.values )
        # get_feature_names() was replaced by get_feature_names_out() in
        # scikit-learn; fall back to the old name on older installations.
        if hasattr(enc, "get_feature_names_out"):
            colName = enc.get_feature_names_out( df_cat.columns.tolist() )
        else:
            colName = enc.get_feature_names( df_cat.columns.tolist() )
        parts.append( pd.DataFrame( trns.toarray(), columns=colName, index=work.index ) )

    if intIndex:
        df_int = work.iloc[:, intIndex]
        imputer = SimpleImputer(strategy='most_frequent')
        imputed_int = imputer.fit_transform(df_int.values)
        parts.append( pd.DataFrame(imputed_int, columns=df_int.columns, index=work.index) )

    if floatIndex:
        df_float = work.iloc[:, floatIndex]
        imputer = SimpleImputer(strategy='median')
        imputed_float = imputer.fit_transform(df_float.values)
        parts.append( pd.DataFrame(imputed_float, columns=df_float.columns, index=work.index) )

    if not parts:
        return pd.DataFrame(index=work.index)
    return pd.concat(parts, axis=1)

In [38]:
# Separate the target (kept as a one-column frame) from the predictors.
Y = df[['Survived']]
X = df.drop(columns='Survived')

In [39]:
# Encode the predictors, then put the target back as the first column.
feature = ["Cabin", "Embarked"]
X = convertObjectType(feature=feature, df=X)
dat = pd.concat((Y, X), axis=1)

In [40]:
# Split with the same seed as the un-encoded split so the encoded train/test
# sets are drawn the same way; random_state=None gave a different random
# partition for each encoding.
train_df = dat.sample(frac=0.75, replace=False, random_state=theSeed)
train_index = train_df.index.tolist()
test_df = dat[ ~dat.index.isin(train_index) ]

In [41]:
# Renumber both parts from 0, discarding the old index labels.
train_df, test_df = (part.reset_index(drop=True) for part in (train_df, test_df))

In [42]:
# (train, test) pair built from the one-hot encoded frame.
train_test_onehot = train_df, test_df