In [147]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression

In [148]:
# Load only the numeric columns used by the baseline model, plus the target.
df = pd.read_csv(
    "./data/titanic.csv",
    usecols=['Age', 'Pclass', 'SibSp', 'Parch', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [149]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna()

In [150]:
# Separate the target column from the predictor columns.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [151]:
# Baseline: mean accuracy over 20 cross-validation folds on the current X.
cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20).mean()

0.6933333333333332

## **Applying Feature Construction**

In [152]:
# Family size = siblings/spouses + parents/children + the passenger themself.
X = X.assign(Family_Size=lambda frame: frame['SibSp'] + frame['Parch'] + 1)

In [153]:
# Preview the feature frame to confirm the Family_Size column was added.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_Size
0,3,22.0,1,0,2
1,1,38.0,1,0,2
2,3,26.0,0,0,1
3,1,35.0,1,0,2
4,3,35.0,0,0,1


In [154]:
def myfun(num):
    """Bucket a family size into a coarse family-type code.

    Args:
        num: Family size (a number).

    Returns:
        0 when ``num`` equals 1 (alone), 1 when ``num`` is greater than 1 and
        at most 4 (small family), and 2 for any other value (large family).
    """
    if num == 1:
        return 0  # alone
    if 1 < num <= 4:
        return 1  # small family
    return 2  # large family

In [155]:
# Spot-check the upper edge of the small-family bucket (num == 4).
myfun(4)

1

In [156]:
# Encode each family size as its family-type code via myfun.
X['Family_Type'] = X['Family_Size'].map(myfun)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_Size,Family_Type
0,3,22.0,1,0,2,1
1,1,38.0,1,0,2,1
2,3,26.0,0,0,1,0
3,1,35.0,1,0,2,1
4,3,35.0,0,0,1,0


In [157]:
# Family_Type was derived from these three columns, so remove them before
# re-scoring the model.
X = X.drop(columns=['SibSp', 'Parch', 'Family_Size'])
X.head()

Unnamed: 0,Pclass,Age,Family_Type
0,3,22.0,1
1,1,38.0,1
2,3,26.0,0
3,1,35.0,1
4,3,35.0,0


In [158]:
# Same 20-fold accuracy measurement, now on the feature-constructed X.
cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20).mean()

0.7003174603174602

## **Feature Splitting**

In [159]:
# Re-read the CSV without usecols so that text columns such as Name are
# available. This rebinds df, replacing the earlier column-filtered frame.
df=pd.read_csv("./data/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [160]:
# Inspect the raw Name strings before splitting them into parts.
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [161]:
# Surname is everything before the first comma in Name.
df["Surname"] = df['Name'].str.split(',').str[0]

In [162]:
# Title is the text between the first comma and the first period after it.
# .str.strip() removes the space left behind after the comma, so the stored
# values are "Mr" / "Mrs" / ... rather than " Mr" / " Mrs" / ..., which would
# silently fail equality checks such as df['Title'] == 'Mr'.
# n=1 splits only on the first comma and the first period.
df['Title'] = (
    df['Name']
    .str.split(',', n=1, expand=True)[1]
    .str.split('.', n=1, expand=True)[0]
    .str.strip()
)

In [163]:
# Keep only the given name(s): the text after "Surname, Title.".
# n=1 splits on the first comma and the first period only, so a later period
# inside the name (for example a middle initial written "L.") no longer cuts
# the value short. .str.strip() removes the space that follows the period.
# NOTE: this overwrites 'Name' with text that no longer has the
# "Surname, Title." prefix, so the Surname and Title cells must run before it,
# and re-running any of them afterwards requires reloading df first.
df['Name'] = (
    df['Name']
    .str.split(",", n=1, expand=True)[1]
    .str.split(".", n=1, expand=True)[1]
    .str.strip()
)

In [164]:
# Check the new Surname and Title columns and the rewritten Name column.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Title
0,1,0,3,Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr
1,2,1,1,John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs
2,3,1,3,Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss
3,4,1,1,Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs
4,5,0,3,William Henry,male,35.0,0,0,373450,8.05,,S,Allen,Mr


In [168]:
# Gather the three name-derived columns side by side for inspection.
new_df = df.loc[:, ['Title', 'Surname', 'Name']]
new_df.head(n=10)

Unnamed: 0,Title,Surname,Name
0,Mr,Braund,Owen Harris
1,Mrs,Cumings,John Bradley (Florence Briggs Thayer)
2,Miss,Heikkinen,Laina
3,Mrs,Futrelle,Jacques Heath (Lily May Peel)
4,Mr,Allen,William Henry
5,Mr,Moran,James
6,Mr,McCarthy,Timothy J
7,Master,Palsson,Gosta Leonard
8,Mrs,Johnson,Oscar W (Elisabeth Vilhelmina Berg)
9,Mrs,Nasser,Nicholas (Adele Achem)
