In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import load_dataset # this method will help us to #download the Titanic dataset

In [2]:
data = load_dataset("titanic")
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [3]:
# Check for duplicates: alive ~ survived, embark_town ~ embarked, adult_male,who ~ sex, alone ~ sibsp, pclass ~ class
# deck --> too many NaN values 
# fare can be deduced for pclass 
columns = ['alive', 'alone', 'embark_town', 'who', 'adult_male', 'deck','class','fare']
data_2 = data.drop(columns, axis=1)

In [4]:
# Check for missing values 
# Ways to deal with missing values:

# Delete rows/columns 

# 300 rows, 50 columns ---> 10 rows have more than 45 values missing 
# 300 rows, 50 columns --> 25 columns with more than 200 missing values 


# For numeric -- replace with mean
# For category -- replace with most frequent category

#Check max and min age

print("Max value of age column :", data_2['age'].max())
print("Min value of age column :", data_2['age'].min())

Max value of age column : 80.0
Min value of age column : 0.42


In [5]:
# As the max and min are so different, lets try to visualize categories of people in the dataset
# bins [1, 2, 3, 4] indicate (1,2], (2,3], (3,4]

bins = [0, 5, 17, 25, 50, 80]
labels = ['Infant', 'Kid', 'Young', 'Adult', 'Old']
data_2['age'] = pd.cut(data_2['age'], bins = bins, labels=labels)

In [6]:
print(data_2['age'])

0      Young
1      Adult
2      Adult
3      Adult
4      Adult
       ...  
886    Adult
887    Young
888      NaN
889    Adult
890    Adult
Name: age, Length: 891, dtype: category
Categories (5, object): [Infant < Kid < Young < Adult < Old]


In [7]:
pd.DataFrame(data_2['age'].value_counts())

Unnamed: 0,age
Adult,349
Young,188
Kid,69
Old,64
Infant,44


In [8]:
# Age still has null values so we can fill null values with mode of this column. (Categorical)

data_3 = data_2.fillna({'age' : data_2['age'].mode()[0]})
print(data_3['age'])

0      Young
1      Adult
2      Adult
3      Adult
4      Adult
       ...  
886    Adult
887    Young
888    Adult
889    Adult
890    Adult
Name: age, Length: 891, dtype: category
Categories (5, object): [Infant < Kid < Young < Adult < Old]


In [9]:
# To fill NA in embarked --> check for most frequent occurrence

print(data_3['embarked'].unique())
print("How many 'S' on embarked column :", data_2[data_2['embarked'] == 'S'].shape[0])
print("How many 'C' on embarked column :", data_2[data_2['embarked'] == 'C'].shape[0])
print("How many 'Q' on embarked column :", data_2[data_2['embarked'] == 'Q'].shape[0])

['S' 'C' 'Q' nan]
How many 'S' on embarked column : 644
How many 'C' on embarked column : 168
How many 'Q' on embarked column : 77


In [10]:
data_4 = data_3.fillna({'embarked' : 'S'})

In [11]:
# The Pandas getdummies function creates dummy variables from Pandas 
# A dummy variable is a numeric variable that encodes categorical information.
# In a dummy variable:

# A 1 encodes the presence of a category
# A 0 encodes the absence of a category

dummies = ['age', 'embarked', 'sex']
dummy_data = pd.get_dummies(data_4[dummies])
print(data_4['age'])
print(dummy_data)

0      Young
1      Adult
2      Adult
3      Adult
4      Adult
       ...  
886    Adult
887    Young
888    Adult
889    Adult
890    Adult
Name: age, Length: 891, dtype: category
Categories (5, object): [Infant < Kid < Young < Adult < Old]
     age_Infant  age_Kid  age_Young  age_Adult  age_Old  embarked_C  \
0             0        0          1          0        0           0   
1             0        0          0          1        0           1   
2             0        0          0          1        0           0   
3             0        0          0          1        0           0   
4             0        0          0          1        0           0   
..          ...      ...        ...        ...      ...         ...   
886           0        0          0          1        0           0   
887           0        0          1          0        0           0   
888           0        0          0          1        0           0   
889           0        0          0          1

In [12]:
dummy_data.shape

(891, 10)

In [13]:
data_5 = pd.concat([data_4, dummy_data], axis = 1)
data_5.drop(dummies, axis=1, inplace=True)

In [14]:
data_5

Unnamed: 0,survived,pclass,sibsp,parch,age_Infant,age_Kid,age_Young,age_Adult,age_Old,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,0,3,1,0,0,0,1,0,0,0,0,1,0,1
1,1,1,1,0,0,0,0,1,0,1,0,0,1,0
2,1,3,0,0,0,0,0,1,0,0,0,1,1,0
3,1,1,1,0,0,0,0,1,0,0,0,1,1,0
4,0,3,0,0,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,0,0,0,1,0,0,0,1,0,1
887,1,1,0,0,0,0,1,0,0,0,0,1,1,0
888,0,3,1,2,0,0,0,1,0,0,0,1,1,0
889,1,1,0,0,0,0,0,1,0,1,0,0,0,1


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [16]:
X = data_5.drop('survived', axis = 1)
y = data_5['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [17]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1])

In [18]:
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

0.8067796610169492
[[159  32]
 [ 25  79]]
