# [Titanic Data Science Solutions](<https://www.kaggle.com/startupsci/titanic-data-science-solutions>)
## Combined with my own flow

## Step 0: import libraries

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Step 1: Load and preview data

In [2]:
# Read the train and test CSVs from the local data directory.
train_df, test_df = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))
# Keep both frames in one list so they can be handled together.
combine = [train_df, test_df]

In [3]:
# Preview the first ten rows of the training frame.
train_df.iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Step 2: Columns drop | selection

In [4]:
# Columns to remove from the training frame.
# Only train_df is modified here; test_df keeps these columns.
remove_cols = ['PassengerId', 'Ticket']
print(f'Before drop: {train_df.columns}')
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
# The drop stays in place so the same DataFrame object (also referenced from
# `combine`) is the one that gets updated.
train_df.drop(columns=remove_cols, inplace=True, errors='ignore')
print(f'After drop: {train_df.columns}')
# TODO how to show sorted values in a row

Before drop: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
After drop: Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')


In [5]:
# Alphabetically ordered list of the column names in the training frame.
sorted(train_df.columns.tolist())

['Age',
 'Cabin',
 'Embarked',
 'Fare',
 'Name',
 'Parch',
 'Pclass',
 'Sex',
 'SibSp',
 'Survived']

## Step 3: Regulate column dtypes, handle missing values

In [6]:
# Check missing values: per-column null counts first ...
print(f'Columns with missing value #: \n{train_df.isnull().sum()}')
print('\n--- another way ---\n')
# ... then the non-null counts and dtypes reported by DataFrame.info().
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
train_df.info()

Columns with missing value #: 
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

--- another way ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB
None


In [7]:
# Missing-data handling.
# Cabin is null in 687 of 891 rows (~77%), so the whole column is dropped.
# errors='ignore' keeps the cell re-runnable once Cabin is already gone.
train_df.drop(columns='Cabin', inplace=True, errors='ignore')

# Fill missing Age values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the train_df['Age'] selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves train_df unchanged once
# Copy-on-Write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

# NOTE: Embarked still has 2 missing values; they are not handled in this cell.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


## Step 4: Separate columns into Categorical, Numerical and Num-Cat 

In [8]:
# Sort every column into one of three groups in a single pass:
#   cols_cat    - object dtype with at most 10 distinct values
#   cols_num    - non-object dtype with more than 10 distinct values
#   cols_numcat - non-object dtype with at most 10 distinct values
# Object columns with more than 10 distinct values end up in none of the lists.
cols_cat, cols_num, cols_numcat = [], [], []
for column in train_df.columns.values:
    is_text = train_df.dtypes[column] == 'object'
    # unique() counts a missing value as its own distinct entry.
    few_levels = train_df[column].unique().size <= 10
    if is_text:
        if few_levels:
            cols_cat.append(column)
    elif few_levels:
        cols_numcat.append(column)
    else:
        cols_num.append(column)

print(f'Categorical columns: {cols_cat}')
print(f'Numerical columns: {cols_num}')
print(f'Numerical Catergorical cloumns: {cols_numcat}')


Categorical columns: ['Sex', 'Embarked']
Numerical columns: ['Age', 'Fare']
Numerical Catergorical cloumns: ['Survived', 'Pclass', 'SibSp', 'Parch']


In [9]:
# further modification if necessary


In [10]:
# Display units for each column group ('' means no unit).
# Each list is presumably matched by position with its column list
# (cols_cat, cols_num, cols_numcat), so the lengths are kept equal:
# cols_cat holds two columns, hence two entries (previously three).
cols_cat_units = ['', '']               # Sex, Embarked
cols_num_units = ['Year', '$']          # Age, Fare
cols_numcat_units = ['', '', '#', '#']  # Survived, Pclass, SibSp, Parch


## Step 5: EDA

### 5.1 columns distribution without interaction with target feature

In [11]:
from typing import List, Optional
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.graph_objs import layout
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [12]:
# Numerical columns: draw a histogram figure, then a violin figure.
from mc_plot import mc_box_solo, mc_hist_solo, mc_violin_solo

for plot_numeric in (mc_hist_solo, mc_violin_solo):
    # Each helper is given the frame, the numerical column names and their
    # units; .show() is called on whatever it returns.
    plot_numeric(train_df, cols_num, cols_num_units).show()


In [13]:
# Categorical view: bar plots for the text categoricals and the
# low-cardinality numeric columns together.
from mc_plot import mc_bar_solo

mc_bar_solo(train_df, [*cols_cat, *cols_numcat])


In [14]:
# TODO smart grid system: cap the grid at 4 columns and wrap any additional figures onto the next row
# TODO more viz