#### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')



#### Import Data

In [3]:
train = pd.read_csv('dataset/titanic/train.csv')
test = pd.read_csv('dataset/titanic/test.csv')

In [4]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
idtest = test['PassengerId']

#### Pre-processing and Exploratory Data Analysis

##### Outliers - if we are taking random sample, and some observations are at an abnormal distance from the other observations, then we're most probably looking at outliers.

https://www.youtube.com/watch?v=9aDHbRb4Bf8&t=10s

In [9]:
def detect_outliers(df,n,features):
    outlier_indices = []
    
    # iterating over dataframe
    for col in features:
        
        # Find 1st quartile
        Q1 = np.percentile(df[col],25)
        
        # Find 3rd quartile
        Q3 = np.percentile(df[col],75)
        
        # Finding Interquartile Range
        IQR = Q3-Q1
        
        # Setting outlier step to 1.5
        outlier_step = 1.5 * IQR
        
        outlier_list_col = df[(df[col] < Q1-outlier_step) | (df[col] > Q3+outlier_step)].index
        
        # appending indices
        outlier_indices.extend(outlier_list_col)
        
        
    outlier_indices = Counter(outlier_indices)
    
    multiple_outliers = list(k for k,v in outlier_indices.items() if v>n)
    
    return multiple_outliers

In [12]:
c = 'x v v v x v v r r w w w w x x f'.split()
Counter(c)

Counter({'x': 4, 'v': 5, 'r': 2, 'w': 4, 'f': 1})

In [13]:
outliers = detect_outliers(train,2,['Age','SibSp','Fare','Parch'])

outliers

[27, 88, 159, 180, 201, 324, 341, 792, 846, 863]

In [17]:
Q1 = np.percentile(train['SibSp'],25)
Q1

0.0

In [18]:
train.shape

(891, 12)

In [19]:
train = train.drop(outliers,axis=0).reset_index(drop=True)
train.shape

(881, 12)

In [20]:
train_len = len(train)

df = pd.concat(objs=[train,test],axis=0).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1299 non-null   int64  
 1   Survived     881 non-null    float64
 2   Pclass       1299 non-null   int64  
 3   Name         1299 non-null   object 
 4   Sex          1299 non-null   object 
 5   Age          1043 non-null   float64
 6   SibSp        1299 non-null   int64  
 7   Parch        1299 non-null   int64  
 8   Ticket       1299 non-null   object 
 9   Fare         1298 non-null   float64
 10  Cabin        292 non-null    object 
 11  Embarked     1297 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 121.9+ KB


##### Check for missing values

In [21]:
df.fillna(np.nan,inplace=True)
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             256
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1007
Embarked          2
dtype: int64

#### Feature Analysis

In [None]:
plt.figure()