In [34]:
import numpy as np
import pandas as pd

In [35]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [36]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
for_EDA = df_train.copy() #for doing EDA

In [38]:
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder

In [39]:
labelen = LabelEncoder()
ordinal = OrdinalEncoder()

In [40]:
for_EDA = for_EDA.drop(columns=['Name','PassengerId']) # not significant

In [41]:
print('Ticket classes:',len(df_train['Ticket'].value_counts()))
print('Cabin classes:',len(df_train['Cabin'].value_counts()))
print('Embarked classes:',len(df_train['Embarked'].value_counts()))

Ticket classes: 681
Cabin classes: 147
Embarked classes: 3


In [42]:
for_EDA = for_EDA.drop(columns=['Ticket',"Cabin"]) # Too many classes (not significant) , Cabin too many classes and too many missing values

In [43]:
for_EDA

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [44]:
#do One hot encoding using get_dummies
EDA_ecnd = pd.get_dummies(for_EDA,dtype=float)
EDA_ecnd

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.2500,0.0,1.0,0.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,1,3,26.0,0,0,7.9250,1.0,0.0,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1000,1.0,0.0,0.0,0.0,1.0
4,0,3,35.0,0,0,8.0500,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,0.0,1.0,0.0,0.0,1.0
887,1,1,19.0,0,0,30.0000,1.0,0.0,0.0,0.0,1.0
888,0,3,,1,2,23.4500,1.0,0.0,0.0,0.0,1.0
889,1,1,26.0,0,0,30.0000,0.0,1.0,1.0,0.0,0.0


In [45]:
#Check Correlation for Feature selection
pd.DataFrame(EDA_ecnd.corrwith(EDA_ecnd['Survived']).sort_values(ascending=False))

Unnamed: 0,0
Survived,1.0
Sex_female,0.543351
Fare,0.257307
Embarked_C,0.16824
Parch,0.081629
Embarked_Q,0.00365
SibSp,-0.035322
Age,-0.077221
Embarked_S,-0.15566
Pclass,-0.338481


**_Exploratort Data Analysis_**

In [46]:
#1fill missing values

**_PipeLine_**

In [47]:
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [48]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

In [50]:
class DropFeatures:
    def __init__(self, add_attributes=True):
        
        self.add_attributes = add_attributes
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        if self.add_attributes:
            df = X.copy()
            S = df.drop(columns=['Cabin','Name','PassengerId','Ticket'])
            
            return S
        else:
            return S

In [51]:
DropFeatures = DropFeatures(add_attributes=True)

In [52]:
pad_missing_values = Pipeline(steps=[
    ('KNNImputer',KNNImputer()),
    ('MinMaxScale',MinMaxScaler())
])
ordinal_transformer = Pipeline(steps=[
    ('SimpleImputer',SimpleImputer(strategy='most_frequent')),
    ('Onehot',OneHotEncoder())
])

In [53]:
preprocess = ColumnTransformer(transformers=[
    ('Age',pad_missing_values,['Fare']),
    ('categorial features',ordinal_transformer,['Sex','Embarked'])
],
remainder='passthrough')

In [54]:
class Convert2DataFrame:
    def __init__(self, add_attributes=True):
        
        self.add_attributes = add_attributes
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        if self.add_attributes:
            df = X.copy()
            S = pd.DataFrame(df)
            
            return S
        else:
            return S

In [55]:
Convert2DataFrame = Convert2DataFrame(add_attributes=True)

In [56]:
pipeline = make_pipeline(DropFeatures,preprocess,Convert2DataFrame)
pipeline

**_Prepare Training Set_**

In [57]:
temp = df_train.copy()
temp = temp.drop(columns=['Survived'])

In [60]:
X = pipeline.fit_transform(temp)

In [61]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.014151,0.0,1.0,0.0,0.0,1.0,3.0,22.0,1.0,0.0
1,0.139136,1.0,0.0,1.0,0.0,0.0,1.0,38.0,1.0,0.0
2,0.015469,1.0,0.0,0.0,0.0,1.0,3.0,26.0,0.0,0.0
3,0.103644,1.0,0.0,0.0,0.0,1.0,1.0,35.0,1.0,0.0
4,0.015713,0.0,1.0,0.0,0.0,1.0,3.0,35.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
886,0.025374,0.0,1.0,0.0,0.0,1.0,2.0,27.0,0.0,0.0
887,0.058556,1.0,0.0,0.0,0.0,1.0,1.0,19.0,0.0,0.0
888,0.045771,1.0,0.0,0.0,0.0,1.0,3.0,,1.0,2.0
889,0.058556,0.0,1.0,1.0,0.0,0.0,1.0,26.0,0.0,0.0


In [62]:
y = pd.DataFrame(df_train['Survived'])
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [63]:
from sklearn.model_selection import train_test_split

In [64]:
X_train, X_valid, y_train, y_valid = train_test_split(X , y, test_size=0.2, random_state=0, stratify=y)

**_Model Training_**

In [65]:
from sklearn.metrics import accuracy_score

In [66]:
import xgboost as xgb

In [67]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train,y_train)

In [68]:
xgb_pred = xgb_model.predict(X_valid)
acc_xgb = accuracy_score(y_true=y_valid, y_pred= xgb_pred)

In [69]:
acc_xgb

0.8156424581005587

In [71]:
test_set = pipeline.fit_transform(df_test)

In [72]:
xgb_pred2 = xgb_model.predict(test_set)

Random Forrest

In [85]:
from sklearn.ensemble import RandomForestClassifier

In [103]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=2)
rf_model.fit(X=X_train,y=y_train)

  return fit_method(estimator, *args, **kwargs)


In [104]:
rf_predict = rf_model.predict(X=X_valid)
acc_rf = accuracy_score(y_true=y_valid,y_pred=rf_predict)
acc_rf

0.8212290502793296

In [107]:
rf_pred2 = rf_model.predict(test_set)

In [105]:
Id = df_test['PassengerId']


In [106]:
Id

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [108]:
df_submission = pd.DataFrame({'PassengerId': Id, 'Survived': rf_pred2})

In [109]:
df_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [110]:
df_submission.to_csv('submission.csv',index=False)