In [27]:
import numpy as np
import pandas as pd

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [29]:
df = pd.read_csv('train.csv')

In [30]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Let's start the process

In [5]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [6]:
# Step 1 -> train/test/split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [7]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [8]:
y_train.sample(5)

352    0
865    1
48     0
765    1
207    1
Name: Survived, dtype: int64

# Step 1

In [9]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')


# Step 2

In [10]:
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

# Step 3

In [11]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])


# Step 4

In [12]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=8)       #k=8 means im selecting top 8 features from the 10 columns

# Step 5

In [13]:
# train the model
trf5 = DecisionTreeClassifier()

# Create Pipeline

In [14]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

# Pipeline Vs make_pipeline

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [151]:
# Alternate Syntax
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [15]:
# train
pipe.fit(X_train,y_train)

# Explore the Pipeline

In [37]:
# Code here
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.49884615])

In [17]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [18]:
# Predict
y_pred = pipe.predict(X_test)

In [19]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

# Cross Validation using Pipeline

In [21]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.6391214419383433

# GridSearch using Pipeline

In [22]:
# gridsearchcv
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [23]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [24]:
grid.best_score_

0.6391214419383433

In [25]:
grid.best_params_

{'trf5__max_depth': 2}

# Exporting the Pipeline

In [26]:
# export 
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

## creating pipeline to get accuracy of different models

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

In [41]:
# Create an instance of SimpleImputer for numerical column
numerical_imputer = SimpleImputer(strategy='mean')

# Columns to impute for numerical data
numerical_column = ['Age']

# Fit and transform the numerical column and update the original data
df[numerical_column] = numerical_imputer.fit_transform(df[numerical_column])


# Create an instance of SimpleImputer for categorical column
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Columns to impute for categorical data
categorical_column = ['Embarked']

# Fit and transform the categorical column and update the original data
df[categorical_column]=categorical_imputer.fit_transform(df[categorical_column])

In [42]:
df2 = pd.get_dummies(df,drop_first=True)

In [47]:
y = df2['Survived']

X = df2.drop('Survived',axis=1)

In [48]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [49]:
MLA = [
       ensemble.AdaBoostClassifier(),
       ensemble.BaggingClassifier(),
       ensemble.GradientBoostingClassifier(),
       ensemble.RandomForestClassifier(), 
       linear_model.SGDClassifier(),
       naive_bayes.GaussianNB(),
       neighbors.KNeighborsClassifier(),
       tree.DecisionTreeClassifier(),
       tree.ExtraTreeClassifier(),
]

In [50]:
name = []
testscore = []
for alg in MLA:
    name.append(alg.__class__.__name__)
    alg.fit(X_train, y_train)
    testscore.append(alg.score(X_test, y_test))

In [51]:
comparison = pd.DataFrame({"name": name, "testscore": testscore})

In [52]:
comparison

Unnamed: 0,name,testscore
0,AdaBoostClassifier,0.798206
1,BaggingClassifier,0.834081
2,GradientBoostingClassifier,0.852018
3,RandomForestClassifier,0.834081
4,SGDClassifier,0.7713
5,GaussianNB,0.780269
6,KNeighborsClassifier,0.721973
7,DecisionTreeClassifier,0.766816
8,ExtraTreeClassifier,0.753363


## sorting the data from high to low based on the testscore

In [53]:
comparison.sort_values(by='testscore' ,ascending=False)

Unnamed: 0,name,testscore
2,GradientBoostingClassifier,0.852018
1,BaggingClassifier,0.834081
3,RandomForestClassifier,0.834081
0,AdaBoostClassifier,0.798206
5,GaussianNB,0.780269
4,SGDClassifier,0.7713
7,DecisionTreeClassifier,0.766816
8,ExtraTreeClassifier,0.753363
6,KNeighborsClassifier,0.721973
