In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training data from the notebook's working directory.
# NOTE(review): the columns shown in the preview look like the Kaggle Titanic set - confirm the source.
da = pd.read_csv('train.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
da.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Drop the identifier / free-text columns so they are not fed to the model.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it on the already-trimmed frame no
# longer raises KeyError for the missing columns.
da = da.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [None]:
# Preview again to confirm PassengerId, Name, Ticket and Cabin are gone.
da.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
# With the 891 rows mentioned in the original note, scikit-learn rounds the test
# size up, so the test set has 179 rows (and the training set 712).
# 'Survived' is the target (y); every other remaining column is a feature (X).
x_train, x_test, y_train, y_test = train_test_split(
    da.drop(columns='Survived'),
    da['Survived'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the training features; the column order shown here is what the
# positional indices in the first ColumnTransformer refer to.
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


STEP - I

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# imputation transformer

from sklearn.impute import SimpleImputer

# Step 1 - fill in missing values (they occur in Age and Embarked).
# Positions refer to the columns of x_train:
#   0 Pclass, 1 Sex, 2 Age, 3 SibSp, 4 Parch, 5 Fare, 6 Embarked
# Age is numeric, so it gets the column mean (SimpleImputer's default strategy).
# Embarked holds strings, so it gets the most frequent value (the mode).
tf1 = ColumnTransformer(
    transformers=[
        ('impute_age', SimpleImputer(strategy='mean'), [2]),
        ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
    ],
    remainder='passthrough',  # keep the untouched columns; the default would drop them
)
# NOTE: the output is an array with the imputed columns first, followed by the
# passthrough columns: Age, Embarked, Pclass, Sex, SibSp, Parch, Fare.

STEP - II


In [None]:
 # onehotencoding transformer

 # for embarked and Sex col
from sklearn.preprocessing import OneHotEncoder

# Step 2 - one-hot encode the two categorical columns.
# After the imputation step the columns arrive in the order
#   0 Age, 1 Embarked, 2 Pclass, 3 Sex, 4 SibSp, 5 Parch, 6 Fare
# so Sex is at position 3 and Embarked at position 1 (not 1 and 6 as in x_train).
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising an error at predict time.
tf2 = ColumnTransformer(
    transformers=[
        ('ohe_sex', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [3]),
        ('ohe_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1]),
    ],
    remainder='passthrough',
)

# A single encoder over both columns would also work, but it must use the
# post-imputation positions [1, 3]; [1, 6] would encode Embarked and Fare.
# tf2 = ColumnTransformer([('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 3])], remainder='passthrough')


STEP - III

In [None]:
# MinMaxScaler all features into fixed range
from sklearn.preprocessing import MinMaxScaler

# Step 3 - rescale every feature to the [0, 1] range with MinMaxScaler.
# slice(0, 10) selects column positions 0-9, i.e. all ten columns coming out of
# the encoding step: 2 one-hot columns for Sex, 3 for Embarked, plus the 5
# passthrough columns (Age, Pclass, SibSp, Parch, Fare).
tf3 = ColumnTransformer(
    transformers=[('scale_cols', MinMaxScaler(), slice(0, 10))],
    remainder='passthrough',
)

STEP - IV

In [None]:
# feature selection

from sklearn.feature_selection import SelectKBest , chi2
# Step 4 - feature selection: keep the k=8 features with the highest score.
# The scoring function is the chi-squared test, which only accepts
# non-negative feature values; the preceding MinMax scaling guarantees that
# for the training data.
tf4 = SelectKBest(chi2, k=8)

STEP - V

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Step 5 - the final estimator: a decision tree classifier.
# random_state is fixed so that refitting produces the same tree and therefore
# the same predictions and accuracy; without it scikit-learn permutes the
# features at every split and breaks ties randomly. 42 matches the seed
# already used for train_test_split.
tf5 = DecisionTreeClassifier(random_state=42)

PIPELINES

In [None]:
from sklearn.pipeline import Pipeline
# Pipeline is used to chain multiple steps (transformers + model) together in ml workflow

# Chain the five stages so that fit/predict run them in order:
# impute -> one-hot encode -> scale -> select features -> decision tree.
# Each step is a (name, estimator) tuple: the name is the key used to look the
# step up later through pipe.named_steps, the estimator is the transformer or model.
pipe = Pipeline(
    steps=[
        ('trf1', tf1),
        ('trf2', tf2),
        ('trf3', tf3),
        ('trf4', tf4),
        ('trf5', tf5),
    ]
)


In [None]:
# Alternative syntax: make_pipeline does not need step names (it generates them
# automatically), in the same way as make_column_transformer.
# from sklearn.pipeline import make_pipeline
# pipe = make_pipeline(tf1, tf2, tf3, tf4, tf5)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Train the whole pipeline on the training split.
pipe.fit(x_train,y_train)

# The pipeline fits each transformer on x_train and passes the transformed data to the next step; y_train is left unchanged.
# The final DecisionTreeClassifier is then trained on the fully transformed x_train and the original y_train.

EXPLORING THE PIPELINE

In [None]:
pipe.named_steps        # mapping of step name -> fitted step (key/value form)

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [3]),
                                 ('ohe_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1])]),
 'trf3': ColumnTransformer(remainder='passthrough',
                   transformers=[('scale_cols', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'trf4': SelectKBes

In [None]:
# First fitted transformer of step 'trf1', as a (name, transformer, columns) tuple.
pipe.named_steps['trf1'].transformers_[0]

('impute_age', SimpleImputer(), [2])

In [None]:
pipe.named_steps['trf1'].transformers_[0][1]       # element 1 of that tuple is the fitted SimpleImputer for Age

In [None]:
# Fill value the Age imputer learned from the training data (the column mean).
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.49884615])

In [None]:
# Same for Embarked: the most frequent value learned from the training data.
pipe.named_steps['trf1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [None]:
# Send x_test through the already-fitted transformers and predict with the tree.
y_pred = pipe.predict(x_test)

In [None]:
# Predicted Survived labels (0 or 1), one per test row.
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1])

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the true label.
# Keyword arguments make the (true, predicted) order explicit.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7877094972067039