In [6]:
import pandas as pd
import numpy as np

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [9]:
df = pd.read_csv('train.csv')

In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#PLAN
age aur embarked me missing values hai. Therefore we will create a pipeline as follows:
missing_values-> onehotencodeing-> Scaling-> Feature Selection-> Training model using decision tree


In [12]:
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [15]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [17]:
y_train.sample(5)

220    1
846    0
230    1
632    1
665    0
Name: Survived, dtype: int64

In [19]:
# 1st step: column transformer create karna: Imputation
t1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]), # rather than calling your columns by age, call it by index value. aage dikkat nahi dega
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [21]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


iss point par humare pass 7 columns hai. Jab hum OneHotEncoding karenge toh sex ke 2 aur embarked ke 3 columns ban jayenge

In [24]:
#onehotencoding
t2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6]),
],remainder='passthrough')

In [26]:
# Scaling
t3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10)) # sabhi 10 columns ko select kiya hai. 
])
# feature selection ke liye minmax scaler use karna padega

In [28]:
# feature selection
t4 = SelectKBest(score_func=chi2,k='all') # k ki value = top k columns select kiye hai

In [30]:
t5 = DecisionTreeClassifier()

# creating Pipeline

In [33]:
# isme hume ek list of tuples pass karne hai jisme transformation ka naam aur uska object pass karna hai
pipe = Pipeline([
    ('t1',t1),
    ('t2',t2),
    ('t3',t3),
    ('t4',t4),
    ('t5',t5)
])

In [35]:
# alternative syntax
#pipe = make_pipeline(t1,t2,t3,t4,t5)

In [37]:
#display pipeline
from sklearn import set_config
set_config(display='diagram')

In [39]:
#train
pipe.fit(x_train,y_train)

In [41]:
#code
pipe.named_steps # pipe. aur fir tab press karo
#named_steps humko ye batata hai ki humari pipeline kon konse steps follow kar rahi hai

{'t1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 't2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 't3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 't4': SelectKBest(k='all', score_func=<function chi2 at 0x1637afec0>),
 't5': DecisionTreeClassifier()}

In [43]:
# Re-fit just t1 separately to access imputation stats
t1.fit(x_train)



In [45]:
#Access statistics_ from the imputers
age_imputer_stats = t1.transformers_[0][1].statistics_
embarked_imputer_stats = t1.transformers_[1][1].statistics_

In [47]:
age_imputer_stats, embarked_imputer_stats

(array([29.49884615]), array(['S'], dtype=object))

In [49]:
#predict
y_pred = pipe.predict(x_test)

In [51]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)*100

62.56983240223464

In [53]:
y_pred # kam aa raha hai kyuki feature selection karke kuch columns hateye the humne

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

# Cross validation using pipelines

In [56]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score

cross_val_score(pipe, x_train, y_train, cv=60, scoring='accuracy').mean()
# ye 'cv' parameter baar baar model ko train test split karta hai for better accuracy

0.6392676767676768

exporting this pipeline

In [58]:
import pickle
pickle.dump(pipe,open('models/pipe.pkl','wb'))

In [59]:
x_train.columns


Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [63]:
print(x_train.shape)
print(x_train.columns)


(712, 7)
Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


In [65]:
# from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# After training
print("Expected input shape (before transformation):", pipe.named_steps['t1'].transformers_[0][1].statistics_.shape)
print("Final estimator input features:", pipe.named_steps['t5'].n_features_in_)


Expected input shape (before transformation): (1,)
Final estimator input features: 10


In [67]:
y2_pred = pipe.predict(x_train)

In [71]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train,y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [712, 179]