# use piplines predic the tatinic data 

In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,MinMaxScaler,FunctionTransformer
from sklearn .impute import SimpleImputer
from sklearn .tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn .pipeline import Pipeline,make_pipeline
from sklearn .feature_selection import SelectKBest,chi2
import pickle 




In [3]:
data=pd.read_csv(r"C:\Users\mdfir\Downloads\train.csv")

In [4]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
data.drop(columns=["PassengerId","Name","Ticket","Cabin"],inplace=True)

In [6]:
data.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [7]:
x_train,x_test,y_train,y_test=train_test_split(data.drop(columns=["Survived"]),data["Survived"],test_size=.2,random_state=42)

In [8]:
x_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


# impute transformer

In [121]:
data["Fare"].value_counts()

Fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
35.0000     1
28.5000     1
6.2375      1
14.0000     1
10.5167     1
Name: count, Length: 248, dtype: int64

In [10]:
data["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [11]:
data["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [12]:
step_1=ColumnTransformer([
("impute_age",SimpleImputer(),[2]),
("impute_embaked",SimpleImputer(strategy="most_frequent"),[6])
],remainder="passthrough")

# onehotencoder

In [45]:
step_2=ColumnTransformer([
("one_hot_sex_emb",OneHotEncoder(sparse_output=False,handle_unknown="ignore"),[1,6])
])

# scaling the value 

In [48]:
step_3=ColumnTransformer([
  ("scale",MinMaxScaler(),slice(0,10))  
],remainder="passthrough")

# feature selection

In [51]:
step_4=SelectKBest(score_func=chi2,k=5)

# train model

In [54]:
step_5=DecisionTreeClassifier()

# use piplines 

In [86]:
model=Pipeline([
    ("step_1",step_1),
    ("step_2",step_2),
    ("step_3",step_3),
    ("step_4",step_4),
    ("step_5",step_5),
    
])

# use pickle in the data set

In [111]:
file="DecisionTreeClassifier.pkl"

In [113]:
pickle.dump(model,open(file,"wb"))

# make_pipline

In [89]:
make_pipelines=make_pipeline(step_1,step_2,step_3,step_4,step_5)

In [91]:
model.fit(x_train,y_train)

0,1,2
,steps,"[('step_1', ...), ('step_2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embaked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('one_hot_sex_emb', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function chi...00228DBA5E840>
,k,5

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


# display the piplines 

In [64]:
#from sklearn import set_config

In [66]:
#set_config(display="digram")

# y_pred

In [93]:
y_pred=model.predict(x_test)

In [95]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [97]:
from sklearn .metrics import accuracy_score

In [99]:
score=accuracy_score(y_pred,y_test)

In [101]:
score

0.6256983240223464

In [103]:
pip.named_steps["step_1"].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [105]:
pip.named_steps["step_1"].transformers_[0][1].statistics_

array([29.49884615])