 Using pipelines

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

Uploading dataset

Use of python widget to upload .csv files

In [None]:
import ipywidgets as widgets
from IPython.display import display
import os

box_layout = widgets.Layout(
    display="flex",
    flex_flow="column",
    align_items="center",
    border="1px solid #E0E0E0",
    width="400px",
    padding="25px",
    border_radius="10px"
)

button_layout = widgets.Layout(width="100%", height="40px")

upload_layout = widgets.Layout(width="100%")

title = widgets.HTML(
    "<h3 style='margin-bottom:10px;'>Upload CSV Files üëá</h3>"
)

subtitle = widgets.HTML(
    "<p style='color:gray; margin-top:0;'>Select one or more xlsx files to upload and save</p>"
)

train_uploader = widgets.FileUpload(
    accept='.csv',
    multiple=True,
    layout=upload_layout
)

save_button = widgets.Button(
    description='Save Files',
    button_style='primary',
    icon='save',
    layout=button_layout
)

status = widgets.HTML("<p style='color:gray;'>Waiting for upload...</p>")

output = widgets.Output()

def save_files(change=None):
    with output:
        output.clear_output()
        
        if train_uploader.value:
            saved_files = []
            for filename, file_info in train_uploader.value.items():
                with open(filename, "wb") as f:
                    f.write(file_info['content'])
                saved_files.append(filename)
            
            status.value = f"<p style='color:green;'>Saved {len(saved_files)} file(s) üëç</p>"
        else:
            status.value = "<p style='color:red;'>‚ö†Ô∏è No files uploaded yet</p>"

train_uploader.observe(save_files, names='value')

save_button.on_click(save_files)

card = widgets.VBox(
    [title, subtitle, train_uploader, save_button, status],
    layout=box_layout
)

display(card)
display(output)

VBox(children=(HTML(value="<h3 style='margin-bottom:10px;'>Upload CSV Files üëá</h3>"), HTML(value="<p style='co‚Ä¶

Output()

In [4]:
df = pd.read_csv('file.csv')

In [5]:
df.sample(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
643,644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S
790,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
584,585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C
754,755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48.0,1,2,220845,65.0,,S
763,764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36.0,1,2,113760,120.0,B96 B98,S
501,502,0,3,"Canavan, Miss. Mary",female,21.0,0,0,364846,7.75,,Q
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


Our pipeline

1. Impute missing values - age, embarked

2. One hot encoding - sex, embarked

3. Scaling

4. Feature selection

5. Model training

In [6]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [8]:
df.sample(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
522,0,3,male,,0,0,7.225,C
661,0,3,male,40.0,0,0,7.225,C


Train Test Split

In [9]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

Missing value imputation transformer

In [12]:
t1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')

One hot encoding

In [14]:
t2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1,6])
], remainder='passthrough')

Scaling

In [16]:
t3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,10))
])

Feature selection

In [17]:
t4 = SelectKBest(score_func=chi2, k=5)

Training model

In [18]:
t5 = DecisionTreeClassifier()

Creating pipeline of all steps done above

In [19]:
pipe = Pipeline([
    ('t1', t1),
    ('t2', t2),
    ('t3', t3),
    ('t4', t4),
    ('t5', t5)
])

Using our pipeline

In [20]:
pipe.fit(X_train, y_train)

Explore pipeline

In [21]:
pipe.named_steps

{'t1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 't2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 't3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 't4': SelectKBest(k=5, score_func=<function chi2 at 0x7a0260a728e0>),
 't5': DecisionTreeClassifier()}

Prediction

In [22]:
y_predict = pipe.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.6256983240223464