In [1]:
import pandas as pd
import numpy as np

In [2]:
d1 = {
    "social_media_followers": [1000, np.NaN, 20000, 130000, 17000, np.NaN, 41000, 160000, 220000, 100000],
    "sold_out": [1,0,0,1,0,0,0,1,0,1]
}

In [3]:
df1 = pd.DataFrame(d1)

In [4]:
df1

Unnamed: 0,social_media_followers,sold_out
0,1000.0,1
1,,0
2,20000.0,0
3,130000.0,1
4,17000.0,0
5,,0
6,41000.0,0
7,160000.0,1
8,220000.0,0
9,100000.0,1


In [5]:
X1 = df1[['social_media_followers']]

In [7]:
y1 = df1[["sold_out"]]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=123)

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
imputer = SimpleImputer(strategy='mean')

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
lr = LogisticRegression()

In [14]:
### there are two methods - make pipeline and just pipeline
from sklearn.pipeline import make_pipeline

In [15]:
pipe1 = make_pipeline(imputer, lr)

In [16]:
pipe1.fit(X1_train, y1_train)

  y = column_or_1d(y, warn=True)


In [17]:
pipe1.score(X1_train, y1_train)

0.7142857142857143

In [18]:
pipe1.score(X1_test, y1_test)

0.3333333333333333

In [19]:
pipe1.named_steps.simpleimputer.statistics_

array([102200.])

In [25]:
pipe1.named_steps.logisticregression.coef_

array([[4.89662604e-06]])

In [26]:
### more advanced pipeline
d2 = {
    "genre": ["Rock", "Metal", "Bluegrass", "Rock", np.NaN, "Rock", "Rock", np.NaN, "Bluegrass", np.NaN],
    "social_media_followers": [1000, np.NaN, 20000, 130000, 17000, np.NaN, 41000, 160000, 220000, 100000],
    "sold_out": [1,0,0,1,0,0,0,1,0,1]
}

In [30]:
df = pd.DataFrame(d2)

In [31]:
df

Unnamed: 0,genre,social_media_followers,sold_out
0,Rock,1000.0,1
1,Metal,,0
2,Bluegrass,20000.0,0
3,Rock,130000.0,1
4,,17000.0,0
5,Rock,,0
6,Rock,41000.0,0
7,,160000.0,1
8,Bluegrass,220000.0,0
9,,100000.0,1


In [32]:
X = df.iloc[:,0:2]
y = df.iloc[:,2]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [35]:
num_cols = ["social_media_followers"]
cat_cols = ["genre"]

In [37]:
num_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy = 'mean')),
        ('scale', StandardScaler())
    ]
)

In [38]:
from sklearn.preprocessing import OneHotEncoder

In [39]:
cat_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)

In [40]:
from sklearn.compose import ColumnTransformer

In [41]:
col_transformer = ColumnTransformer(
    transformers = [
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols)
    ],
    remainder='drop',
    n_jobs= -1
)

In [42]:
from sklearn.tree import DecisionTreeClassifier

In [43]:
dtc = DecisionTreeClassifier()

In [44]:
pipefinal = make_pipeline(col_transformer, dtc)

In [45]:
pipefinal.fit(X_train, y_train)

In [47]:
pipefinal.score(X_test, y_test)

0.6666666666666666

In [48]:
### saving the pipeline
import joblib


In [49]:
joblib.dump(pipefinal, 'pipe.joblib')

['pipe.joblib']

In [50]:
example_test_pipeline = joblib.load('pipe.joblib')

In [52]:
example_test_pipeline.fit(X_train, y_train)