In [24]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe

In [19]:
df = pd.read_csv("dataset.csv")
df

Unnamed: 0,jenis_kelamin,umur,gaji,status,transportasi
0,pria,20,8000000,single,1
1,pria,35,14000000,single,0
2,wanita,26,10000000,single,0
3,wanita,27,12000000,menikah,1
4,pria,21,9000000,single,1
5,pria,22,11000000,single,1
6,wanita,32,15000000,menikah,0
7,wanita,26,8000000,menikah,0
8,pria,25,9000000,single,0
9,perempuan,20,10000000,single,1


In [20]:
X = df.drop(columns="transportasi")
y = df.transportasi

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8, 4), (2, 4), (8,), (2,))

In [25]:
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(), ["umur", "gaji"]),
    ('categoric', cat_pipe(encoder='onehot'), ['jenis_kelamin', 'status']),
])

In [26]:
from sklearn.naive_bayes import GaussianNB
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', GaussianNB())
])

In [27]:
pipeline.fit(X_train, y_train)


In [28]:
pipeline.score(X_train, y_train)


0.75

In [29]:
pipeline.score(X_test, y_test)


0.5

In [32]:
X_pred = pd.read_csv("testing.csv")
X_pred

Unnamed: 0,jenis_kelamin,umur,gaji,status
0,wanita,27,12000000,single
1,pria,35,14000000,menikah


In [33]:
pipeline.predict(X_pred)


array([1, 0], dtype=int64)

In [34]:
X_pred["transportasi"] = pipeline.predict(X_pred)
X_pred

Unnamed: 0,jenis_kelamin,umur,gaji,status,transportasi
0,wanita,27,12000000,single,1
1,pria,35,14000000,menikah,0
