In [23]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import pickle


# Load the already-encoded training table; the path is relative to the working directory.
df = pd.read_csv('data_train_encode.csv')

In [24]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,Cost,Purchases,Importance,Discount,Weight (gram),Late,Shipment_Flight,Shipment_Road,Shipment_Ship,Warehouse_A,Warehouse_B,Warehouse_C,Warehouse_D,Warehouse_F,Reorder_purchases
0,177,3,0,44,1233,1,1,0,0,0,0,0,1,0,1
1,216,2,0,59,3088,1,1,0,0,0,0,0,0,1,0
2,183,4,0,48,3374,1,1,0,0,1,0,0,0,0,1
3,176,4,1,10,1177,1,1,0,0,0,1,0,0,0,1
4,184,3,1,46,2484,1,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10637,252,5,1,1,1538,1,0,0,1,1,0,0,0,0,1
10638,232,5,1,6,1247,0,0,0,1,0,1,0,0,0,1
10639,242,5,0,4,1155,0,0,0,1,0,0,1,0,0,1
10640,223,6,1,2,1210,0,0,0,1,0,0,0,0,1,1


In [25]:
# Summarize column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10642 entries, 0 to 10641
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Cost               10642 non-null  int64
 1   Purchases          10642 non-null  int64
 2   Importance         10642 non-null  int64
 3   Discount           10642 non-null  int64
 4   Weight (gram)      10642 non-null  int64
 5   Late               10642 non-null  int64
 6   Shipment_Flight    10642 non-null  int64
 7   Shipment_Road      10642 non-null  int64
 8   Shipment_Ship      10642 non-null  int64
 9   Warehouse_A        10642 non-null  int64
 10  Warehouse_B        10642 non-null  int64
 11  Warehouse_C        10642 non-null  int64
 12  Warehouse_D        10642 non-null  int64
 13  Warehouse_F        10642 non-null  int64
 14  Reorder_purchases  10642 non-null  int64
dtypes: int64(15)
memory usage: 1.2 MB


In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# DATA
# 'Late' is the target; every other column is a model input.
X = df.drop(columns='Late')
y = df['Late']

# Split data: hold out 30% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Already one-hot encoded columns that bypass scaling. Declared once so the
# scaler's column list and the passthrough list cannot drift apart.
passthrough_features = [
    'Reorder_purchases',
    'Shipment_Flight', 'Shipment_Road', 'Shipment_Ship',
    'Warehouse_A', 'Warehouse_B', 'Warehouse_C', 'Warehouse_D', 'Warehouse_F',
]

# Columns to be scaled: every numeric column that is not passed through.
numeric_features = [
    col for col in X.select_dtypes(include=[np.number]).columns.tolist()
    if col not in passthrough_features
]

# Create transformer: standardize the numeric columns, forward the rest untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # Adding the already one-hot encoded columns
        ('cat', 'passthrough', passthrough_features),
    ])

# Create pipeline
# NOTE(review): the original cell held a second, conflicting hyperparameter set
# pasted after `bootstrap=True` without a separating comma:
#     ccp_alpha=0, max_depth=79, max_features='sqrt',
#     min_samples_leaf=83, min_samples_split=100
# The missing comma and the repeated keywords made the call a SyntaxError, so
# the cell could not run as saved. The first, complete set is kept here.
# TODO: confirm which set produced the saved model and delete this note.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=1155,
        min_samples_split=10,
        min_samples_leaf=2,
        max_features='sqrt',
        max_depth=10,
        criterion='entropy',
        bootstrap=True,
        # Seed the forest so refitting yields the same model on every run.
        random_state=42,
    )),
])

# Fit pipeline (left as the last expression so the notebook renders the fitted estimator).
pipeline.fit(x_train, y_train)


In [39]:
from joblib import dump, load

# Persist the fitted pipeline to disk. Only the pipeline object is written;
# no decision threshold is stored in this file.
dump(pipeline, "model_rf_new.joblib")


['model_rf_new.joblib']

In [40]:

# Reload the persisted pipeline from disk (this file holds the pipeline only).
loaded_pipeline = load("model_rf_new.joblib")

# Read the encoded table and discard the 'Late' label when the file carries it,
# so that only the model inputs remain.
new_data = pd.read_csv('data_train_encode.csv').drop(columns='Late', errors='ignore')

# Class predictions straight from the pipeline (preprocessing + classifier).
predictions = loaded_pipeline.predict(new_data)

# Append the predictions to the dataset as a new column.
new_data = new_data.assign(Predicted_Late=predictions)



In [41]:
# Inspect 10 randomly sampled rows; no seed is passed, so the rows differ between runs.
new_data.sample(10)

Unnamed: 0,Cost,Purchases,Importance,Discount,Weight (gram),Shipment_Flight,Shipment_Road,Shipment_Ship,Warehouse_A,Warehouse_B,Warehouse_C,Warehouse_D,Warehouse_F,Reorder_purchases,Predicted_Late
709,227,3,0,14,2589,0,0,1,0,1,0,0,0,1,1
6844,279,4,0,9,1128,0,0,1,0,0,0,0,1,1,0
7701,264,4,1,8,1755,0,0,1,0,0,0,1,0,1,0
5240,218,5,0,5,1238,0,0,1,0,1,0,0,0,1,0
6068,210,2,0,9,5932,0,0,1,0,1,0,0,0,0,0
10407,218,5,0,8,1574,0,0,1,1,0,0,0,0,1,0
827,194,6,1,39,3770,0,0,1,0,0,1,0,0,1,1
2529,177,4,1,60,1159,1,0,0,0,0,0,0,1,1,1
554,194,2,1,4,3496,0,0,1,0,0,0,0,1,0,1
1241,140,4,0,27,1184,0,0,1,0,0,1,0,0,1,1


In [9]:
import pandas as pd
from joblib import load

# Load the pipeline and the optimal threshold, stored together as a 2-tuple.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load artifacts that come from a trusted source.
loaded_pipeline, loaded_optimal_thresh = load("model_pandas.joblib")

new_data = pd.read_csv('processed_file.csv')

# Drop the 'Late' column if it exists. errors='ignore' turns the drop into a
# no-op when the column is absent; the previous unconditional drop raised
# KeyError on files without it.
new_data = new_data.drop(columns='Late', errors='ignore')

# Transform the new data (excluding the classifier step)
new_data_transformed = loaded_pipeline[:-1].transform(new_data)

# Predict probabilities and convert them to binary predictions based on the optimal threshold.
# Column index 1 of predict_proba is used as the score — presumably the
# probability of Late == 1; verify against the classifier's classes_.
new_predictions_proba = loaded_pipeline.named_steps['classifier'].predict_proba(new_data_transformed)[:, 1]
new_predictions = (new_predictions_proba >= loaded_optimal_thresh).astype(int)

# Wrap the binary predictions in a single-column frame.
predictions_df = pd.DataFrame(new_predictions, columns=['Predictions'])

# Save the predictions to a CSV file (without the index column).
predictions_df.to_csv('predictions.csv', index=False)

# Confirmation message; restores the text shown in this cell's recorded output.
print("Predictions have been saved to predictions.csv")

Predictions have been saved to predictions.csv


In [55]:
# Show the predicted probabilities held in new_predictions_proba.
print(new_predictions_proba)

[0.56557715]


In [57]:
# Show the decision threshold that was loaded together with the pipeline.
print(loaded_optimal_thresh)

0.5946532687057499
