In [11]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import pickle


# Read the CSV that the later cells split into train/test sets and fit the model on.
df = pd.read_csv(filepath_or_buffer="train_prototipe.csv")

In [12]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cost               10999 non-null  float64
 1   Purchases          10999 non-null  float64
 2   Importance         10999 non-null  int64  
 3   Discount           10999 non-null  float64
 4   Weight (gram)      10999 non-null  float64
 5   Late               10999 non-null  float64
 6   Reorder_purchases  10999 non-null  int64  
 7   Shipment_Flight    10999 non-null  int64  
 8   Shipment_Road      10999 non-null  int64  
 9   Shipment_Ship      10999 non-null  int64  
 10  Warehouse_A        10999 non-null  int64  
 11  Warehouse_B        10999 non-null  int64  
 12  Warehouse_C        10999 non-null  int64  
 13  Warehouse_D        10999 non-null  int64  
 14  Warehouse_F        10999 non-null  int64  
dtypes: float64(5), int64(10)
memory usage: 1.3 MB


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# DATA
# Separate the feature matrix from the 'Late' target column.
X = df.drop('Late', axis=1)
y = df['Late']

# Split data: hold out 30% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Already-encoded columns that are forwarded to the classifier without scaling.
# Defined once so the "exclude from scaling" and "passthrough" lists cannot drift apart.
passthrough_features = [
    'Reorder_purchases',
    'Shipment_Flight', 'Shipment_Road', 'Shipment_Ship',
    'Warehouse_A', 'Warehouse_B', 'Warehouse_C', 'Warehouse_D', 'Warehouse_F',
]

# Columns to be scaled: every numeric column that is not in the passthrough list.
numeric_features = [
    col for col in X.select_dtypes(include=[np.number]).columns.tolist()
    if col not in passthrough_features
]

# Create transformer: min-max scale the numeric columns, pass the rest through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_features),
        ('cat', 'passthrough', passthrough_features),
    ])

# Create pipeline: preprocessing followed by the random forest classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=1155,
        min_samples_split=10,
        min_samples_leaf=2,
        max_features='sqrt',
        max_depth=10,
        criterion='entropy',
        bootstrap=True,
        # Seed the forest's bootstrap sampling and feature sub-sampling so that
        # re-running this cell on a fresh kernel fits the same model.
        random_state=42,
    ))
])

# Fit pipeline on the training split only; the test split stays untouched.
pipeline.fit(x_train, y_train)


In [14]:
from joblib import dump, load

# Persist the fitted pipeline to disk. Only the pipeline object is written here;
# no decision threshold is stored in this artifact.
dump(value=pipeline, filename="model_rf_new.joblib")


['model_rf_new.joblib']

In [5]:

# Reload the persisted pipeline (this artifact holds the pipeline only, no threshold)
# and use it to label a dataset read from CSV.
loaded_pipeline = load("model_rf_new.joblib")

new_data = pd.read_csv('data_train_encode.csv')

# Remove the 'Late' target column when it is present; leave the frame as-is otherwise.
new_data = new_data.drop(columns=['Late'], errors='ignore')

# Predict with the full pipeline (preprocessing + classifier).
predictions = loaded_pipeline.predict(new_data)

# Add the prediction column to the dataset.
new_data['Predicted_Late'] = predictions



In [9]:
# Show five randomly chosen rows of the scored dataset as a quick visual check.
new_data.sample(n=5)

Unnamed: 0,Cost,Purchases,Importance,Discount,Weight (gram),Shipment_Flight,Shipment_Road,Shipment_Ship,Warehouse_A,Warehouse_B,Warehouse_C,Warehouse_D,Warehouse_F,reorder_purchases,Predicted_Late
2233,274,4,1,57,3332,0,0,1,0,0,1,0,0,1,1
8771,249,4,0,10,1318,0,0,1,1,0,0,0,0,1,0
3938,194,3,1,6,5721,0,0,1,0,0,0,0,1,1,0
2778,161,3,0,52,1199,0,1,0,0,0,0,0,1,1,1
4905,177,2,2,7,5599,0,1,0,0,0,1,0,0,0,1


In [9]:
import pandas as pd
from joblib import load

# Load the persisted artifact, which is unpacked as a (pipeline, decision threshold) pair.
# NOTE(review): "model_pandas.joblib" is not written by any visible cell — confirm
# which notebook/cell produces it and that it really stores a 2-tuple.
# joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
loaded_pipeline, loaded_optimal_thresh = load("model_pandas.joblib")

new_data = pd.read_csv('processed_file.csv')

# Drop the 'Late' column if it exists. errors='ignore' makes this a no-op when the
# column is absent instead of raising KeyError, and the non-inplace form keeps the
# cell safe to re-run.
new_data = new_data.drop(columns=['Late'], errors='ignore')

# Transform the new data with every pipeline step except the final classifier.
new_data_transformed = loaded_pipeline[:-1].transform(new_data)

# Take column index 1 of predict_proba (presumably the Late=1 class — confirm against
# classifier.classes_) and convert it to a 0/1 label using the loaded threshold.
new_predictions_proba = loaded_pipeline.named_steps['classifier'].predict_proba(new_data_transformed)[:, 1]
new_predictions = (new_predictions_proba >= loaded_optimal_thresh).astype(int)

# Wrap the binary labels in a single-column frame.
predictions_df = pd.DataFrame(new_predictions, columns=['Predictions'])

# Save the predictions to a CSV file (without the index) and confirm on stdout.
predictions_df.to_csv('predictions.csv', index=False)
print("Predictions have been saved to predictions.csv")

Predictions have been saved to predictions.csv


In [55]:
# Inspect the raw probabilities (predict_proba column index 1) computed for the new data.
print(new_predictions_proba)

[0.56557715]


In [57]:
# Inspect the decision threshold that was loaded alongside the pipeline.
print(loaded_optimal_thresh)

0.5946532687057499
