In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import pickle


# Load the preprocessed dataset that every later cell works from.
# NOTE(review): the preview below lists an 'Unnamed: 0' header — this looks like a saved
# index column; confirm whether it should be read with index_col=0 instead of being
# kept as a regular (feature) column.
df = pd.read_csv('processed_file.csv')

In [3]:
# Quick sanity check: show five randomly chosen rows (unseeded, so rows differ per run).
df.sample(5)

Unnamed: 0,Calls,Cost,Prior Purchases,Importance,Discount,Weight (gram),Late
1582,3,143,3,0,64,1630,1
528,5,261,3,1,8,3257,1
6783,5,249,3,0,9,5978,1
4844,5,263,2,0,3,4474,1
6924,6,245,4,0,10,1606,0


In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, confusion_matrix

# DATA
# 'Late' is the label; every other column of df is passed to the model as a feature.
# NOTE(review): if df carries an 'Unnamed: 0' column it is used as a feature too — confirm.
y = df['Late']
X = df.drop(columns='Late')

# Hold out 30% of the rows for evaluation; fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Create the pipeline: robust scaling followed by a random forest, then fit it
# on the training split. Step names are referenced by later cells.
pipeline = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        max_features='log2',
        min_samples_split=10,
        min_samples_leaf=1,
        random_state=42,
    )),
])
pipeline.fit(x_train, y_train)

# Optimal threshold
def find_optimal_threshold(model, x_test, y_test, min_tpr=0.5, min_fpr=0.05, max_fpr=0.15):
    """Choose a probability cut-off from the ROC curve and print metrics at that cut-off.

    Among the ROC operating points whose true-positive rate exceeds ``min_tpr`` and
    whose false-positive rate lies strictly between ``min_fpr`` and ``max_fpr``, the
    threshold maximising ``tpr - fpr`` is selected.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``; column 1 is used as the score.
    x_test : features to score.
    y_test : true binary labels aligned with ``x_test``.
    min_tpr, min_fpr, max_fpr : float
        Bounds of the admissible ROC region (defaults match the original constants).

    Returns
    -------
    The selected threshold (an element of the array returned by ``roc_curve``).

    Raises
    ------
    ValueError
        If no ROC operating point satisfies the TPR/FPR constraints.

    Notes
    -----
    The threshold is selected and evaluated on the same data, so the printed
    metrics are optimistic for that threshold.
    """
    y_pred_proba = model.predict_proba(x_test)[:, 1]
    roc_fpr, roc_tpr, thresholds = roc_curve(y_test, y_pred_proba)

    # Use ONE mask for both filtering and scoring. The previous version took the
    # argmax over a different subset (fpr > min_fpr only) and used that position
    # to index the filtered thresholds, which picked the wrong element or raised
    # IndexError.
    admissible = (roc_tpr > min_tpr) & (roc_fpr > min_fpr) & (roc_fpr < max_fpr)
    if not admissible.any():
        raise ValueError(
            f'No ROC operating point with TPR > {min_tpr} and '
            f'{min_fpr} < FPR < {max_fpr}; relax the constraints.'
        )

    # tpr - fpr restricted to the admissible points, so positions line up
    # with thresholds[admissible].
    gain = roc_tpr[admissible] - roc_fpr[admissible]
    optimal_threshold = thresholds[admissible][np.argmax(gain)]

    y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)

    precision = precision_score(y_test, y_pred_optimal)
    recall = recall_score(y_test, y_pred_optimal)
    f1 = f1_score(y_test, y_pred_optimal)

    # Rates at the chosen threshold; named distinctly so the ROC arrays are not shadowed.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_optimal).ravel()
    tpr_at_threshold = tp / (tp + fn)
    fpr_at_threshold = fp / (fp + tn)

    print(f'\nOptimal threshold: {optimal_threshold:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-score: {f1:.2f}')
    print(f'TPR: {tpr_at_threshold:.2f}')
    print(f'FPR: {fpr_at_threshold:.2f}')

    return optimal_threshold

# Find the optimal threshold on the held-out split; the call also prints
# precision, recall, F1, TPR and FPR at that threshold.
optimal_thresh = find_optimal_threshold(pipeline, x_test, y_test)



Optimal threshold: 0.55
Precision: 0.94
Recall: 0.51
F1-score: 0.66
TPR: 0.51
FPR: 0.05


In [8]:
from joblib import dump, load

# Persist the fitted pipeline together with its decision threshold as a single tuple.
dump(value=(pipeline, optimal_thresh), filename="model_pandas.joblib")

# Read the tuple straight back to confirm the artifact round-trips.
loaded_pipeline, loaded_optimal_thresh = load(filename="model_pandas.joblib")


In [9]:
import pandas as pd
from joblib import load

# Load the pipeline and the optimal threshold
# NOTE: joblib.load unpickles arbitrary objects — only load artifacts you created yourself.
loaded_pipeline, loaded_optimal_thresh = load("model_pandas.joblib")

# NOTE(review): this scores the same file the model was trained on; point this at the
# real unseen data when one is available — confirm intended input.
new_data = pd.read_csv('processed_file.csv')

# Drop the 'Late' column if it exists. errors='ignore' makes this a no-op for
# unlabeled input instead of raising KeyError, matching the stated intent.
new_data = new_data.drop(columns='Late', errors='ignore')

# Transform the new data (excluding the classifier step)
new_data_transformed = loaded_pipeline[:-1].transform(new_data)

# Predict probabilities and convert them to binary predictions based on the optimal threshold
new_predictions_proba = loaded_pipeline.named_steps['classifier'].predict_proba(new_data_transformed)[:, 1]
new_predictions = (new_predictions_proba >= loaded_optimal_thresh).astype(int)

# Wrap the 0/1 predictions in a single-column frame for export
predictions_df = pd.DataFrame(new_predictions, columns=['Predictions'])

# Save the predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

Predictions have been saved to predictions.csv


In [55]:
# Inspect the scores taken from column 1 of predict_proba.
# NOTE(review): the recorded output holds a single value although the scoring cell reads
# the whole processed_file.csv — the output looks stale; re-run after Restart & Run All.
print(new_predictions_proba)

[0.56557715]


In [57]:
# Show the threshold restored from model_pandas.joblib.
# NOTE(review): the recorded value (~0.59) differs from the 0.55 printed by the training
# cell — likely out-of-order execution; confirm which threshold the saved artifact holds.
print(loaded_optimal_thresh)

0.5946532687057499
