Import

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import json
import pytz

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from pandas_profiling import ProfileReport
import seaborn as sns
from sklearn import metrics

from copy import deepcopy

import h2o

In [None]:
# Load the raw task records and flatten nested JSON objects into dotted column
# names (e.g. "cod.amount"). The encoding is pinned because JSON is UTF-8 by
# specification, whereas open() otherwise uses the platform's locale encoding.
with open("data-sample.json", encoding="utf-8") as f:
    df = pd.json_normalize(json.load(f))

In [None]:
# Parse both timestamp columns as timezone-aware datetimes (normalised to UTC
# first) and express them in Bangkok local time.
bangkok_tz = pytz.timezone("Asia/Bangkok")
datetime_columns = ["taskCreatedTime", "taskCompletedTime"]
for ts_column in datetime_columns:
    parsed = pd.to_datetime(df[ts_column], format="%Y-%m-%d %H:%M:%S %z", utc=True)
    df[ts_column] = parsed.dt.tz_convert(bangkok_tz)

# Cast the coordinate, COD and weight columns to their analysis dtypes.
# NOTE(review): astype(bool) turns missing values into True - confirm that
# "cod.received" has no nulls in the source data.
dtype_mapping = {
    "taskLocationDone.lon": "float64",
    "taskLocationDone.lat": "float64",
    "cod.amount": "float64",
    "cod.received": bool,
    "UserVar.weight": "float64",
}
df = df.astype(dtype_mapping)

In [None]:
df.info()

In [None]:
df.describe()

# Pandas Profiling

In [None]:
# df_report = ProfileReport(df)
# df_report

Drop unused columns

In [None]:
# Remove columns that are not used downstream: "flow" and "taskId" (noted as
# constant values / identifiers) together with the raw UserVar status fields.
# NOTE(review): presumably the UserVar status fields are redundant with
# "UserVar.taskStatusLabel" - confirm before relying on that.
unused_columns = [
    "flow",
    "taskId",
    "UserVar.taskStatus",
    "UserVar.taskDetailStatus",
    "UserVar.taskDetailStatusLabel",
]
df = df.drop(columns=unused_columns)

Drop duplicates

In [None]:
df.drop_duplicates(inplace=True)

Drop ongoing task

In [None]:
# df.dropna(subset=['UserVar.taskStatusLabel'], inplace=True)

In [None]:
# Keep in-progress tasks but give them an explicit "Ongoing" label, then
# discard the raw status column that was only needed to identify them.
ongoing_mask = df["taskStatus"].eq("ongoing")
df.loc[ongoing_mask, "UserVar.taskStatusLabel"] = "Ongoing"
df = df.drop(columns=["taskStatus"])

Fillna

In [None]:
# Missing origin branches become an explicit "UNKNOWN" category. The result is
# assigned back instead of using inplace=True on the selected column: the
# chained in-place form acts on a temporary object and is not guaranteed to
# update df (it is a no-op under pandas Copy-on-Write).
df["UserVar.branch_origin"] = df["UserVar.branch_origin"].fillna("UNKNOWN")
# df['cod.amount'].fillna(0, inplace=True) # Assumptions: no COD is 0

In [None]:
# One boxplot per numeric column, to eyeball spread and outliers.
numeric_columns = df.select_dtypes(include="number")

for column in numeric_columns.columns:
    # Explicit figure/axes per column so each plot is independent.
    fig, ax = plt.subplots()

    # Draw the box horizontally so the value axis is the x-axis; the original
    # vertical plot labelled the category axis "Values" and the value axis
    # "Column", i.e. the labels were swapped.
    sns.boxplot(x=numeric_columns[column], ax=ax)

    ax.set_title(f"Boxplot of {column}")
    ax.set_xlabel("Values")
    ax.set_ylabel(column)

    plt.show()

Final profiling

In [None]:
# Snapshot the cleaned frame before the EDA cells add helper columns; the
# modelling section restores df from this copy. Also persist it for reuse.
df_after = df.copy(deep=True)
df.to_csv("to_ml.csv", index=False)

In [None]:
# Final profiling
df_report = ProfileReport(df)
df_report

# EDA

In [None]:
# List the distinct values of every object-typed (string-like) column.
object_columns = df.select_dtypes(include="object").columns

for column in object_columns:
    # Header line, the values, then a blank separator line.
    print(f"Unique values in {column}:", df[column].unique(), "", sep="\n")

Task Status

In [None]:
# Share of "Success" vs "Failed" tasks. Rows with any other label (e.g.
# "Ongoing") are not part of the chart, so percentages are relative to these
# two outcomes only.
labels = ["Success", "Failed"]
status_counts = df["UserVar.taskStatusLabel"].value_counts()
counts = [int(status_counts.get(label, 0)) for label in labels]

plt.pie(counts, labels=labels, autopct="%1.1f%%", startangle=90)
plt.title("Distribution of Task Status")
plt.axis("equal")  # equal aspect ratio keeps the pie circular
plt.show()

Distribution of COD Amount

In [None]:
df_success = df[df["UserVar.taskStatusLabel"] == "Success"]
df_failed = df[df["UserVar.taskStatusLabel"] == "Failed"]

In [None]:
sns.histplot(data=df_success, x="cod.amount")

In [None]:
sns.histplot(data=df_failed, x="cod.amount")

In [None]:
sns.histplot(data=df, x="cod.amount", hue="UserVar.taskStatusLabel")

In [None]:
sns.histplot(data=df, x="UserVar.weight", hue="UserVar.taskStatusLabel", kde=True)

The data are skewed, with many outliers.

## Time Analysis

Daily Task Count

In [None]:
# Number of tasks created on each calendar date.
created_date = df["taskCreatedTime"].dt.date
task_count_by_day = df.groupby(created_date).size().reset_index(name="Task Count")

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    task_count_by_day["taskCreatedTime"],
    task_count_by_day["Task Count"],
    marker="o",
)
ax.set_xlabel("Date")
ax.set_ylabel("Task Count")
ax.set_title("Daily Task Count")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

Hourly Task Count

In [None]:
# Work on a copy that carries the creation hour (0-23) as an extra column.
df_hour = df.assign(Hour=df["taskCreatedTime"].dt.hour)

# Number of tasks created in each hour of the day.
hourly_task_count = df_hour.groupby("Hour")["taskCreatedTime"].count()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(hourly_task_count.index, hourly_task_count.values, marker="o")
ax.set_xlabel("Hour")
ax.set_ylabel("Task Count")
ax.set_title("Hourly Task Count")
ax.set_xticks(range(24))
fig.tight_layout()
plt.show()

Daily Task Count "Success vs Fail"

In [None]:
# Count tasks per creation date and status label; one column per label, with
# 0 where a label did not occur on a given date.
daily_task_outcome = (
    df.groupby(df["taskCreatedTime"].dt.date)["UserVar.taskStatusLabel"]
    .value_counts()
    .unstack()
    .fillna(0)
)

# Draw onto an explicit axes. DataFrame.plot opens its own figure when no axes
# is passed, so a preceding plt.figure(figsize=...) only produced an empty
# figure and the requested size was never applied to the chart.
fig, ax = plt.subplots(figsize=(10, 6))
daily_task_outcome.plot(kind="bar", stacked=True, ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Daily Task Success vs Fail")
ax.tick_params(axis="x", labelrotation=45)
ax.legend(title="Task Outcome")
fig.tight_layout()
plt.show()

In [None]:
# Count tasks per creation hour and status label; one column per label, with
# 0 where a label did not occur in a given hour. Stored under its own name so
# the per-day table computed earlier is not overwritten with hourly data.
hourly_task_outcome = (
    df_hour.groupby(df_hour["Hour"])["UserVar.taskStatusLabel"]
    .value_counts()
    .unstack()
    .fillna(0)
)

# Draw onto an explicit axes. DataFrame.plot opens its own figure when no axes
# is passed, so a preceding plt.figure(figsize=...) only produced an empty
# figure and the requested size was never applied to the chart.
fig, ax = plt.subplots(figsize=(10, 6))
hourly_task_outcome.plot(kind="bar", stacked=True, ax=ax)
ax.set_xlabel("Hour")
ax.set_ylabel("Count")
ax.set_title("Hourly Task Success vs Fail")
ax.tick_params(axis="x", labelrotation=45)
ax.legend(title="Task Outcome")
fig.tight_layout()
plt.show()

Average Task Completion Time per Day

In [None]:
# Task completion time in minutes (completion timestamp minus creation timestamp).
elapsed = df["taskCompletedTime"] - df["taskCreatedTime"]
df["completion_time_minutes"] = elapsed.dt.total_seconds() / 60

# Mean completion time for each task-creation date.
created_date = df["taskCreatedTime"].dt.date
average_completion_time_per_day = df.groupby(created_date)[
    "completion_time_minutes"
].mean()

# Same data as a two-column frame for plotting.
df_completion_time = (
    average_completion_time_per_day.rename("Average Completion Time (minutes)")
    .rename_axis("Date")
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    df_completion_time["Date"],
    df_completion_time["Average Completion Time (minutes)"],
    marker="o",
)
ax.set_xlabel("Date")
ax.set_ylabel("Average Completion Time (minutes)")
ax.set_title("Average Task Completion Time per Day")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

Task Completion Time Distribution

In [None]:
df["completion_time_minutes"].describe(include="all")

In [None]:
df_success = df[df["UserVar.taskStatusLabel"] == "Success"]
plt.title("Task Completion Time Distribution")
sns.histplot(df_success["completion_time_minutes"], kde=True)

## Task Assignment Analysis

Most Active workers

In [None]:
worker_task_counts = df["taskAssignedTo"].value_counts()

In [None]:
# Get the top N most active workers
top_n_workers = 10  # Set the desired number of top workers
most_active_workers = worker_task_counts.head(top_n_workers)

print("Most Active Workers:")
print(most_active_workers)

Most profitable workers

In [None]:
# Per-worker totals, means and counts of COD amount and parcel weight.
# Several columns must be selected with a list (double brackets): the bare
# tuple form `["a", "b"]` on a GroupBy was deprecated and no longer works in
# pandas 2.x.
grouped_df = df.groupby("taskAssignedTo")[["cod.amount", "UserVar.weight"]].agg(
    ["sum", "mean", "count"]
)

# Rank workers by total COD amount, highest first, and show the top 20.
top_sums = grouped_df.sort_values(by=[("cod.amount", "sum")], ascending=False)
top_sums.head(20)

In [None]:
(grouped_df[("cod.amount", "sum")] / grouped_df[("UserVar.weight", "sum")]).sort_values(
    ascending=False
).head(20)

Distribution of Task Completion Counts

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(worker_task_counts, bins=20, kde=True)
plt.xlabel("Task Completion Count")
plt.ylabel("Frequency")
plt.title("Distribution of Task Completion Counts")
plt.tight_layout()
plt.show()

Count Completion vs Average Completion

In [None]:
# Mean completion time per worker, set against how many tasks each worker has.
worker_performance = df.groupby("taskAssignedTo")["completion_time_minutes"].mean()

# Give both series explicit names: the name of a value_counts() result differs
# between pandas versions ("taskAssignedTo" vs "count"), which made the
# correlation matrix labels version-dependent and hard to read.
correlation_df = pd.concat(
    [
        worker_performance.rename("avg_completion_time_minutes"),
        worker_task_counts.rename("task_count"),
    ],
    axis=1,
)
correlation = correlation_df.corr()
correlation

In [None]:
def compare_kdeplots(df, col, hue=None):
    """Overlay histograms (with KDE curves) of ``col`` for successful and failed tasks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``, ``"UserVar.taskStatusLabel"`` and, when
        ``hue`` is given, the ``hue`` column.
    col : str
        Name of the column whose distribution is plotted.
    hue : str, optional
        Name of a column used to split each histogram by colour.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(8, 6))
    for status, legend_label in (("Success", "Success Task"), ("Failed", "Failed Task")):
        subset = df[df["UserVar.taskStatusLabel"] == status]
        # Pass the frame plus column names rather than a bare Series, so that a
        # `hue` column name can actually be resolved by seaborn.
        sns.histplot(data=subset, x=col, hue=hue, label=legend_label, kde=True)
    plt.title(f"Distribution of {col} by Task Status")
    plt.xlabel(col)
    plt.ylabel("Count")
    if hue is None:
        # With `hue`, seaborn draws its own legend; rebuilding it here would
        # replace the hue entries with the two status labels.
        plt.legend()
    plt.show()

In [None]:
# for col in ['UserVar.branch_dest', 'UserVar.taskStatusLabel', 'UserVar.receiver_city',
#                        'UserVar.taskDetailStatusLabel', 'UserVar.branch_origin']:
#     compare_kdeplots(df, col)

Completion time vs amount vs weight

In [None]:
sns.pairplot(data=df[["cod.amount", "UserVar.weight", "completion_time_minutes"]])

In [None]:
# Create a scatter plot with hue
sns.scatterplot(
    data=df, x="cod.amount", y="UserVar.weight", hue="UserVar.taskStatusLabel"
)

# Set plot title and labels
plt.title("Scatter Plot with Hue")

# Display the plot
plt.show()

Mapping

In [None]:
import folium

In [None]:
# # Filter relevant columns
# filtered_df = df[['taskLocationDone.lon', 'taskLocationDone.lat', 'cod.amount', 'UserVar.taskStatusLabel']]

# # Drop rows with missing values
# filtered_df.dropna(subset=['taskLocationDone.lon', 'taskLocationDone.lat'], inplace=True)

# # Create a folium map centered at the mean coordinates
# center_lat = filtered_df['taskLocationDone.lat'].mean()
# center_lon = filtered_df['taskLocationDone.lon'].mean()
# map = folium.Map(location=[center_lat, center_lon], zoom_start=12)

# # Iterate over the filtered dataframe and add markers to the map
# for _, row in filtered_df.iterrows():
#     lat = row['taskLocationDone.lat']
#     lon = row['taskLocationDone.lon']
#     amount = row['cod.amount']
#     status = row['UserVar.taskStatusLabel']

#     # Create marker color based on task status
#     color = 'green' if status == 'Success' else 'red'

#     # Add marker to the map
#     folium.CircleMarker(
#         location=[lat, lon],
#         radius=5,
#         color=color,
#         fill=True,
#         fill_color=color,
#         popup=f"Amount: {amount}, Status: {status}"
#     ).add_to(map)

# # Display the map
# map.save('map.html')

# Predict Task Status

In [None]:
df = deepcopy(df_after)

In [None]:
df.isnull().sum()

In [None]:
df.dropna(subset=["taskAssignedTo"], inplace=True)

In [None]:
# Tasks without a completion timestamp get their creation timestamp instead.
# The result is assigned back rather than using inplace=True on the selected
# column: the chained in-place form acts on a temporary object and is not
# guaranteed to update df (it is a no-op under pandas Copy-on-Write).
df["taskCompletedTime"] = df["taskCompletedTime"].fillna(df["taskCreatedTime"])

In [None]:
# Replace missing coordinates and COD amounts with 0.
# NOTE(review): 0 is itself a valid longitude/latitude, so filled rows cannot
# be told apart from real points at (0, 0) - confirm this is acceptable.
col = ["taskLocationDone.lon", "taskLocationDone.lat", "cod.amount"]
df[col] = df[col].fillna(0)

In [None]:
col = ["UserVar.receiver_city"]
df[col] = df[col].fillna("UNKNOWN")

In [None]:
df.isnull().sum()

# Feature Engineering

In [None]:
# Exclude tasks labelled "Ongoing" from the modelling frame. Take an explicit
# copy so that the column assignments in the following cells write to an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df[df["UserVar.taskStatusLabel"] != "Ongoing"].copy()

In [None]:
df.dtypes

In [None]:
# Calendar features taken from the creation timestamp.
created_at = df["taskCreatedTime"]
calendar_features = {
    "hourOfDay": "hour",
    "dayOfWeek": "dayofweek",
    "month": "month",
    "year": "year",
}
for feature_name, dt_attribute in calendar_features.items():
    df[feature_name] = getattr(created_at.dt, dt_attribute)

# Elapsed seconds between creation and completion.
# NOTE(review): this uses the completion timestamp, which is presumably only
# known once the task has finished - confirm it is available at prediction time.
df["taskDuration"] = (df["taskCompletedTime"] - created_at).dt.total_seconds()

# Booleans become 1/0 integers.
df["cod.received"] = df["cod.received"].map({True: 1, False: 0})

# Experimental interaction term: weight x duration.
df["taskWeightDurationInteraction"] = df["UserVar.weight"] * df["taskDuration"]

# The raw timestamps are no longer needed once the features are derived.
df = df.drop(columns=["taskCreatedTime", "taskCompletedTime"])

In [None]:
# Encode binary variables
df["status"] = df["UserVar.taskStatusLabel"].map({"Success": 1, "Failed": 0})
df.drop(columns=["UserVar.taskStatusLabel"], inplace=True)

## One Hot Encoding

In [None]:
# Columns to one-hot encode, listed explicitly. The previous dtype-based
# selection was dead code: its result was overwritten by this list on the very
# next statement.
categorical_columns = [
    "taskAssignedTo",
    "cod.received",
    "UserVar.branch_dest",
    "UserVar.receiver_city",
    "UserVar.branch_origin",
]
categorical_columns

In [None]:
def encode_and_bind(original_dfframe, feature_to_encode):
    """Replace one column of a frame with its one-hot (dummy) columns.

    Parameters
    ----------
    original_dfframe : pandas.DataFrame
        Input frame; it is not modified.
    feature_to_encode : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        All other columns in their original order, followed by one
        ``<feature>_<value>`` indicator column per distinct value.

    Raises
    ------
    KeyError
        If ``feature_to_encode`` is not a column of ``original_dfframe``.
    """
    # `columns=` forces encoding regardless of dtype. Without it, get_dummies
    # passes numeric columns through unchanged; the concat then produced two
    # columns with the same name and the drop below removed both, silently
    # losing the feature (this affected the 0/1-valued "cod.received").
    dummies = pd.get_dummies(
        original_dfframe[[feature_to_encode]], columns=[feature_to_encode]
    )
    res = pd.concat([original_dfframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return res

In [None]:
for feature in categorical_columns:
    df = encode_and_bind(df, feature)

df.head()

In [None]:
df.shape

In [None]:
# Feature matrix and target vector.
# The dtype is forced to float64: the frame mixes float columns with indicator
# columns, and when those indicators are boolean (the get_dummies default in
# recent pandas) `.values` would fall back to a generic object array. Forcing
# the dtype also fails fast if a non-numeric column was left unencoded.
x = df.drop("status", axis=1).to_numpy(dtype="float64")
y = df["status"]

Try raw model

In [None]:
# Splitting the dfset into training set and test set
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, df["status"], test_size=0.2, random_state=1234
)

In [None]:
from xgboost import XGBClassifier

classifier = XGBClassifier(random_state=1234)
classifier.fit(x_train, y_train)

In [None]:
# Hard class predictions on the test set, used for accuracy.
y_pred = classifier.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

# ROC/AUC needs a continuous score. Feeding it the 0/1 predictions collapses
# the curve to a single operating point and misreports the AUC, so use the
# predicted probability of the positive class (status == 1) instead.
y_score = classifier.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
auc = metrics.auc(fpr, tpr)
print(auc)

## Handling Outlier

In [None]:
# TODO

## Handling imbalanced

In [None]:
# Importing packages for SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler

from imblearn.pipeline import Pipeline

from collections import Counter

In [None]:
sm = SMOTE(sampling_strategy="auto", random_state=1234)
x_sm, y_sm = sm.fit_resample(x_train, y_train)

In [None]:
print(Counter(y_train))
print(Counter(y_sm))

In [None]:
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])

fig.add_trace(
    go.Pie(
        labels=list(Counter(y_train).keys()),
        values=list(Counter(y_train).values()),
        name="Original df",
    ),
    row=1,
    col=1,
)

fig.add_trace(
    go.Pie(
        labels=list(Counter(y_sm).keys()),
        values=list(Counter(y_sm).values()),
        name="SMOTE df",
    ),
    row=1,
    col=2,
)

fig.update_traces(textposition="inside", hole=0.4, hoverinfo="value+percent+name")
fig.update_layout(
    title_text="status distribution",
    # Add annotations in the center of the donut pies.
    annotations=[
        dict(text="Original", x=0.16, y=0.5, font_size=12, showarrow=False),
        dict(text="SMOTE", x=0.82, y=0.5, font_size=12, showarrow=False),
    ],
)
fig.show()

In [None]:
# Oversample the minority class to 40% of the majority, then undersample the
# majority until the minority/majority ratio is 0.6. Both samplers draw
# randomly, so they are seeded (same seed as the rest of the notebook) to make
# the resampled set - and every metric computed from it - reproducible.
over = BorderlineSMOTE(sampling_strategy=0.4, random_state=1234)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=1234)

steps = [("o", over), ("u", under)]

In [None]:
pipeline = Pipeline(steps=steps)

# transform the dfset
x_sm_us, y_sm_us = pipeline.fit_resample(x_train, y_train)

print(Counter(y_train))
print(Counter(y_sm_us))

In [None]:
list(Counter(y_train).keys())

In [None]:
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])

fig.add_trace(
    go.Pie(
        labels=list(Counter(y_train).keys()),
        values=list(Counter(y_train).values()),
        name="Original df",
    ),
    row=1,
    col=1,
)

fig.add_trace(
    go.Pie(
        labels=list(Counter(y_sm_us).keys()),
        values=list(Counter(y_sm_us).values()),
        name="SMOTE and US df",
    ),
    row=1,
    col=2,
)

fig.update_traces(textposition="inside", hole=0.4, hoverinfo="percent+name+value")
fig.update_layout(
    title_text="status distribution",
    # Add annotations in the center of the donut pies.
    annotations=[
        dict(text="Original", x=0.16, y=0.5, font_size=12, showarrow=False),
        dict(text="SMOTE and UnderSample", x=0.9, y=0.5, font_size=12, showarrow=False),
    ],
)
fig.show()

In [None]:
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_sm, y_sm)

In [None]:
# Hard class predictions on the test set, used for accuracy.
y_pred = classifier.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

# ROC/AUC needs a continuous score. Feeding it the 0/1 predictions collapses
# the curve to a single operating point and misreports the AUC, so use the
# predicted probability of the positive class (status == 1) instead.
y_score = classifier.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
auc = metrics.auc(fpr, tpr)
print(auc)

In [None]:
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_sm_us, y_sm_us)

In [None]:
# Hard class predictions on the test set, used for accuracy.
y_pred = classifier.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

# ROC/AUC needs a continuous score. Feeding it the 0/1 predictions collapses
# the curve to a single operating point and misreports the AUC, so use the
# predicted probability of the positive class (status == 1) instead.
y_score = classifier.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
auc = metrics.auc(fpr, tpr)
print(auc)

In [None]:
# Names of the independent variables
feature_names = list(df.drop("status", axis=1).columns)

In [None]:
# Concatenate the resampled training set and the untouched test set into
# single feature / target arrays for the new dataframe.
# NOTE(review): this pools resampled training rows with the held-out test rows,
# and the H2O section later re-splits the pooled frame at random - so test rows
# can end up in training and synthetic rows in validation. Confirm that the
# resulting validation metrics are meant to be read that way.
sm_us_x = np.concatenate((x_sm_us, x_test))
sm_us_y = np.concatenate((y_sm_us, y_test))

In [None]:
sm_us_df = pd.DataFrame(
    np.column_stack([sm_us_y, sm_us_x]), columns=["status"] + feature_names
)
sm_us_df.head()

## Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=1234)
rf_clf.fit(x_sm_us, y_sm_us)

In [None]:
features_to_plot = 25

importances = rf_clf.feature_importances_
indices = np.argsort(importances)

best_vars = np.array(feature_names)[indices][-features_to_plot:]
values = importances[indices][-features_to_plot:]
best_vars

In [None]:
# Horizontal bar chart of the selected importances.
# Size the axis from the data actually selected: slicing with
# [-features_to_plot:] returns fewer entries when the model has fewer features,
# and a fixed-length tick array would then not match `values`.
y_ticks = np.arange(len(values))
fig, ax = plt.subplots()
ax.barh(y_ticks, values)
# Fix the tick positions first, then attach labels to them; labelling before
# the positions are set ties the labels to the automatic ticks.
ax.set_yticks(y_ticks)
ax.set_yticklabels(best_vars)
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()

In [None]:
best_vars = best_vars[-20:]
best_vars

## Auto ML with H2O

In [None]:
from h2o.automl import H2OAutoML

h2o.init()

In [None]:
hf = h2o.H2OFrame(sm_us_df[["status"] + list(best_vars)])
hf.head()

In [None]:
hf["status"] = hf["status"].asfactor()
predictors = hf.drop("status").columns
response = "status"

In [None]:
# Split into train and test
train, valid = hf.split_frame(ratios=[0.8], seed=1234)

In [None]:
# Stopping criteria: a maximum number of models and a maximum runtime in seconds.
# DeepLearning algorithms are excluded because they are too slow.
aml = H2OAutoML(
    max_models=20, max_runtime_secs=300, seed=1234, exclude_algos=["DeepLearning"]
)

In [None]:
# Train the model
aml.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

In [None]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=5)  # Print the first 5 rows

In [None]:
print("The model performance in Accuracy: {}".format(aml.leader.accuracy(valid=True)))
print("The model performance in AUC: {}".format(aml.leader.auc(valid=True)))

In [None]:
lb = aml.leaderboard
lb

In [None]:
# Fetch the model in the second leaderboard row (row index 1).
# NOTE(review): this was annotated as "the GBM model", but nothing here checks
# which algorithm sits at that rank - verify via the model_id before relying on
# GBM-specific plots.
m = h2o.get_model(lb[1, "model_id"])

In [None]:
m.varimp_plot(num_of_features=10)

As we can see in the Variable Importance plot, the top variables are much the same as they were in the [Feature Selection](#Feature-Selection) part.

In [None]:
m.shap_summary_plot(valid)

# References

https://www.kaggle.com/code/andreshg/churn-prediction-0-99-auc-h2o-sklearn-smote/notebook#4.-Feature-Selection