In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load data
data = pd.read_csv("jira_tasks.csv")

# Feature engineering: explode assignees
data['assignees'] = data['assignees'].str.split(',')
data = data.explode('assignees')

# Feature engineering
data['day_of_year'] = pd.to_datetime(data['start_date']).dt.dayofyear

data.head()

Unnamed: 0,task_id,task_name,assignees,task_topic,start_date,close_time_days,day_of_year
0,1,enhance bricks-and-clicks solutions,Tara Young,Spike,2024-08-31,144,244
0,1,enhance bricks-and-clicks solutions,Mary Perez,Spike,2024-08-31,144,244
1,2,architect collaborative models,Dawn Johnston,Documentation,2024-06-13,156,165
1,2,architect collaborative models,Derek Harrison,Documentation,2024-06-13,156,165
1,2,architect collaborative models,Carol Shelton,Documentation,2024-06-13,156,165


In [3]:
# Define features and target
features = ['assignees', 'task_topic', 'day_of_year']

# Feature: Number of assignees
data['num_assignees'] = data['assignees'].str.count(',') + 1

# Updated features
features += ['num_assignees',]

target = 'close_time_days'

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target], test_size=0.2, random_state=42
)

In [4]:
# Preprocessing pipeline
categorical_features = ['assignees', 'task_topic']
numeric_features = ['day_of_year']

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode `assignee` and `task_topic`
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)
    ]
)


In [5]:
# Define the Poisson regression model
model = lgb.LGBMRegressor(objective='poisson')

# Create pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model)
])

# Train the model
pipeline.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 1631, number of used features: 55
[LightGBM] [Info] Start training from score 4.727654


In [6]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

MAE: 21.94764231706979
MSE: 1001.7676737836867


In [7]:
# Feature importance
model = pipeline.named_steps['regressor']
importance = model.feature_importances_

In [8]:
# Get feature names for interpretation
ohe = pipeline.named_steps['preprocessor'].named_transformers_['cat']
feature_names = ohe.get_feature_names_out(categorical_features).tolist() + numeric_features
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

print("Top Features:")
print(feature_importance.head(20))

Top Features:
                        Feature  Importance
54                  day_of_year        2243
51     task_topic_Documentation         100
50               task_topic_Bug          91
52           task_topic_Feature          53
53             task_topic_Spike          53
7    assignees_Benjamin Collins          43
8   assignees_Brandon Adams DDS          41
35   assignees_Michael Alvarado          32
30         assignees_Mary Mills          24
9        assignees_Brian Dodson          24
31         assignees_Mary Perez          22
40      assignees_Pamela Stuart          21
46        assignees_Traci Woods          19
15        assignees_Donna Garza          16
2       assignees_Allison Perez          15
11    assignees_Cory Richardson          15
26    assignees_Kristy Cummings          14
20         assignees_Jerry Ruiz          11
3       assignees_Amanda Brewer          11
43   assignees_Stephanie Thomas          11


In [9]:
# Define the new task
new_task = pd.DataFrame({
    'task_id': 113511,
    'task_name': ["random sheet"],
    'assignees': ['Joshua Browning, Michael Alvarado'],
    'task_topic': ['Bug'],
    'start_date': ['2024-11-15']
})

# Preprocess the input (split assignees)
new_task['assignees'] = new_task['assignees'].str.split(',')  # Split into lists
new_task = new_task.explode('assignees')                     # Explode into rows

# Add numeric feature for day of the year
new_task['day_of_year'] = pd.to_datetime(new_task['start_date']).dt.dayofyear

# Use the trained pipeline to predict
predicted_close_times = pipeline.predict(new_task)

# Aggregate predictions if exploded
average_close_time = predicted_close_times.mean()

print(f"Predicted close time for the task: {average_close_time:.2f} days")


Predicted close time for the task: 28.44 days


In [10]:
# Evaluate the model
y_pred_test = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print(f"Model Accuracy Metrics:")
print(f"MAE: {mae:.2f} days")
print(f"MSE: {mse:.2f} days²")
print(f"R²: {r2:.2f}")

Model Accuracy Metrics:
MAE: 21.95 days
MSE: 1001.77 days²
R²: 0.75


In [12]:
import numpy as np
from scipy.stats import poisson

# Predicted mean time (λ) from the model
predicted_time = pipeline.predict(new_task)[0]

# Standard deviation (Poisson property)
std_dev = np.sqrt(predicted_time)

# Confidence interval: ± 1 day
confidence_interval = (predicted_time - std_dev, predicted_time + std_dev)

# Probability of exact predicted days
exact_days = int(round(predicted_time))
probability_exact_days = poisson.pmf(exact_days, mu=predicted_time) * 100

# Format output
print(f"Predicted time to close the task: {exact_days} days (probability {probability_exact_days:.2f}%)")
print(f"Predicted time to close the task: {round(predicted_time)} days ± {round(std_dev)} days")


Predicted time to close the task: 27 days (probability 7.64%)
Predicted time to close the task: 27 days ± 5 days
