PoC which attempts to size issues based on our team's sizing history. To train the model, export from Jira, as CSV, the issues that have a sizing in "Custom field (Story Points)"; the more issues, the better. Save the CSV as `training_issues.csv`. Similarly, export the issues that lack a sizing as a CSV named `unsized_issues.csv`. This notebook trains on each issue's summary, description, and issue type. Predictions are made across 14 classes (story-point values 0 through 13), with a softmax output layer assigning a probability to each class.

Note: Sizing practices vary widely from team to team, so each team needs to train its own model on issues from its own project.

In [None]:
import pandas as pd

# Load the Jira CSV export of already-sized issues; this is the training set.
df = pd.read_csv("training_issues.csv")

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy "en_core_web_sm" pipeline once; preprocess_text() reuses this
# shared instance for every piece of text it processes.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``.
    Punctuation tokens and stop words are discarded; every remaining token
    contributes its lower-cased lemma.

    Args:
        text: The string to normalise.

    Returns:
        The kept lemmas joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no signal for sizing.
        if tok.is_punct or tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return ' '.join(kept_lemmas)

def getVectorized(col, df_in):
    """Return a dense TF-IDF feature frame for one text column.

    A fresh ``TfidfVectorizer`` is fitted on every call, so the vocabulary
    (and therefore the output columns) depends only on the rows in ``df_in``.

    Args:
        col: Name of the text column in ``df_in`` to vectorise.
        df_in: DataFrame containing ``col``.

    Returns:
        A DataFrame with one row per input row and one column per vocabulary
        term, holding the TF-IDF weights.
    """
    # Lemmatise / strip stop words before handing the text to the vectoriser.
    cleaned_text = df_in[col].apply(preprocess_text)

    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(cleaned_text)

    dense_weights = weights.toarray()
    term_names = tfidf.get_feature_names_out()
    return pd.DataFrame(dense_weights, columns=term_names)

def prepareDataframe(df_in):
    """Turn a raw Jira export into a numeric feature frame.

    The summary and description are concatenated into a single text field and
    converted to TF-IDF columns via ``getVectorized``; the issue type is
    one-hot encoded. The story-point column is carried through unchanged so
    callers can split it off as the target.

    Args:
        df_in: DataFrame containing at least the columns "Description",
            "Summary", "Issue Type" and "Custom field (Story Points)".
            It is not modified.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the story-point column,
        the one-hot issue-type columns and the TF-IDF columns.

    Raises:
        KeyError: If any of the required columns is missing from ``df_in``.
    """
    required = ["Description", "Summary", "Issue Type", "Custom field (Story Points)"]

    # Work on an explicit copy: assigning columns on a slice of the caller's
    # frame triggers pandas' chained-assignment warning. The index is reset
    # because getVectorized() returns a frame with a default RangeIndex and
    # the column-wise concat below aligns rows on the index.
    df_work = df_in[required].copy().reset_index(drop=True)

    # Missing text must become "" in BOTH fields; NaN + str stays NaN and a
    # NaN reaching the spaCy pipeline raises.
    for text_col in ("Description", "Summary"):
        df_work[text_col] = df_work[text_col].fillna("").astype(str)
    df_work["issue_text"] = df_work["Summary"] + " " + df_work["Description"]

    df_vect = getVectorized("issue_text", df_work)

    df_cat = pd.get_dummies(df_work, columns=["Issue Type"])
    # The raw text columns are now represented by the TF-IDF features.
    df_cat = df_cat.drop(columns=["Description", "Summary", "issue_text"])

    return pd.concat([df_cat, df_vect], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

# Build the numeric feature frame (target column + one-hot issue type + TF-IDF).
df_full = prepareDataframe(df)

# Every column except the story-point target is a model input.
x_cols = df_full.columns[df_full.columns != "Custom field (Story Points)"]

x = df_full[x_cols].values
df_full["Custom field (Story Points)"] = df_full["Custom field (Story Points)"].astype("float")
y = df_full["Custom field (Story Points)"].values
# Hold out 30% of the issues for validation. No random_state is passed, so the
# split (and therefore the reported accuracy) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# One-hot encode the story points into num_classes columns (class index ==
# story-point value).
# NOTE(review): this presumably requires every label to be a whole number in
# 0..13; to_categorical appears to cast labels to integers, so fractional
# sizes would be truncated and missing values would not survive - confirm
# against the exported data.
num_classes = 14
y_train_encoded = to_categorical(y_train, num_classes=num_classes)
y_test_encoded = to_categorical(y_test, num_classes=num_classes)

# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import backend
import tensorflow as tf
from tensorflow.keras.optimizers import Adamax
import matplotlib.pyplot as plt

#np.random.seed(42)

#import random
#random.seed(42)

#tf.random.set_seed(42)

# Reset Keras' global state so re-running this cell starts from a clean slate
# instead of stacking a new model on top of the previous run's.
backend.clear_session()

# Fully connected classifier: three ReLU hidden layers with light dropout,
# ending in a softmax over the story-point classes.
model = Sequential([
    Dense(256, input_dim=X_train.shape[1], activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.1),
    # The output width is taken from num_classes rather than repeated as a
    # literal, so it always matches the width of the one-hot encoded labels.
    Dense(num_classes, activation='softmax'),
])

opt = Adamax()

# Multi-class classification over one-hot labels, hence a categorical loss.
model.compile(optimizer=opt, loss='categorical_focal_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split doubles as the validation set.
training_stats = model.fit(X_train, y_train_encoded, epochs=100, batch_size=32, validation_data=(X_test, y_test_encoded))

model.summary()

# Plot training vs. validation accuracy per epoch to make over-fitting visible.
plt.plot(training_stats.history['accuracy'])
plt.plot(training_stats.history['val_accuracy'])
plt.title('Accuracy vs Epochs')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.show()

In [None]:
# Featurise the unsized issues exactly as the training issues were featurised.
df_input = pd.read_csv("unsized_issues.csv")
df_input_prepared = prepareDataframe(df_input)

# The TF-IDF vocabulary and issue types of the unsized export differ from the
# training export, so collect every training column that is absent here.
available_columns = df_input_prepared.columns
missing_features = [
    feature for feature in df_full.columns
    if feature not in available_columns
]

# Absent features get a zero-filled placeholder column.
missing_data = pd.DataFrame(0, index=df_input_prepared.index, columns=missing_features)

# Add the placeholders, then keep only the training columns in training order
# (this also discards terms the model never saw).
new_data_complete = pd.concat([df_input_prepared, missing_data], axis=1)[df_full.columns]

# The story-point target is what we are predicting, not an input.
new_data_complete = new_data_complete.drop(columns=["Custom field (Story Points)"])

print(new_data_complete.columns)

In [None]:
# Scale the unsized issues with the scaler that was fitted on the TRAINING
# split. Fitting a fresh StandardScaler here would standardise each feature
# with the inference set's own mean/variance, so the model would receive inputs
# on a different scale from the one it was trained on.
# The training scaler was fitted on a plain array, so pass a plain array here
# too (columns are already in training order).
new_data_scaled = scaler.transform(new_data_complete.to_numpy(dtype=float))
predictions = model.predict(new_data_scaled)

In [None]:
import numpy as np

# The predicted size for each issue is the index of its most probable class.
scores = [np.argmax(class_probabilities) for class_probabilities in predictions]

score_df = pd.DataFrame(scores, columns=["sizing"])

# Put the prediction next to the original issue fields, drop columns that are
# empty for every issue, and write the result out.
output_df = pd.concat([df_input, score_df], axis=1).dropna(axis=1, how='all')
output_df.to_csv("auto_sized.csv")
