In [7]:
from google.colab import drive
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import re
import string
from string import punctuation

In [3]:
# Attach Google Drive to the Colab runtime at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
%cd drive/MyDrive/Cs412/project_files

/content/drive/MyDrive/Cs412/project_files


In [5]:
# Read the training and test bug reports from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('bugs-train.csv', 'bugs-test.csv')
)

In [8]:
def clean_text(text):
    """Normalize a bug summary before TF-IDF vectorization.

    Steps, applied in order: lowercase; remove ``[...]`` bracketed spans;
    remove ``http(s)://`` and ``www.`` URLs; remove ``<...>`` tags; strip
    ASCII punctuation; remove newline characters; remove every token that
    contains a digit.

    Args:
        text: The raw summary. Any non-string value (for example ``None`` or
            a float NaN) is treated as an empty summary.

    Returns:
        str: The cleaned text. Whitespace left behind by removed spans is
        not collapsed.
    """
    # Non-string input has no .lower(); return an empty summary instead of
    # raising AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Raw strings keep regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    # Drops whole tokens containing at least one digit (e.g. "v2", "1234").
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [9]:
# Drop incomplete training rows *before* cleaning so that clean_text only ever
# receives real strings. `.copy()` makes the result an independent frame, so
# the column assignment below does not target a slice of the original.
train_df = train_df.dropna().copy()
train_df['summary'] = train_df['summary'].apply(clean_text)

# Test rows are all kept, so a missing summary is treated as an empty string.
test_df['summary'] = test_df['summary'].fillna('').apply(clean_text)

print(train_df.head())

   bug_id                                            summary  severity
0  365569                        remove workaround from bug     normal
1  365578        print preview crashes on any url in  builds  critical
2  365582                     lines are not showing in table     major
3  365584  firefox render ûïsimplified arabicû font fa...    normal
4  365597                                             crash   critical


In [10]:
# Integer class id assigned to each severity label; the classifier is trained
# on these ids and the prediction step inverts this mapping.
severity_mapping = {
    'trivial': 0,
    'enhancement': 1,
    'minor': 2,
    'normal': 3,
    'major': 4,
    'blocker': 5,
    'critical': 6
}

# Fail fast on labels that are missing from the mapping: `.map` would turn
# them into NaN, producing float labels and a far less obvious error later.
severity_codes = train_df['severity'].map(severity_mapping)
unknown_labels = sorted(
    train_df.loc[severity_codes.isna(), 'severity'].astype(str).unique()
)
if unknown_labels:
    raise ValueError(f"Unmapped severity labels in training data: {unknown_labels}")

train_df['severity_mapped'] = severity_codes.astype(int)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training summaries.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_df['summary'])
y_train = train_df['severity_mapped']

# Train an XGBoost model with the entire training dataset (no validation split).
# NOTE(review): `use_label_encoder` is documented as deprecated in recent
# XGBoost releases - confirm against the installed version before removing it.
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train_tfidf, y_train)

In [12]:
# Vectorize the test summaries with the vocabulary fitted on the training
# data, then predict an integer class id for every row.
X_test_tfidf = vectorizer.transform(test_df['summary'])
test_df['severity_mapped'] = model.predict(X_test_tfidf)

# Map the severity values back to their string representations
inverse_severity_mapping = {v: k for k, v in severity_mapping.items()}
test_df['severity'] = test_df['severity_mapped'].map(inverse_severity_mapping)

# Normalize the id column name; the rename is a no-op when the column is
# already called `bug_id`. Rebinding avoids mutating the frame in place.
test_df = test_df.rename(columns={"bug id": "bug_id"})

# Create the output CSV file with bug id and predicted severity
output_data = test_df[['bug_id', 'severity']]
output_file_path = 'predictions.csv'
output_data.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

Predictions saved to predictions.csv
