In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score
import joblib
import traceback
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the data
data = pd.read_csv('bugs-train.csv')

# Severity mapping
severity_mapping = {
    'trivial': 0,
    'enhancement': 1,
    'minor': 2,
    'normal': 3,
    'major': 4,
    'blocker': 5,
    'critical': 6
}

# Map severity values to integers
data['severity'] = data['severity'].map(severity_mapping)

print(data['severity'].unique())
# Extract features using n-gram vectorizer
X = data['summary']
y = data['severity']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Apply TF-IDF transformation
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Initialize the Logistic Regression classifier
clf = LogisticRegression(solver='saga', max_iter=5000)

# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.01, 0.1, 1, 10],
    'l1_ratio': [0, 0.25, 0.5, 0.75, 1]  # This is only used if penalty is 'elasticnet'
}

# Scoring method
scorer = make_scorer(precision_score, average='macro')

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=scorer, cv=3, verbose=2, n_jobs=-1)

# Perform grid search
try:
    print(f"Performing Grid Search...")
    grid_search.fit(X_tfidf, y)
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best macro precision score: {grid_search.best_score_}")
except Exception as e:
    print(f"Error during Grid Search: {e}")
    traceback.print_exc()

# Fit the final model with the best parameters
best_clf = grid_search.best_estimator_
best_clf.fit(X_tfidf, y)

# Save the trained model
model_path = 'best_lr.pkl'
joblib.dump(best_clf, model_path)
print(f"Best model saved to {model_path}")

# Optionally, you can also save the n-gram vectorizer if needed for future use
vectorizer_path = 'tfidf_vectorizer_lr.pkl'
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"TF-IDF vectorizer saved to {vectorizer_path}")

[3 6 4 1 5 0 2]
Performing Grid Search...
Fitting 3 folds for each of 60 candidates, totalling 180 fits


KeyboardInterrupt: 