<a href="https://colab.research.google.com/github/melihcgn/Algorithms---HW4/blob/master/CS412_TermProject_Codes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Random Forest

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Read the labelled training set and the unlabelled test set.
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# The bug summary text is the only feature; severity is the target.
X_train, y_train = train_data['summary'], train_data['severity']
X_test = test_data['summary']

# Turn the severity labels into integer codes for the classifier.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

# TF-IDF features feeding a 100-tree random forest (fixed seed).
pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(n_estimators=100, random_state=42),
)
pipeline.fit(X_train, y_train_encoded)

# Predict integer codes for the test summaries and decode them back to labels.
y_test_pred_encoded = pipeline.predict(X_test)
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# Attach the predicted labels and remove the free-text column from the output.
test_data['severity'] = y_test_pred
test_data = test_data.drop(columns=['summary'])

# Persist the predictions, then echo them.
test_data.to_csv('bugs-test-predictions_rndforest.csv', index=False)

print(test_data)

## Weighted voting / Soft voting

In [None]:
import pandas as pd

# Weighted label voting across the per-model prediction files.
# Every file must contain a `bug_id` column and a predicted-label column
# named `severity` (the AdaBoost file uses `predicted_severity` instead;
# it is normalised below so that its votes are not silently dropped).

# Load prediction CSVs from different models
bugs_testpredictions_lightgbm_submission = pd.read_csv('bugs-test-predictions-lightgbm-submission.csv')
bugs_test_predictions_rf_submission = pd.read_csv('bugs-test-predictions-rf-submission.csv')
bugs_test_predictions_logreg_submission = pd.read_csv('bugs-test-predictions-logreg-submission.csv')
bugs_test_predictions = pd.read_csv('bugs-test-predictions.csv')
bugs_test_predictions_lr = pd.read_csv('bugs-test-predictions_lr.csv')
bugs_test_predictions_gradientboost_woutsum = pd.read_csv('bugs-test-predictions_gradientboost_woutsum.csv')
test1_predictions_SVM_OvUn = pd.read_csv('test1_predictions_SVM_OvUn.csv')
bugs_test_predictions_rndforest = pd.read_csv('bugs-test-predictions_rndforest.csv')
predicted_test_data_xgboost_vol2 = pd.read_csv('predicted_test_data_xgboost_vol2.csv')
predicted_severities = pd.read_csv('predicted_severities.csv')
bugs_test_predictions_mlp_woutsum = pd.read_csv('bugs-test-predictions_mlp_woutsum.csv')
final_predictions_with_adaboost_v100 = pd.read_csv('final_predictions_with_adaboost_v100.csv')

# Reference table of per-model scores. It is kept for documentation only:
# the vote below uses the positional `weights` list, not this dictionary.
accuracy_rates = {
    'lightgbm': 0.40854,
    'rf': 0.35649,
    'logreg': 0.42913,
    'ayc': 0.49414,
    'lr': 0.36980,
    'gradientboost': 0.40854,
    'rndforest': 0.60637,
    'xgboost': 0.50713,
    'ps': 0.49287,
    'ada100': 0.39476
}

# Combine dataframes into a list
dataframes = [bugs_testpredictions_lightgbm_submission,
              bugs_test_predictions_rf_submission,
              bugs_test_predictions_logreg_submission,
              bugs_test_predictions,
              bugs_test_predictions_lr,
              bugs_test_predictions_gradientboost_woutsum,
              bugs_test_predictions_rndforest,
              predicted_test_data_xgboost_vol2,
              predicted_severities,
              test1_predictions_SVM_OvUn,
              bugs_test_predictions_mlp_woutsum,
              final_predictions_with_adaboost_v100
              ]

# Define weights for each model (this list must correspond in order to your dataframe list)
weights = [0.40854, 0.35649, 0.42913, 0.49414, 0.36980, 0.40854, 0.60637, 0.50713, 0.49287, 0.27333, 0.28, 0.39476]
#weights = [1,1,1,1,1,1,1,1,1,1,1,1]

# zip() would silently truncate to the shorter list, dropping models from the vote.
if len(weights) != len(dataframes):
    raise ValueError(
        f"Expected one weight per prediction file, got {len(weights)} weights "
        f"for {len(dataframes)} files"
    )

# Despite its name this holds the square ROOT of every weight. It is an
# alternative weighting that is computed but not used by the vote below.
squared_weights = [rate**(1/2) for rate in weights]

# Reduce every frame to (bug_id, severity, weight) so that extra columns such
# as `summary` never take part in the aggregation.
weighted_votes = []
for position, (df, weight) in enumerate(zip(dataframes, weights)):
    votes = df
    if 'severity' not in votes.columns and 'predicted_severity' in votes.columns:
        votes = votes.rename(columns={'predicted_severity': 'severity'})
    missing = {'bug_id', 'severity'} - set(votes.columns)
    if missing:
        raise KeyError(f"Prediction file #{position} lacks column(s): {sorted(missing)}")
    weighted_votes.append(votes[['bug_id', 'severity']].assign(weight=weight))

# Step 3: Concatenate all DataFrames
all_data = pd.concat(weighted_votes, ignore_index=True)

# Step 4: Group by 'bug_id' and 'severity' and sum only the weights
result = all_data.groupby(['bug_id', 'severity'])[['weight']].sum()

# Step 5: Unstack the severity so each label becomes a column of summed weights
result_unstacked = result['weight'].unstack(fill_value=0)

# Find the severity with the maximum weighted score for each bug_id
final_severity = result_unstacked.idxmax(axis=1)

# Create a DataFrame for final results
final_results = pd.DataFrame({'bug_id': final_severity.index, 'severity': final_severity.values})

# Print or save the results
print(final_results)
# Optionally save to CSV
final_results.to_csv('final_predictions_softVote.csv', index=False)



## Adaboost

In [None]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder

# AdaBoost meta-model: learns to reproduce the `pred_severities` labels from
# the labels predicted by the other models for the same bug.

# Load prediction CSVs from different models
dataframes = {
    'lightgbm': pd.read_csv('bugs-test-predictions-lightgbm-submission.csv'),
    'rf': pd.read_csv('bugs-test-predictions-rf-submission.csv'),
    'logreg': pd.read_csv('bugs-test-predictions-logreg-submission.csv'),
    'generic': pd.read_csv('bugs-test-predictions.csv'),
    'lr': pd.read_csv('bugs-test-predictions_lr.csv'),
    'gradientboost': pd.read_csv('bugs-test-predictions_gradientboost_woutsum.csv'),
    'rndforest': pd.read_csv('bugs-test-predictions_rndforest.csv'),
    'xgboost': pd.read_csv('predicted_test_data_xgboost_vol2.csv'),
    'pred_severities': pd.read_csv('predicted_severities.csv'),
    'svm': pd.read_csv('test1_predictions_SVM_OvUn.csv'),
    'mlp': pd.read_csv('bugs-test-predictions_mlp_woutsum.csv')
}

target_column = 'pred_severities'   # column the meta-model is trained to predict
missing_label = 'missing'           # placeholder for a model that has no row for a bug

# Prepare the DataFrame with all predictions aligned on bug_id.
# Only bug_id and severity are kept so stray columns (e.g. `summary`) do not
# become features or collide during the merge.
all_predictions = None
for name, df in dataframes.items():
    df = df[['bug_id', 'severity']].rename(columns={'severity': name})
    if all_predictions is None:
        all_predictions = df
    else:
        all_predictions = pd.merge(all_predictions, df, on='bug_id', how='outer')

label_columns = [col for col in all_predictions.columns if col != 'bug_id']

# Rows without a target label cannot be trained on, but can still be predicted.
has_target = all_predictions[target_column].notna()

# Labels are categorical strings, so numeric interpolation does not apply;
# gaps are filled with an explicit placeholder category instead of the integer 0
# (mixing ints and strings makes LabelEncoder fail).
all_predictions[label_columns] = (
    all_predictions[label_columns].fillna(missing_label).astype(str)
)

# Encode the severities into numerical values.
# A single encoder is fitted on the union of all labels so that every column
# shares one mapping and inverse_transform decodes the predictions correctly.
label_encoder = LabelEncoder()
label_encoder.fit(all_predictions[label_columns].to_numpy().ravel())
for col in label_columns:
    all_predictions[col] = label_encoder.transform(all_predictions[col])

# Initialize AdaBoost
ada_boost = AdaBoostClassifier(n_estimators=100, random_state=42)

# Extract features and target. The target column is excluded from the features;
# keeping it would let the model read the answer directly (target leakage).
X = all_predictions.drop(columns=['bug_id', target_column])
y = all_predictions[target_column]

# Train AdaBoost on every row that has a target label
ada_boost.fit(X[has_target], y[has_target])

# Predict on the entire dataset
y_pred = ada_boost.predict(X)

# Convert numeric predictions back to labels
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Create a DataFrame for final results
final_results = pd.DataFrame({
    'bug_id': all_predictions['bug_id'],
    'predicted_severity': y_pred_labels
})

# Display the results
print(final_results)

# Optionally save to CSV
final_results.to_csv('final_predictions_with_adaboost_v100.csv', index=False)


## MLP

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Read the labelled training set and the unlabelled test set.
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Summary text is the single feature; severity is what we predict.
X_train, y_train = train_data['summary'], train_data['severity']
X_test = test_data['summary']

# Map severity labels to integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

# TF-IDF features into a small MLP: one hidden layer of 10 units, 10 iterations.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(10,), max_iter=10, random_state=42)
pipeline = make_pipeline(TfidfVectorizer(), mlp_classifier)
pipeline.fit(X_train, y_train_encoded)

# Predict integer codes for the test set and decode them to severity labels.
y_test_pred_encoded = pipeline.predict(X_test)
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# Store the predictions next to the original test columns.
test_data['severity'] = y_test_pred

# Write the predictions out and echo them.
test_data.to_csv('bugs-test-predictions_mlp.csv', index=False)

print(test_data)

## Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# Load the data
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Display first few rows of the datasets
print(train_data.head())
print(test_data.head())

# Map severity labels to numerical values
severity_mapping = {
    'enhancement': 0,
    'minor': 1,
    'normal': 2,
    'major': 3,
    'blocker': 4,
    'critical': 5
}

# Labels that are not in the mapping become NaN and are dropped just below.
train_data['severity'] = train_data['severity'].map(severity_mapping)

# Check for missing values and handle them
print(train_data.isnull().sum())
train_data = train_data.dropna(subset=['summary', 'severity'])

# Preprocess the data: the feature is the summary text followed by the bug id
X = train_data['summary'] + ' ' + train_data['bug_id'].astype(str)
y = train_data['severity']

# Split the data into training and validation sets
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train_split = vectorizer.fit_transform(X_train_split)
X_valid_split = vectorizer.transform(X_valid_split)

# Initialize and train the Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train_split, y_train_split)

# Predict on validation set
logreg_y_pred = logreg_model.predict(X_valid_split)

# Classification report
print("Logistic Regression Classification Report")
print(classification_report(y_valid_split, logreg_y_pred))

# Precision-Recall Curve (one-vs-rest, one curve per class).
# predict_proba columns follow logreg_model.classes_, so the column index is
# taken from there instead of assuming every mapped class was seen in training.
logreg_y_scores = logreg_model.predict_proba(X_valid_split)
logreg_precision = dict()
logreg_recall = dict()
logreg_average_precision = dict()
for column_index, raw_label in enumerate(logreg_model.classes_):
    class_label = int(raw_label)  # labels are floats after map(); keep integer keys
    is_class = (y_valid_split == raw_label)
    class_scores = logreg_y_scores[:, column_index]
    logreg_precision[class_label], logreg_recall[class_label], _ = precision_recall_curve(is_class, class_scores)
    logreg_average_precision[class_label] = average_precision_score(is_class, class_scores)

# Plot Precision-Recall curve for Logistic Regression
plt.figure(figsize=(10, 7))
for class_label in logreg_precision:
    plt.plot(logreg_recall[class_label], logreg_precision[class_label], lw=2,
             label=f'Class {class_label} (area = {logreg_average_precision[class_label]:0.2f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Logistic Regression Precision-Recall curve')
plt.legend(loc='best')
plt.show()

# Preprocess the test data.
# Test rows cannot be dropped (every bug needs a prediction), so a missing
# summary is replaced by an empty string; a NaN document would make
# vectorizer.transform fail.
test_data['text'] = test_data['summary'].fillna('').astype(str) + ' ' + test_data['bug_id'].astype(str)
X_test = vectorizer.transform(test_data['text'])

# Predict on test set
test_predictions = logreg_model.predict(X_test)

# Map numerical severity back to categorical
severity_mapping_reverse = {v: k for k, v in severity_mapping.items()}
test_data['severity'] = test_predictions
test_data['severity'] = test_data['severity'].map(severity_mapping_reverse)

# Create the submission DataFrame
submission = pd.DataFrame({
    'bug_id': test_data['bug_id'],
    'severity': test_data['severity']
})

# Save the submission DataFrame to a CSV file
submission.to_csv('bugs-test-predictions-logreg-submission.csv', index=False)

print("Predictions saved to bugs-test-predictions-logreg-submission.csv")

## LightGBM

In [None]:
import pandas as pd
import numpy as np
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import lightgbm as lgb
from imblearn.over_sampling import SMOTE

# LightGBM severity classifier: TF-IDF text features, SMOTE oversampling of the
# training split, a randomized hyper-parameter search, then a final fit with
# early stopping and a submission CSV.
# Assuming the files are named 'bugs-train.csv' and 'bugs-test.csv'
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Display first few rows of the datasets
print(train_data.head())
print(test_data.head())

# Map severity labels to numerical values (1-based in this cell)
severity_mapping = {
    'enhancement': 1,
    'minor': 2,
    'normal': 3,
    'major': 4,
    'blocker': 5,
    'critical': 6
}

# Labels that are not keys of severity_mapping become NaN here and are removed below.
train_data['severity'] = train_data['severity'].map(severity_mapping)

# Convert 'summary' and 'bug_id' to strings and combine them for feature extraction
# (astype(str) turns a missing summary into the literal text 'nan')
train_data['text'] = train_data['summary'].astype(str) + ' ' + train_data['bug_id'].astype(str)
test_data['text'] = test_data['summary'].astype(str) + ' ' + test_data['bug_id'].astype(str)

# Remove rows with NaN values in the target variable
train_data = train_data.dropna(subset=['severity'])

# TF-IDF Vectorization with fewer features
# NOTE(review): the vectorizer is fitted on all training rows before the
# train/validation split below, so validation text influences the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
X_train = vectorizer.fit_transform(train_data['text'])
y_train = train_data['severity']

X_test = vectorizer.transform(test_data['text'])

# Train-test split for validation
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Handle class imbalance with SMOTE (applied to the training split only;
# the validation split is left untouched)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_split, y_train_split)

# Define a smaller parameter grid for Randomized Search
param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [50, 100],
    'max_depth': [5, 10],
    'min_child_samples': [20, 50]
}

# Initialize the LightGBM model with force_col_wise parameter to avoid threading issues
lgb_model = lgb.LGBMClassifier(random_state=42, force_col_wise=True, verbose=2)

# Perform Randomized Search with fewer iterations
# (10 sampled configurations, 3-fold CV, scored by macro-averaged precision)
random_search = RandomizedSearchCV(estimator=lgb_model, param_distributions=param_grid, cv=3, n_iter=10, scoring='precision_macro', random_state=42, n_jobs=1)
random_search.fit(X_train_res, y_train_res)

# Best parameters
best_params = random_search.best_params_
print("Best parameters found by Randomized Search:", best_params)

# Train the model with the best parameters
# NOTE(review): the validation split drives early stopping here and is also
# the set the classification report below is computed on.
best_lgb_model = lgb.LGBMClassifier(**best_params, random_state=42, force_col_wise=True, verbose=2)
best_lgb_model.fit(
    X_train_res, y_train_res,
    eval_set=[(X_valid_split, y_valid_split)],
    eval_metric='logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# Predict on validation set
y_pred = best_lgb_model.predict(X_valid_split)

# Classification report
print(classification_report(y_valid_split, y_pred))

# Precision-Recall Curve (one-vs-rest per class)
# Column i of y_scores is paired with label i+1 because labels are 1-based;
# this assumes all six classes are present in the training data — TODO confirm.
y_scores = best_lgb_model.predict_proba(X_valid_split)
precision = dict()
recall = dict()
average_precision = dict()
for i in range(len(severity_mapping)):
    precision[i], recall[i], _ = precision_recall_curve(y_valid_split == i+1, y_scores[:, i])
    average_precision[i] = average_precision_score(y_valid_split == i+1, y_scores[:, i])

# Plot Precision-Recall curve
plt.figure(figsize=(10, 7))
for i in range(len(severity_mapping)):
    plt.plot(recall[i], precision[i], lw=2, label=f'Class {i+1} (area = {average_precision[i]:0.2f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('LightGBM Precision-Recall curve with Best Params')
plt.legend(loc='best')
plt.show()

# Predict on test set
test_pred = best_lgb_model.predict(X_test)
test_data['severity'] = test_pred

# Map numerical severity back to categorical (inverse of severity_mapping)
severity_mapping_reverse = {
    1: 'enhancement',
    2: 'minor',
    3: 'normal',
    4: 'major',
    5: 'blocker',
    6: 'critical'
}

# Convert the numerical severity back to categorical
test_data['severity'] = test_data['severity'].map(severity_mapping_reverse)

# Create the submission DataFrame
submission = pd.DataFrame({
    'bug_id': test_data['bug_id'],
    'severity': test_data['severity']
})

# Save the submission DataFrame to a CSV file
submission.to_csv('bugs-test-predictions-lightgbm-submission.csv', index=False)

print("Predictions saved to bugs-test-predictions-lightgbm-submission.csv")

## Gradient Boost

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Read the labelled training set and the unlabelled test set.
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Feature: summary text. Target: severity label.
X_train, y_train = train_data['summary'], train_data['severity']
X_test = test_data['summary']

# Integer-encode the severity labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

# TF-IDF features into a 100-stage gradient boosting classifier (fixed seed).
gradient_booster = GradientBoostingClassifier(n_estimators=100, random_state=42)
pipeline = make_pipeline(TfidfVectorizer(), gradient_booster)
pipeline.fit(X_train, y_train_encoded)

# Predict codes for the test summaries and decode them to severity labels.
y_test_pred_encoded = pipeline.predict(X_test)
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# Store the predictions next to the original test columns.
test_data['severity'] = y_test_pred

# Write the predictions out and echo them.
test_data.to_csv('bugs-test-predictions_gradientboost.csv', index=False)

print(test_data)

## Soft Voting (Logistic regression + random forest)

> Weighted soft-voting ensemble of a logistic-regression pipeline and a random-forest pipeline (weights 1 : 2).



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier

# Read the labelled training set and the unlabelled test set.
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Build the model input: bug id (as text) followed by the summary.
train_data['text'] = train_data['bug_id'].astype(str) + ' ' + train_data['summary']
test_data['text'] = test_data['bug_id'].astype(str) + ' ' + test_data['summary']

# Integer-encode the severity labels.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['severity'])
train_data['severity_encoded'] = label_encoder.transform(train_data['severity'])

# Features and target.
X_train, y_train = train_data['text'], train_data['severity_encoded']
X_test = test_data['text']

# Each base learner gets its own TF-IDF vectorizer inside its pipeline.
lr_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
]
rf_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
]
pipeline_lr = Pipeline(lr_steps)
pipeline_rf = Pipeline(rf_steps)

# Soft voting averages class probabilities; the random forest counts twice
# as much as the logistic regression.
base_estimators = [('lr', pipeline_lr), ('rf', pipeline_rf)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft', weights=[1, 2])

# Fit the ensemble and predict encoded labels for the test set.
voting_clf.fit(X_train, y_train)
predictions = voting_clf.predict(X_test)

# Decode to severity labels and attach them to the test frame.
predicted_severities = label_encoder.inverse_transform(predictions)
test_data['severity'] = predicted_severities

# Only the id and the predicted label are written out.
submission_columns = ['bug_id', 'severity']
test_data[submission_columns].to_csv('predicted_severities.csv', index=False)

print("Predictions saved to 'predicted_severities.csv'")

## Stacking ensemble, originally labelled "Hard Voting" (Logistic regression + random forest + Gradient Boosting)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

# Stacking ensemble (logistic regression + random forest + gradient boosting,
# combined by a logistic-regression meta-learner) on TF-IDF summary features.

# Load the training and test data
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Preprocess the data
X_train = train_data['summary']
y_train = train_data['severity']
X_test = test_data['summary']

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Split the training data for validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train_encoded, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
tfidf = TfidfVectorizer()

# Transform the training and validation data.
# The test matrix is built later, after the vectorizer has been refitted on
# the full training set, so that its columns match the final model.
X_train_tfidf = tfidf.fit_transform(X_train_split)
X_val_tfidf = tfidf.transform(X_val_split)

# Define base models
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Define the stacking classifier
estimators = [
    ('log_reg', log_reg),
    ('rf', rf),
    ('gb', gb)
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42)
)

# Train the stacking classifier
stacking_clf.fit(X_train_tfidf, y_train_split)

# Validate the model
y_val_pred = stacking_clf.predict(X_val_tfidf)
print(classification_report(y_val_split, y_val_pred, target_names=label_encoder.classes_))

# Train the stacking classifier on the full training data
X_train_full_tfidf = tfidf.fit_transform(X_train)
stacking_clf.fit(X_train_full_tfidf, y_train_encoded)

# Vectorize the test summaries with the refitted vocabulary. The classifier was
# trained on TF-IDF matrices, so it must be given TF-IDF features, not raw text.
X_test_tfidf = tfidf.transform(X_test)

# Predict the severity of all test bugs in a single batched call
y_test_pred_encoded = stacking_clf.predict(X_test_tfidf)

# Decode the predictions
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# Add the predictions to the test data
test_data['severity'] = y_test_pred

# Delete the "summary" column
test_data.drop(columns=['summary'], inplace=True)

# Save the predictions to a new CSV file
test_data.to_csv('bugs-test-predictions_hardVoting.csv', index=False)

print(test_data)

## SVM using oversampling (with SMOTE) and undersampling (with NearMiss)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Define preprocess_text function if not already defined
def preprocess_text(text):
    """Return a lower-cased version of ``text`` for vectorisation.

    Placeholder preprocessing: only case-folding is applied.

    Args:
        text: The raw summary. Usually a ``str``; missing CSV cells arrive as
            ``None`` or a float NaN.

    Returns:
        The lower-cased string. Missing values yield an empty string, and any
        other non-string value is converted with ``str()`` before lower-casing.
    """
    if isinstance(text, str):
        return text.lower()
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return str(text).lower()

# Linear SVM on TF-IDF summaries, trained on a class-rebalanced training split
# (NearMiss undersampling followed by SMOTE oversampling).

# Load training data
train_data = pd.read_csv('bugs-train.csv')
test1_data = pd.read_csv('bugs-test.csv')

# Apply preprocessing
train_data['summary_clean'] = train_data['summary'].apply(preprocess_text)
test1_data['summary_clean'] = test1_data['summary'].apply(preprocess_text)

# Vectorize the text data
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(train_data['summary_clean'])
X_test1_tfidf = tfidf.transform(test1_data['summary_clean'])

# Hold out the validation set BEFORE any resampling. Resampling first would
# evaluate the model on an artificially balanced set, so the reported metrics
# would not reflect the real class distribution.
X_train_part, X_validation, y_train_part, y_validation = train_test_split(
    X_train_tfidf,
    train_data['severity'],
    test_size=0.25,
    random_state=42,
    stratify=train_data['severity'],
)

# Apply NearMiss undersampling to the training part only
nm1 = NearMiss(version=1)
X_undersampled, y_undersampled = nm1.fit_resample(X_train_part, y_train_part)

# Apply SMOTE oversampling to the undersampled training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_undersampled, y_undersampled)

# The rebalanced training part is what the model is fitted on
X_train_resampled, y_train_resampled = X_resampled, y_resampled

# Train SVM model with the best parameters
# (gamma is kept from the original configuration; it has no effect with a linear kernel)
best_svm = SVC(C=1, gamma=0.01, kernel='linear', probability=True)
best_svm.fit(X_train_resampled, y_train_resampled)

# Predict severity labels for the untouched validation data
y_pred_validation = best_svm.predict(X_validation)

# Evaluate the model on validation data
precision = precision_score(y_validation, y_pred_validation, average='weighted')
recall = recall_score(y_validation, y_pred_validation, average='weighted')
f1 = f1_score(y_validation, y_pred_validation, average='weighted')
accuracy = accuracy_score(y_validation, y_pred_validation)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Accuracy:", accuracy)

# Predict severity labels for test data
test_predictions = best_svm.predict(X_test1_tfidf)

# Create a DataFrame with bug_id and predicted severity
final_predictions = pd.DataFrame({
    'bug_id': test1_data['bug_id'],
    'severity': test_predictions
})

# Save the final predictions to a CSV file
final_predictions.to_csv('final_predictions.csv', index=False)


Precision: 0.5671602064209946
Recall: 0.5647921760391198
F1-Score: 0.5641976945045722
Accuracy: 0.5647921760391198


## Xgboost

In [None]:
import pandas as pd

# Read the labelled training set and the unlabelled test set.
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Severity label -> integer code, numbered 0..5 in the listed order.
severity_mapping = {
    label: code
    for code, label in enumerate(
        ['enhancement', 'minor', 'normal', 'major', 'blocker', 'critical']
    )
}

# Replace the text labels by their codes; labels outside the mapping become NaN.
train_data['severity'] = train_data['severity'].map(severity_mapping)

# Keep only the rows whose severity could be mapped.
has_severity = train_data['severity'].notna()
train_data_clean = train_data[has_severity]

print(train_data_clean.head())


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# TF-IDF features (English stop words removed) feeding an XGBoost classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

# After map() the severity codes are floats (the column had to hold NaN);
# cast them back to integer class ids before training.
X_train, X_val, y_train, y_val = train_test_split(
    train_data_clean['summary'],
    train_data_clean['severity'].astype(int),
    test_size=0.2,
    random_state=42,
)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)

# Pass `labels` explicitly so that target_names always lines up with the class
# ids, even when a rare class is absent from the validation split (without it
# classification_report raises on the length mismatch). zero_division=0 reports
# 0 for classes that have no predicted or true samples.
evaluation_report = classification_report(
    y_val,
    y_pred,
    labels=list(severity_mapping.values()),
    target_names=list(severity_mapping.keys()),
    output_dict=True,
    zero_division=0,
)

# Print evaluation report
print(evaluation_report)

# Predict class ids for the test summaries and map them back to label text
test_data['severity'] = pipeline.predict(test_data['summary'])
test_data['severity'] = test_data['severity'].map({v: k for k, v in severity_mapping.items()})

# Save the predictions to a new CSV file
predicted_test_data = test_data[['bug_id', 'summary', 'severity']]
predicted_test_data.to_csv('predicted_test_data_xgboost.csv', index=False)