In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd drive/MyDrive/Cs412/project_files

/content/drive/MyDrive/Cs412/project_files


In [8]:
%ls


 bugs-test.csv                       randomforest_predictions_2.csv           submission_0_51.csv
 bugs-train.csv                      random_forest_smote_model.pkl            submission_0_55.csv
 lgbm_p-_sm-.ipynb                   random_forest_smote_model_v1_0.pkl       submission.csv
 ngram_vectorizer.pkl                random_forest_smote_model_v2_tfidf.pkl   tfidf_vectorizer.pkl
'predictions_02 06 2024_02_49.csv'   [0m[01;34mresults[0m/                                 train_prep.csv
 predictions.csv                    'submission_02 06 2024.csv'               Untitled0.ipynb


In [15]:
from sklearn.metrics import classification_report, make_scorer, precision_score

scorer = make_scorer(precision_score, average='macro')


In [None]:
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Separate features and target
X_train = train_data['summary']  # Text column
y_train = train_data['severity']  # Target column
X_test = test_data['summary']  # Test text column


# Set up the TF-IDF vectorizer and logistic regression model with elastic net regularization
pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegressionCV(
        cv=5,
        penalty='elasticnet',
        solver='saga',
        l1_ratios=[.5],  # Elastic Net mixing parameter, 0 <= l1_ratio <= 1
        max_iter=10000,
        scoring=scorer,
        n_jobs=-1,
        verbose=3
    )
)

# Perform cross-validation and get predictions for each fold
y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=5)

# Train the model on the entire training set
pipeline.fit(X_train, y_train)

# Get the final predictions on the test set
test_predictions = pipeline.predict(X_test)

# Print classification report for cross-validation predictions
print("Classification Report for Cross-Validation Predictions:")
print(classification_report(y_train, y_pred_cv))

# Print classification report for each fold
for i, (train_index, test_index) in enumerate(pipeline.named_steps['logisticregressioncv'].cv_):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    pipeline_fold = make_pipeline(
        TfidfVectorizer(),
        LogisticRegressionCV(
            penalty='elasticnet',
            solver='saga',
            l1_ratios=[.5],
            max_iter=10000,
            scoring='accuracy'
        )
    )
    pipeline_fold.fit(X_train_fold, y_train_fold)
    y_pred_fold = pipeline_fold.predict(X_test_fold)
    print(f"Classification Report for Fold {i+1}:")
    print(classification_report(y_test_fold, y_pred_fold))

print("Model training and prediction completed.")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


rescaling...rescaling...
rescaling...

rescaling...rescaling...
rescaling...rescaling...


rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...rescaling...
rescaling...
rescaling...

rescaling...
rescaling...rescaling...
rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...rescaling...


rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

In [14]:
!pip install cudf cuml cugraph


Collecting cudf
  Using cached cudf-0.6.1.post1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cuml
  Using cached cuml-0.6.1.post1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cugraph
  Using cached cugraph-0.6.1.post1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: cudf, cuml, cugraph
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for cudf (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for cudf[0m[31m
[0m[?25h  Running setup.py clean for cudf
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run

In [12]:
import cudf
from cuml.linear_model import LogisticRegression
from cuml.feature_extraction.text import TfidfVectorizer


# Load the datasets
print("Loading datasets...")
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Convert pandas DataFrame to cuDF DataFrame
train_data_cudf = cudf.DataFrame.from_pandas(train_data)
test_data_cudf = cudf.DataFrame.from_pandas(test_data)

# Separate features and target
X_train = train_data_cudf['summary']  # Text column
y_train = train_data_cudf['severity']  # Target column
X_test = test_data_cudf['summary']  # Test text column

# Initialize TF-IDF vectorizer
print("Applying TF-IDF vectorizer...")
tfidf = TfidfVectorizer()

# Transform the data using TF-IDF
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Set up logistic regression model with elastic net regularization
model = LogisticRegression(penalty='elasticnet', solver='qn', l1_ratio=0.5, max_iter=10000)

# Initialize K-Fold cross-validation
kf = KFold(n_splits=5)

# Perform cross-validation
y_pred_cv = []
y_true_cv = []
best_precision = 0
best_model = None

print("Starting cross-validation...")
for i, (train_index, val_index) in enumerate(kf.split(X_train_tfidf)):
    print(f"Fold {i+1}/{kf.get_n_splits()}")
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    model.fit(X_train_fold, y_train_fold)
    y_pred_fold = model.predict(X_val_fold)

    y_pred_cv.extend(y_pred_fold.to_array())
    y_true_cv.extend(y_val_fold.to_array())

    # Calculate macro average precision for the current fold
    precision = precision_score(y_val_fold.to_array(), y_pred_fold.to_array(), average='macro')
    print(f"Macro Average Precision for Fold {i+1}: {precision}")

    # Update the best model if the current model is better
    if precision > best_precision:
        best_precision = precision
        best_model = model

    print(f"Completed Fold {i+1}")

# Print classification report for cross-validation predictions
print("Classification Report for Cross-Validation Predictions:")
print(classification_report(y_true_cv, y_pred_cv))

# Train the best model on the entire training set
print(f"Best Macro Average Precision: {best_precision}")
print("Training the best model on the entire training set...")
best_model.fit(X_train_tfidf, y_train)

# Get the final predictions on the test set
print("Predicting on the test set with the best model...")
test_predictions = best_model.predict(X_test_tfidf)

# Convert predictions to pandas DataFrame and save
dummy_submission = pd.read_csv('dummy_submission.csv')
dummy_submission['severity'] = test_predictions.to_array()
dummy_submission.to_csv('/content/drive/MyDrive/Cs412/project_files/bugs_test_predictions.csv', index=False)

print("Model training and prediction completed.")


Loading datasets...
Setting up the pipeline...
Performing cross-validation...
Fold 1/5


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...

KeyboardInterrupt: 