## Purpose of this Notebook: 
#### The purpose of this notebook is simply evaluating the results for pytorch with title and body, pytorch with title, tensorflow with title and body, and tensorflow with title. This will be helpful to choose the best model for our usecase to identify whether the issues are buggy or not. 


### 1.Using Bert Embedding

In [1]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import ast
import numpy as np
col1 = 'BERT Embedding'
col2 = 'Issue Title'

In [2]:
data_torch_Title_bert = pd.read_csv('./Data/Data_with_Embeddings/GT_bert_data_pytorch.csv')
data_tf_Title_bert    = data = pd.read_csv('./Data/Data_with_Embeddings/GT_bert_data_tf.csv')

#title and body together as a vector of embeddings
data_tf_concat_bert = pd.read_csv('./Data/Data_with_Embeddings/GT_bert_concat_data_tf.csv')
data_torch_concat_bert = pd.read_csv('./Data/Data_with_Embeddings/GT_bert_concat_data_torch.csv')

#### Some Helpful Functions for preprocessing the data

In [3]:
import re
def remove_extra_commas(input_string):
    # Use regular expression to replace multiple commas with a single comma
    cleaned_string = re.sub(',+', ',', input_string)
    return cleaned_string


def modify(input_string):
    try:
        input_string = input_string.replace('\n', '').replace(' ', ',').replace(":", "")
        input_string = remove_extra_commas(input_string)
        if(input_string[1] == ','):
            input_string = input_string[0] + input_string[2:]
        input_string = ast.literal_eval(input_string.replace(' ',''))
    except:
        print(input_string)
    return input_string



def training(model, X,y):
    # model = LogisticRegression()

    # Define scoring metrics
    scoring_metrics = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'f1_score': make_scorer(f1_score)
    }

    # Perform 5-fold cross-validation
    # You can adjust the 'cv' parameter to change the number of folds
    scores = cross_validate(model, X, y, cv=5, scoring=scoring_metrics)

    # Print the cross-validation scores
    print("Cross-Validation Accuracy Scores:", scores['test_accuracy'])
    print("Mean Accuracy:", scores['test_accuracy'].mean())

    print("\nCross-Validation Precision Scores:", scores['test_precision'])
    print("Mean Precision:", scores['test_precision'].mean())

    print("\nCross-Validation Recall Scores:", scores['test_recall'])
    print("Mean Recall:", scores['test_recall'].mean())

    print("\nCross-Validation F1 Scores:", scores['test_f1_score'])
    print("Mean F1 Score:", scores['test_f1_score'].mean())
    
def preprocessEmbeddingsTitle(data, col):
    X = data[col].apply(lambda x: modify(x))
    X = np.asarray(X.values.tolist(), dtype=np.float32)
    #Given data['Is Bug']make them 0 and 1
    data['Is Bug'] = data['Is Bug'].apply(lambda x: 1 if x == True else 0)
    #get it as numpy array
    y = np.asarray(data['Is Bug'], dtype=np.uint8)
    return X,y
    

#### Result 1: PyTorch with Title Only

In [4]:
#Logistic Regression
X,y = preprocessEmbeddingsTitle(data_torch_Title_bert, col2)
training(LogisticRegression(), X,y)

Cross-Validation Accuracy Scores: [0.87179487 0.94871795 0.87179487 0.85714286 0.83116883]
Mean Accuracy: 0.8761238761238761

Cross-Validation Precision Scores: [0.87179487 0.97297297 0.87179487 0.83333333 0.82051282]
Mean Precision: 0.8740817740817741

Cross-Validation Recall Scores: [0.87179487 0.92307692 0.87179487 0.8974359  0.84210526]
Mean Recall: 0.8812415654520919

Cross-Validation F1 Scores: [0.87179487 0.94736842 0.87179487 0.86419753 0.83116883]
Mean F1 Score: 0.8772649053350807


In [5]:
#Decision Tree
X,y = preprocessEmbeddingsTitle(data_torch_Title_bert, col2)
training(DecisionTreeClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.75641026 0.85897436 0.71794872 0.74025974 0.81818182]
Mean Accuracy: 0.7783549783549784

Cross-Validation Precision Scores: [0.73809524 0.91176471 0.75757576 0.77142857 0.8       ]
Mean Precision: 0.7957728545963839

Cross-Validation Recall Scores: [0.79487179 0.79487179 0.64102564 0.69230769 0.84210526]
Mean Recall: 0.7530364372469636

Cross-Validation F1 Scores: [0.7654321  0.84931507 0.69444444 0.72972973 0.82051282]
Mean F1 Score: 0.7718868323891155


In [6]:
#Random Forest
X,y = preprocessEmbeddingsTitle(data_torch_Title_bert, col2)
training(RandomForestClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.84615385 0.94871795 0.88461538 0.81818182 0.81818182]
Mean Accuracy: 0.8631701631701632

Cross-Validation Precision Scores: [0.7755102  0.94871795 0.82608696 0.74509804 0.75      ]
Mean Precision: 0.8090826297074013

Cross-Validation Recall Scores: [0.97435897 0.94871795 0.97435897 0.97435897 0.94736842]
Mean Recall: 0.9638326585695006

Cross-Validation F1 Scores: [0.86363636 0.94871795 0.89411765 0.84444444 0.8372093 ]
Mean F1 Score: 0.8776251412366323


In [7]:
from sklearn.naive_bayes import GaussianNB
#Naive Bayes
X,y = preprocessEmbeddingsTitle(data_torch_Title_bert, col2)
training(GaussianNB(), X,y)

Cross-Validation Accuracy Scores: [0.85897436 0.8974359  0.84615385 0.77922078 0.80519481]
Mean Accuracy: 0.8373959373959374

Cross-Validation Precision Scores: [0.79166667 0.87804878 0.78723404 0.7037037  0.7254902 ]
Mean Precision: 0.7772286778979597

Cross-Validation Recall Scores: [0.97435897 0.92307692 0.94871795 0.97435897 0.97368421]
Mean Recall: 0.9588394062078273

Cross-Validation F1 Scores: [0.87356322 0.9        0.86046512 0.8172043  0.83146067]
Mean F1 Score: 0.8565386619804893


#### Result 2: PyTorch with Title and Body

In [8]:
#Logistic Regression
X,y = preprocessEmbeddingsTitle(data_torch_concat_bert, col1)
training(LogisticRegression(), X,y)

Cross-Validation Accuracy Scores: [0.83333333 0.92307692 0.8974359  0.87012987 0.87012987]
Mean Accuracy: 0.8788211788211788

Cross-Validation Precision Scores: [0.84210526 0.97142857 0.87804878 0.82222222 0.85      ]
Mean Precision: 0.8727609674592985

Cross-Validation Recall Scores: [0.82051282 0.87179487 0.92307692 0.94871795 0.89473684]
Mean Recall: 0.8917678812415654

Cross-Validation F1 Scores: [0.83116883 0.91891892 0.9        0.88095238 0.87179487]
Mean F1 Score: 0.8805670005670005


In [9]:
#Decision Tree
X,y = preprocessEmbeddingsTitle(data_torch_concat_bert, col1)
training(DecisionTreeClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.67948718 0.74358974 0.78205128 0.75324675 0.75324675]
Mean Accuracy: 0.7423243423243424

Cross-Validation Precision Scores: [0.65909091 0.85185185 0.78947368 0.76315789 0.74358974]
Mean Precision: 0.7614328166959745

Cross-Validation Recall Scores: [0.74358974 0.58974359 0.76923077 0.74358974 0.76315789]
Mean Recall: 0.7218623481781377

Cross-Validation F1 Scores: [0.69879518 0.6969697  0.77922078 0.75324675 0.75324675]
Mean F1 Score: 0.7362958326813749


In [10]:
#Random Forest
X,y = preprocessEmbeddingsTitle(data_torch_concat_bert, col1)
training(RandomForestClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.79487179 0.88461538 0.83333333 0.83116883 0.84415584]
Mean Accuracy: 0.8376290376290377

Cross-Validation Precision Scores: [0.75555556 0.91666667 0.79545455 0.75       0.7826087 ]
Mean Precision: 0.8000570926657883

Cross-Validation Recall Scores: [0.87179487 0.84615385 0.8974359  1.         0.94736842]
Mean Recall: 0.9125506072874494

Cross-Validation F1 Scores: [0.80952381 0.88       0.84337349 0.85714286 0.85714286]
Mean F1 Score: 0.8494366035570854


In [11]:
from sklearn.naive_bayes import GaussianNB
#Naive Bayes
X,y = preprocessEmbeddingsTitle(data_torch_concat_bert, col1)
training(GaussianNB(), X,y)

Cross-Validation Accuracy Scores: [0.78205128 0.8974359  0.82051282 0.81818182 0.81818182]
Mean Accuracy: 0.8272727272727274

Cross-Validation Precision Scores: [0.72916667 0.8974359  0.77777778 0.75510204 0.75      ]
Mean Precision: 0.7818964765393337

Cross-Validation Recall Scores: [0.8974359  0.8974359  0.8974359  0.94871795 0.94736842]
Mean Recall: 0.9176788124156546

Cross-Validation F1 Scores: [0.8045977  0.8974359  0.83333333 0.84090909 0.8372093 ]
Mean F1 Score: 0.8426970650306655


#### Result 3: TensorFlow with Title Only

In [12]:
#Logistic Regression
X,y = preprocessEmbeddingsTitle(data_tf_Title_bert, col2)
training(LogisticRegression(), X,y)

Cross-Validation Accuracy Scores: [0.75362319 0.75362319 0.79411765 0.85294118 0.80882353]
Mean Accuracy: 0.7926257459505541

Cross-Validation Precision Scores: [0.73684211 0.77419355 0.83333333 0.92857143 0.83870968]
Mean Precision: 0.8223300185948743

Cross-Validation Recall Scores: [0.8        0.70588235 0.73529412 0.76470588 0.76470588]
Mean Recall: 0.7541176470588236

Cross-Validation F1 Scores: [0.76712329 0.73846154 0.78125    0.83870968 0.8       ]
Mean F1 Score: 0.7851089007104253


In [13]:
#Decision Tree
X,y = preprocessEmbeddingsTitle(data_tf_Title_bert, col2)
training(DecisionTreeClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.65217391 0.60869565 0.67647059 0.79411765 0.72058824]
Mean Accuracy: 0.6904092071611253

Cross-Validation Precision Scores: [0.63414634 0.6        0.71428571 0.83333333 0.74193548]
Mean Precision: 0.704740174590686

Cross-Validation Recall Scores: [0.74285714 0.61764706 0.58823529 0.73529412 0.67647059]
Mean Recall: 0.6721008403361346

Cross-Validation F1 Scores: [0.68421053 0.60869565 0.64516129 0.78125    0.70769231]
Mean F1 Score: 0.6854019553009182


In [14]:
#Random Forest
X,y = preprocessEmbeddingsTitle(data_tf_Title_bert, col2)
training(RandomForestClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.73913043 0.72463768 0.72058824 0.76470588 0.77941176]
Mean Accuracy: 0.745694799658994

Cross-Validation Precision Scores: [0.71794872 0.71428571 0.82608696 0.8        0.80645161]
Mean Precision: 0.7729546003318795

Cross-Validation Recall Scores: [0.8        0.73529412 0.55882353 0.70588235 0.73529412]
Mean Recall: 0.7070588235294119

Cross-Validation F1 Scores: [0.75675676 0.72463768 0.66666667 0.75       0.76923077]
Mean F1 Score: 0.7334583747627226


In [15]:
from sklearn.naive_bayes import GaussianNB
#Naive Bayes
X,y = preprocessEmbeddingsTitle(data_tf_Title_bert, col2)
training(GaussianNB(), X,y)

Cross-Validation Accuracy Scores: [0.75362319 0.72463768 0.66176471 0.70588235 0.75      ]
Mean Accuracy: 0.7191815856777494

Cross-Validation Precision Scores: [0.6875     0.7027027  0.7037037  0.81818182 0.79310345]
Mean Precision: 0.7410383345728173

Cross-Validation Recall Scores: [0.94285714 0.76470588 0.55882353 0.52941176 0.67647059]
Mean Recall: 0.6944537815126051

Cross-Validation F1 Scores: [0.79518072 0.73239437 0.62295082 0.64285714 0.73015873]
Mean F1 Score: 0.7047083563553508


#### Result 4: TensorFlow with Title and Body

In [16]:
#Logistic Regression
X,y = preprocessEmbeddingsTitle(data_tf_concat_bert, col1)
training(LogisticRegression(), X,y)

Cross-Validation Accuracy Scores: [0.73913043 0.79710145 0.77941176 0.82352941 0.79411765]
Mean Accuracy: 0.7866581415174766

Cross-Validation Precision Scores: [0.71794872 0.79411765 0.91304348 0.86666667 0.8125    ]
Mean Precision: 0.8208553019870155

Cross-Validation Recall Scores: [0.8        0.79411765 0.61764706 0.76470588 0.76470588]
Mean Recall: 0.7482352941176471

Cross-Validation F1 Scores: [0.75675676 0.79411765 0.73684211 0.8125     0.78787879]
Mean F1 Score: 0.7776190593915053


In [17]:
#Decision Tree
X,y = preprocessEmbeddingsTitle(data_tf_concat_bert, col1)
training(DecisionTreeClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.60869565 0.60869565 0.54411765 0.54411765 0.52941176]
Mean Accuracy: 0.567007672634271

Cross-Validation Precision Scores: [0.60526316 0.6        0.54285714 0.54545455 0.53333333]
Mean Precision: 0.5653816359079517

Cross-Validation Recall Scores: [0.65714286 0.61764706 0.55882353 0.52941176 0.47058824]
Mean Recall: 0.5667226890756303

Cross-Validation F1 Scores: [0.63013699 0.60869565 0.55072464 0.53731343 0.5       ]
Mean F1 Score: 0.5653741417984526


In [18]:
#Random Forest
X,y = preprocessEmbeddingsTitle(data_tf_concat_bert, col1)
training(RandomForestClassifier(), X,y)

Cross-Validation Accuracy Scores: [0.69565217 0.68115942 0.67647059 0.77941176 0.70588235]
Mean Accuracy: 0.7077152600170503

Cross-Validation Precision Scores: [0.75       0.6875     0.75       0.78787879 0.75      ]
Mean Precision: 0.7450757575757576

Cross-Validation Recall Scores: [0.6        0.64705882 0.52941176 0.76470588 0.61764706]
Mean Recall: 0.6317647058823529

Cross-Validation F1 Scores: [0.66666667 0.66666667 0.62068966 0.7761194  0.67741935]
Mean F1 Score: 0.6815123492659063


In [19]:
from sklearn.naive_bayes import GaussianNB
#Naive Bayes
X,y = preprocessEmbeddingsTitle(data_tf_concat_bert, col1)
training(GaussianNB(), X,y)

Cross-Validation Accuracy Scores: [0.50724638 0.53623188 0.75       0.72058824 0.60294118]
Mean Accuracy: 0.6234015345268543

Cross-Validation Precision Scores: [0.51724138 0.53571429 0.79310345 0.74193548 0.58974359]
Mean Precision: 0.63554763738301

Cross-Validation Recall Scores: [0.42857143 0.44117647 0.67647059 0.67647059 0.67647059]
Mean Recall: 0.5798319327731093

Cross-Validation F1 Scores: [0.46875    0.48387097 0.73015873 0.70769231 0.63013699]
Mean F1 Score: 0.6041217983788687


### 2. Using Bag Of Words

#### TensorFlow Results with Issue Title

In [20]:
#frequency with Bag of Words -> Naive Bayes
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.model_selection import KFold
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_tf_concat_bert['Issue Title'].values.tolist()
labels = data_tf_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Split the data into training and testing sets


# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),'precision': 'precision_macro','recall': 'recall_macro','f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_validate(model, preprocessed_documents, labels, cv=kf, scoring=scoring)

# Display the cross-validation results
print("Cross-validation Results:")
for i in range(kf.get_n_splits()):
    print(f"Fold {i+1} - Accuracy: {cv_results['test_accuracy'][i]}, Precision: {cv_results['test_precision'][i]}, Recall: {cv_results['test_recall'][i]}, F1: {cv_results['test_f1'][i]}")

print("\nMean Accuracy:", np.mean(cv_results['test_accuracy']))
#mean precision, recall, f1
print("Mean Precision:", np.mean(cv_results['test_precision']))
print("Mean Recall:", np.mean(cv_results['test_recall']))
print("Mean F1:", np.mean(cv_results['test_f1']))



[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results:
Fold 1 - Accuracy: 0.855072463768116, Precision: 0.8573232323232323, Recall: 0.8585304054054055, F1: 0.8550420168067226
Fold 2 - Accuracy: 0.7681159420289855, Precision: 0.7922727272727272, Recall: 0.7701680672268907, F1: 0.7641025641025643
Fold 3 - Accuracy: 0.8676470588235294, Precision: 0.8715277777777778, Recall: 0.8731473408892764, F1: 0.8676184295911746
Fold 4 - Accuracy: 0.8676470588235294, Precision: 0.8780701754385964, Recall: 0.8757628596338274, F1: 0.8676184295911745
Fold 5 - Accuracy: 0.8823529411764706, Precision: 0.8897922312556459, Recall: 0.8761987794245858, F1: 0.8797524314765695

Mean Accuracy: 0.8481670929241261
Mean Precision: 0.8577972288135959
Mean Recall: 0.8507614905159973
Mean F1: 0.8468267743136412


In [21]:
#frequency with Bag of Words -> Logistic Regression
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.model_selection import KFold
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_tf_concat_bert['Issue Title'].values.tolist()
labels = data_tf_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Split the data into training and testing sets


# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
model = make_pipeline(CountVectorizer(), LogisticRegression())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),'precision': 'precision_macro','recall': 'recall_macro','f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_validate(model, preprocessed_documents, labels, cv=kf, scoring=scoring)

# Display the cross-validation results
print("Cross-validation Results:")
for i in range(kf.get_n_splits()):
    print(f"Fold {i+1} - Accuracy: {cv_results['test_accuracy'][i]}, Precision: {cv_results['test_precision'][i]}, Recall: {cv_results['test_recall'][i]}, F1: {cv_results['test_f1'][i]}")

print("\nMean Accuracy:", np.mean(cv_results['test_accuracy']))
#mean precision, recall, f1
print("Mean Precision:", np.mean(cv_results['test_precision']))
print("Mean Recall:", np.mean(cv_results['test_recall']))
print("Mean F1:", np.mean(cv_results['test_f1']))



[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results:
Fold 1 - Accuracy: 0.8985507246376812, Precision: 0.8977272727272727, Recall: 0.8990709459459459, F1: 0.8982086406743941
Fold 2 - Accuracy: 0.8695652173913043, Precision: 0.8769230769230769, Recall: 0.8705882352941177, F1: 0.8691253951527924
Fold 3 - Accuracy: 0.9264705882352942, Precision: 0.9305555555555556, Recall: 0.9324324324324325, F1: 0.9264546831062082
Fold 4 - Accuracy: 0.9558823529411765, Precision: 0.9548611111111112, Recall: 0.9568439407149085, F1: 0.9556425309849967
Fold 5 - Accuracy: 0.9705882352941176, Precision: 0.970357454228422, Recall: 0.970357454228422, F1: 0.970357454228422

Mean Accuracy: 0.9242114236999148
Mean Precision: 0.9260848941090878
Mean Recall: 0.9258586017231654
Mean F1: 0.9239577408293627


In [22]:
#TFIDF with Bag of Words
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_tf_concat_bert['Issue Title'].values.tolist()
labels = data_tf_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Create a pipeline with a TfidfVectorizer and a Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': 'precision_macro',
           'recall': 'recall_macro',
           'f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring=make_scorer(accuracy_score))

# Display the cross-validation results
print("Cross-validation Results:", cv_results)
print("Mean Accuracy:", np.mean(cv_results))

# Additional: Calculate and print mean values for precision, recall, and f1 score
precision_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='precision_macro')
recall_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='recall_macro')
f1_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='f1_macro')

print("\nMean Precision:", np.mean(precision_results))
print("Mean Recall:", np.mean(recall_results))
print("Mean F1 Score:", np.mean(f1_results))


[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results: [0.8115942  0.69565217 0.85294118 0.80882353 0.82352941]
Mean Accuracy: 0.7985080988917306

Mean Precision: 0.8215539709459975
Mean Recall: 0.802272419647894
Mean F1 Score: 0.7941631229001876


In [23]:
#TFIDF with Bag of Words -> Logistic Regression
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_tf_concat_bert['Issue Title'].values.tolist()
labels = data_tf_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Create a pipeline with a TfidfVectorizer and a Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': 'precision_macro',
           'recall': 'recall_macro',
           'f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring=make_scorer(accuracy_score))

# Display the cross-validation results
print("Cross-validation Results:", cv_results)
print("Mean Accuracy:", np.mean(cv_results))

# Additional: Calculate and print mean values for precision, recall, and f1 score
precision_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='precision_macro')
recall_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='recall_macro')
f1_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='f1_macro')

print("\nMean Precision:", np.mean(precision_results))
print("Mean Recall:", np.mean(recall_results))
print("Mean F1 Score:", np.mean(f1_results))


[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results: [0.86956522 0.79710145 0.88235294 0.89705882 0.94117647]
Mean Accuracy: 0.8774509803921567

Mean Precision: 0.8825963127192843
Mean Recall: 0.879666709098635
Mean F1 Score: 0.8768171132067584


#### PyTorch Results with Issue Title

In [20]:
#frequency with Bag of Words -> Naive Bayes
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.model_selection import KFold
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_torch_concat_bert['Issue Title'].values.tolist()
labels = data_torch_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Split the data into training and testing sets


# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),'precision': 'precision_macro','recall': 'recall_macro','f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_validate(model, preprocessed_documents, labels, cv=kf, scoring=scoring)

# Display the cross-validation results
print("Cross-validation Results:")
for i in range(kf.get_n_splits()):
    print(f"Fold {i+1} - Accuracy: {cv_results['test_accuracy'][i]}, Precision: {cv_results['test_precision'][i]}, Recall: {cv_results['test_recall'][i]}, F1: {cv_results['test_f1'][i]}")

print("\nMean Accuracy:", np.mean(cv_results['test_accuracy']))
#mean precision, recall, f1
print("Mean Precision:", np.mean(cv_results['test_precision']))
print("Mean Recall:", np.mean(cv_results['test_recall']))
print("Mean F1:", np.mean(cv_results['test_f1']))



[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'cross_validate' is not defined

In [25]:
#frequency with Bag of Words -> Logistic Regression
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.model_selection import KFold
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_torch_concat_bert['Issue Title'].values.tolist()
labels = data_torch_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Split the data into training and testing sets


# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
# Create a pipeline with a CountVectorizer and a Naive Bayes classifier
model = make_pipeline(CountVectorizer(), LogisticRegression())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),'precision': 'precision_macro','recall': 'recall_macro','f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_validate(model, preprocessed_documents, labels, cv=kf, scoring=scoring)

# Display the cross-validation results
print("Cross-validation Results:")
for i in range(kf.get_n_splits()):
    print(f"Fold {i+1} - Accuracy: {cv_results['test_accuracy'][i]}, Precision: {cv_results['test_precision'][i]}, Recall: {cv_results['test_recall'][i]}, F1: {cv_results['test_f1'][i]}")

print("\nMean Accuracy:", np.mean(cv_results['test_accuracy']))
#mean precision, recall, f1
print("Mean Precision:", np.mean(cv_results['test_precision']))
print("Mean Recall:", np.mean(cv_results['test_recall']))
print("Mean F1:", np.mean(cv_results['test_f1']))



[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results:
Fold 1 - Accuracy: 0.9358974358974359, Precision: 0.9349049964813512, Recall: 0.9291666666666667, F1: 0.9318539227677791
Fold 2 - Accuracy: 0.9102564102564102, Precision: 0.9125, Recall: 0.9222222222222223, F1: 0.9098861198217527
Fold 3 - Accuracy: 0.9615384615384616, Precision: 0.9620962425840475, Recall: 0.9611842105263158, F1: 0.9614814814814814
Fold 4 - Accuracy: 0.961038961038961, Precision: 0.9612010796221322, Recall: 0.9612010796221322, F1: 0.9610389610389611
Fold 5 - Accuracy: 0.961038961038961, Precision: 0.9625, Recall: 0.9625, F1: 0.961038961038961

Mean Accuracy: 0.945954045954046
Mean Precision: 0.9466404637375062
Mean Recall: 0.9472548358074674
Mean F1: 0.945059889229787


In [26]:
#TFIDF with Bag of Words -< Naive Bayes
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_torch_concat_bert['Issue Title'].values.tolist()
labels = data_torch_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Create a pipeline with a TfidfVectorizer and a Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': 'precision_macro',
           'recall': 'recall_macro',
           'f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring=make_scorer(accuracy_score))

# Display the cross-validation results
print("Cross-validation Results:", cv_results)
print("Mean Accuracy:", np.mean(cv_results))

# Additional: Calculate and print mean values for precision, recall, and f1 score
precision_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='precision_macro')
recall_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='recall_macro')
f1_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='f1_macro')

print("\nMean Precision:", np.mean(precision_results))
print("Mean Recall:", np.mean(recall_results))
print("Mean F1 Score:", np.mean(f1_results))


[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results: [0.8974359  0.87179487 0.92307692 0.8961039  0.96103896]
Mean Accuracy: 0.9098901098901099

Mean Precision: 0.9143433037560124
Mean Recall: 0.9078044436597068
Mean F1 Score: 0.9076686026806204


In [27]:
#TFIDF with Bag of Words -> Logistic Regression
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the text from processed tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example data
documents = data_torch_concat_bert['Issue Title'].values.tolist()
labels = data_torch_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Create a pipeline with a TfidfVectorizer and a Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Define a custom scorer for cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': 'precision_macro',
           'recall': 'recall_macro',
           'f1': 'f1_macro'}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring=make_scorer(accuracy_score))

# Display the cross-validation results
print("Cross-validation Results:", cv_results)
print("Mean Accuracy:", np.mean(cv_results))

# Additional: Calculate and print mean values for precision, recall, and f1 score
precision_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='precision_macro')
recall_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='recall_macro')
f1_results = cross_val_score(model, preprocessed_documents, labels, cv=kf, scoring='f1_macro')

print("\nMean Precision:", np.mean(precision_results))
print("Mean Recall:", np.mean(recall_results))
print("Mean F1 Score:", np.mean(f1_results))


[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results: [0.88461538 0.82051282 0.91025641 0.90909091 0.90909091]
Mean Accuracy: 0.8867132867132866

Mean Precision: 0.9021041613536142
Mean Recall: 0.8870552181736391
Mean F1 Score: 0.8839120659480934


### 3. Using Word2Vec

#### 1. TensorFlow Results with title Only

In [28]:
import numpy as np
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

# Example data
documents = data_tf_concat_bert['Issue Title'].values.tolist()
labels = data_tf_concat_bert['Is Bug'].values.tolist()

# Tokenization and preprocessing
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=preprocessed_documents, vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for a document
def document_embedding(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Create document embeddings using Word2Vec
embeddings = [document_embedding(doc, word2vec_model) for doc in preprocessed_documents]

# Create a pipeline with logistic regression
model = make_pipeline(LogisticRegression())

# Define a custom scorer for cross-validation
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
    'f1': make_scorer(f1_score, average='macro')
}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(model, embeddings, labels, cv=kf, scoring=scoring)

# Display the cross-validation results
print("Cross-validation Results:")
print("Accuracy:", cv_results['test_accuracy'])
print("Precision:", cv_results['test_precision'])
print("Recall:", cv_results['test_recall'])
print("F1 Score:", cv_results['test_f1'])

# Calculate and print mean values
mean_accuracy = np.mean(cv_results['test_accuracy'])
mean_precision = np.mean(cv_results['test_precision'])
mean_recall = np.mean(cv_results['test_recall'])
mean_f1 = np.mean(cv_results['test_f1'])

print("\nMean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1)


[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cross-validation Results:
Accuracy: [0.52173913 0.53623188 0.47058824 0.5        0.45588235]
Precision: [0.66269841 0.55373303 0.73134328 0.63809524 0.22794118]
Recall: [0.55194257 0.53991597 0.51351351 0.53792502 0.5       ]
F1 Score: [0.44259486 0.50626118 0.34264232 0.41438703 0.31313131]

Mean Accuracy: 0.49688832054560955
Mean Precision: 0.5627622285041074
Mean Recall: 0.528659413852725
Mean F1 Score: 0.4038033409092924


  _warn_prf(average, modifier, msg_start, len(result))


#### 2. Pytorch Results with title Only

In [29]:
import numpy as np
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

# Example data
documents = data_torch_concat_bert['Issue Title'].values.tolist()
labels = data_torch_concat_bert['Is Bug'].values.tolist()

# Tokenization and preprocessing
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=preprocessed_documents, vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for a document
def document_embedding(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Create document embeddings using Word2Vec
embeddings = [document_embedding(doc, word2vec_model) for doc in preprocessed_documents]

# Create a pipeline with logistic regression
model = make_pipeline(LogisticRegression())

# Define a custom scorer for cross-validation
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
    'f1': make_scorer(f1_score, average='macro')
}

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(model, embeddings, labels, cv=kf, scoring=scoring)

# Display the cross-validation results
print("Cross-validation Results:")
print("Accuracy:", cv_results['test_accuracy'])
print("Precision:", cv_results['test_precision'])
print("Recall:", cv_results['test_recall'])
print("F1 Score:", cv_results['test_f1'])

# Calculate and print mean values
mean_accuracy = np.mean(cv_results['test_accuracy'])
mean_precision = np.mean(cv_results['test_precision'])
mean_recall = np.mean(cv_results['test_recall'])
mean_f1 = np.mean(cv_results['test_f1'])

print("\nMean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1)


[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation Results:
Accuracy: [0.38461538 0.42307692 0.48717949 0.55844156 0.48051948]
Precision: [0.19230769 0.21153846 0.24358974 0.76388889 0.24025974]
Recall: [0.5        0.5        0.5        0.56410256 0.5       ]
F1 Score: [0.27777778 0.2972973  0.32758621 0.45909091 0.3245614 ]

Mean Accuracy: 0.4667665667665667
Mean Precision: 0.3303169053169053
Mean Recall: 0.5128205128205128
Mean F1 Score: 0.33726271891426157


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Conclusion: 
From th results above, we can see that using Bag of Words is the best choice for classifying the bugs

##### Predicting whether the issue is buggy or not using Bag Of Words for all Issue Titles 

### Predicting PyTorch

In [9]:
import re

def get_type(string):
    # Use regular expression to find all occurrences of 'name="type:xyz"'
    matches = re.findall(r'name="([^"]+)', string)
    #convert the list to string
    matches = ' '.join(matches)
    return matches

In [31]:
df_torch = pd.read_csv('../../Issues_parser/Scraped_Data/torch_issues/Pytorch_open_issue.csv')
for i in range(0, 3):
    df_torch = pd.concat([df_torch, pd.read_csv('../../issues_parser/Scraped_Data/torch_issues/torch_issues_classified.csv_' + str(i) + '.csv')])

In [10]:
df_torch['Tags'] = df_torch['Tags'].apply(get_type)
df_torch['Issue Title'] = df_torch['Issue Title'] + ' ' + df_torch['Tags']

In [12]:
def wordBasedChecker(IssueTitle):
    '''
    Input: IssueTitle, IssueBody
    Output: True/False
    '''
    bug_keywords = {
    'error',
    'exception',
    'traceback',
    'crash',
    'issue',
    'problem',
    'unexpected',
    'incorrect',
    'not working',
    'failure',
    'flaw',
    'mistake',
    'fault',
    'glitch',
    'inconsistency',
    'abnormal',
    'unexpected behavior',
    'unhandled',
    'segmentation fault',
    'defect',
    'bug'
    }
    #Parse the title
    try:
        IssueTitle = IssueTitle.lower()
        IssueTitle = IssueTitle.split()
         #Check if any of the keywords is in the title
        for word in IssueTitle:
            if word in bug_keywords:
                return 1
        return 0
    except:
        return 0
   

In [15]:
import numpy as np

In [16]:
#Getting Predicted Labels usingwordBasedChecker
predicted_labels = []
for index, row in df_torch.iterrows():
    predicted_labels.append(wordBasedChecker(row['Issue Title']))

In [17]:
#convert the list to numpy array
predicted_labels = np.asarray(predicted_labels)
df_torch['Predicted_Is_Bug'] = predicted_labels

In [22]:
#using frequency 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    try:
        tokens = word_tokenize(text)
        tokens = [token.lower() for token in tokens]
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
        processed_text = ' '.join(tokens)
    except: 
        print(text)
        processed_text = ""
    return processed_text

# Example data
documents = data_torch_concat_bert['Issue Title'].values.tolist()
labels = data_torch_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

model = make_pipeline(CountVectorizer(), LogisticRegression())

# Fit the model on the entire dataset
model.fit(preprocessed_documents, labels)

# Make predictions on new data
new_data = df_torch['Issue Title'].values.tolist()
new_data_preprocessed = [preprocess_text(doc) for doc in new_data]
LR_prediction = model.predict(new_data_preprocessed)

[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
# Adding the predictions to the dataframe
df_torch['LR_Predicted_Is_Bug'] = LR_prediction
df_torch['Final_Is_Bug'] = df_torch['LR_Predicted_Is_Bug'] | df_torch['Predicted_Is_Bug']
df_torch['Final_Is_Bug'].value_counts()

Final_Is_Bug
True     9358
False    2862
Name: count, dtype: int64

In [24]:
df_torch

Unnamed: 0,Issue Number,Issue Title,Time created,Time closed,Number of Assignees,Number of Comments,Tags,Predicted_Is_Bug,LR_Predicted_Is_Bug,Final_Is_Bug
0,114968,Missing `.so` files when installing PyTorch 1....,2023-12-01 18:20:37+00:00,,0,0,oncall: pt2,0,False,False
1,114967,Inplace update to buffers doesn't work with `a...,2023-12-01 18:10:19+00:00,,0,0,,0,False,False
2,114966,[dynamo] dynamo does not support dataclasses w...,2023-12-01 18:04:35+00:00,,0,0,triaged module: dynamo,0,False,False
3,114964,[dynamo] missing support for function `object....,2023-12-01 18:00:39+00:00,,0,0,triaged module: dynamo,0,False,False
4,114963,[dynamo] missing support for builtin function ...,2023-12-01 17:58:57+00:00,,0,0,triaged module: dynamo,0,False,False
...,...,...,...,...,...,...,...,...,...,...
12215,634,Feature Request: NegativeSampling and Hierarch...,2017-01-29 18:30:26+00:00,,0,11,feature module: nn module: loss triaged Stale ...,0,True,True
12216,630,Add Peephole connections for LSTMs? feature tr...,2017-01-29 06:14:27+00:00,,0,18,feature triaged Stale,0,False,False
12217,499,Feature Request: Locally Connected Layer propo...,2017-01-19 10:36:23+00:00,,0,23,proposal accepted feature module: nn triaged S...,0,False,False
12218,285,Keyword arguments passed to module's __call__ ...,2016-12-01 22:42:55+00:00,,0,1,module: nn low priority triaged enhancement,0,True,True


In [25]:
df_torch['LR_Predicted_Is_Bug'].value_counts()

LR_Predicted_Is_Bug
True     8944
False    3276
Name: count, dtype: int64

In [26]:
df_torch.columns

Index(['Issue Number', 'Issue Title', 'Time created', 'Time closed',
       'Number of Assignees', 'Number of Comments', 'Tags', 'Predicted_Is_Bug',
       'LR_Predicted_Is_Bug', 'Final_Is_Bug'],
      dtype='object')

In [27]:
#In the csv file 
saved_df = df_torch[['Issue Number','Issue Title', 'Time created',
       'Time closed', 'Number of Assignees', 'Number of Comments', 'Tags', 'Final_Is_Bug']]


In [28]:
saved_df.to_csv('torch_issues_classified.csv', index=False)

#### TensorFlow Results with Issue Title

In [None]:
df_tf = pd.read_csv('../../Issues_parser/Scraped_Data/tf_issues/Tensorflow_open_issue.csv')
for i in range(0, 4):
    df_tf = pd.concat([df_tf, pd.read_csv('../../issues_parser/tf_issues/tf_issues_classified.csv_' + str(i) + '.csv')])
df_tf['Tags'] = df_tf['Tags'].apply(get_type)
df_tf['Issue Title'] = df_tf['Issue Title'] + ' ' + df_tf['Tags']

In [30]:
#Getting Predicted Labels usingwordBasedChecker
tf_predicted_labels = []
for index, row in df_tf.iterrows():
    tf_predicted_labels.append(wordBasedChecker(row['Issue Title']))

#convert the list to numpy array
tf_predicted_labels = np.asarray(tf_predicted_labels)
df_tf['Predicted_Is_Bug'] = tf_predicted_labels

In [31]:
df_tf['Predicted_Is_Bug'].value_counts()

Predicted_Is_Bug
0    1659
1     258
Name: count, dtype: int64

In [32]:
#using frequency 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    try:
        tokens = word_tokenize(text)
        tokens = [token.lower() for token in tokens]
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
        processed_text = ' '.join(tokens)
    except: 
        print(text)
        processed_text = ""
    return processed_text

# Example data
documents = data_tf_concat_bert['Issue Title'].values.tolist()
labels = data_tf_concat_bert['Is Bug'].values.tolist()

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

model = make_pipeline(CountVectorizer(), LogisticRegression())

# Fit the model on the entire dataset
model.fit(preprocessed_documents, labels)

# Make predictions on new data
new_data = df_tf['Issue Title'].values.tolist()
new_data_preprocessed = [preprocess_text(doc) for doc in new_data]
LR_prediction = model.predict(new_data_preprocessed)

[nltk_data] Downloading package stopwords to /home/mamm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mamm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# Adding the predictions to the dataframe
df_tf['LR_Predicted_Is_Bug'] = LR_prediction
df_tf['Final_Is_Bug'] = df_tf['LR_Predicted_Is_Bug'] | df_tf['Predicted_Is_Bug']
df_tf['Final_Is_Bug'].value_counts()

Final_Is_Bug
True     1844
False      73
Name: count, dtype: int64

In [34]:
#In the csv file 
saved_df = df_tf[['Issue Number','Issue Title', 'Time created',
       'Time closed', 'Number of Assignees', 'Number of Comments', 'Tags', 'Final_Is_Bug']]


In [35]:
saved_df.to_csv('tf_issues_classified.csv', index=False)