## Mozilla

In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
from pprint import pprint
import gensim
import multiprocessing

# Helper functions for data loading and preprocessing
def load_data(dataset, train=True, percent=0.8):
    '''Load (Description, Severity) rows from the CSV and return one side of a split.

    Rows missing either column, or whose description is blank once
    stripped, are discarded. The survivors are split with a fixed seed;
    `percent` is the training share, and `train` selects which side of
    the split is returned.
    '''
    frame = pd.read_csv(dataset.path, sep=',', encoding='ISO-8859-1')
    frame = frame.dropna(subset=["Severity", "Description"])

    # Normalise both columns, then keep only the rows that still have text.
    severity = frame['Severity'].astype(int)
    text = frame['Description'].astype(str).str.strip()
    has_text = text != ''
    pairs = pd.DataFrame(
        {'Description': text[has_text], 'Severity': severity[has_text]}
    ).to_numpy()

    # The seed is fixed, so every call on the same file yields the same partition.
    partitions = train_test_split(pairs, test_size=(1 - percent), random_state=42)
    return partitions[0] if train else partitions[1]

def preprocess(train_data, test_data):
    '''Generate paragraph vectors using Doc2Vec.

    Two Doc2Vec models (dm=1 with dm_concat=1, and dm=0) are trained on the
    training descriptions. Every document is represented by the
    concatenation of its vector from each model (2 x 200 values).

    Args:
        train_data: rows indexable as (description, severity).
        test_data: rows indexable as (description, severity).

    Returns:
        Tuple (X_train, Y_train, X_test, Y_test) where each X is a list of
        feature lists and each Y is the list of matching severities.

    Raises:
        ValueError: if either split yields no document, or the vocabulary
            built from the training corpus is empty.
    '''
    # _read_corpus skips rows whose description is blank. Drop the same rows
    # up front so labels stay aligned one-to-one with the documents and the
    # integer tags (row positions) remain contiguous from 0.
    train_rows = [row for row in train_data if str(row[0]).strip()]
    test_rows = [row for row in test_data if str(row[0]).strip()]

    print("Preprocessing training data...")
    train_corpus = list(_read_corpus(train_rows))
    print(f"Number of valid training documents: {len(train_corpus)}")

    print("Preprocessing test data...")
    test_corpus = list(_read_corpus(test_rows, tokens_only=True))
    print(f"Number of valid test documents: {len(test_corpus)}")

    if not train_corpus or not test_corpus:
        raise ValueError("No valid data found after preprocessing. Check the 'Description' column.")

    cores = max(1, multiprocessing.cpu_count() // 2)  # Use half the cores

    # Initialize Doc2Vec models
    model_DM = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=1, dm_concat=1)
    model_DBOW = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=0)

    # Build vocabulary
    print("Building vocabulary...")
    model_DM.build_vocab(train_corpus)
    model_DBOW.build_vocab(train_corpus)

    print(f"Vocabulary size (DM): {len(model_DM.wv.key_to_index)}")
    print(f"Vocabulary size (DBOW): {len(model_DBOW.wv.key_to_index)}")

    if not model_DM.wv.key_to_index or not model_DBOW.wv.key_to_index:
        raise ValueError("Vocabulary is empty. Check your input data or preprocessing.")

    # Train Doc2Vec models
    print("Training models...")
    model_DM.train(train_corpus, total_examples=model_DM.corpus_count, epochs=model_DM.epochs)
    model_DBOW.train(train_corpus, total_examples=model_DBOW.corpus_count, epochs=model_DBOW.epochs)

    # Training vectors are looked up by tag, i.e. the document's position.
    print("Generating training data...")
    X_train = [list(model_DM.dv[i]) + list(model_DBOW.dv[i]) for i in range(len(train_corpus))]
    Y_train = [row[1] for row in train_rows]

    # Test documents were never seen during training, so their vectors are inferred.
    print("Generating testing data...")
    X_test = [list(model_DM.infer_vector(tokens)) + list(model_DBOW.infer_vector(tokens)) for tokens in test_corpus]
    Y_test = [row[1] for row in test_rows]

    return X_train, Y_train, X_test, Y_test

def _read_corpus(data, tokens_only=False):
    '''Helper function to prepare data for Doc2Vec.

    Iterates over rows whose first element is the description text, skips
    rows whose description is blank after stripping, and tokenizes the rest
    with gensim's simple_preprocess.

    Yields:
        The token list when tokens_only is true, otherwise a TaggedDocument
        whose single tag is the document's position among the yielded
        documents (0, 1, 2, ...).
    '''
    # Count only documents that are actually emitted: callers look vectors up
    # by position (dv[0] .. dv[n-1]), so tags must not have gaps when a blank
    # row is skipped.
    doc_index = 0
    for line in data:
        description = str(line[0]).strip()  # Ensure the description is a string and trimmed
        if not description:
            continue
        tokens = gensim.utils.simple_preprocess(description)
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_index])
        doc_index += 1

# Classifier
class ASP():
    '''MLP classifier bundled with the train/test splits it is evaluated on.'''

    def __init__(self, X_train, Y_train, X_test, Y_test):
        '''Store the four splits and create the (unfitted) MLP classifier.'''
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        self.classifier = MLPClassifier(alpha=0.7, max_iter=10000)

    def fit(self):
        '''Fit the classifier on the stored training split.'''
        self.classifier.fit(self.X_train, self.Y_train)

    def predict(self):
        '''Predict the stored test split and print accuracy and weighted P/R/F.'''
        prediction = self.classifier.predict(self.X_test)
        accuracy = accuracy_score(self.Y_test, prediction)
        # zero_division=0 yields the same scores as the default setting but
        # without the UndefinedMetricWarning emitted when some class is never
        # predicted.
        prf1 = precision_recall_fscore_support(
            y_true=self.Y_test, y_pred=prediction, average='weighted', zero_division=0
        )

        print('Evaluation Metrics:')
        print(f"Accuracy: {accuracy:.4f}")
        print('Precision | Recall | F-Score')
        pprint(prf1)

# Dataset class
class Dataset:
    '''Plain record pairing a dataset file path with a project identifier.'''

    def __init__(self, path, project_id):
        self.path, self.project_id = path, project_id

# Experiment class
class Experiment:
    '''One train/evaluate run over a fixed pair of train and test splits.'''

    def __init__(self, train_data, test_data):
        self.train_data, self.test_data = train_data, test_data

    def run(self):
        '''Vectorize both splits, fit the classifier, and print its metrics.'''
        X_train, Y_train, X_test, Y_test = preprocess(self.train_data, self.test_data)
        model = ASP(X_train, Y_train, X_test, Y_test)
        model.fit()
        model.predict()

# Run the experiment on an 80/20 train-test split
a = Dataset('/content/mozilla_bug_report_data.csv', project_id=1)  # Point this at your dataset
train_data = load_data(a, train=True, percent=0.8)
test_data = load_data(a, train=False, percent=0.8)

# Timestamps bracket the run; `start` feeds the elapsed-time report below.
print(time.ctime())
start = time.time()

experiment = Experiment(train_data, test_data)
experiment.run()

print(time.ctime())
print('TOTAL RUNTIME: ', time.time() - start, 's')
print()


Tue Dec 10 21:38:06 2024
Preprocessing training data...
Number of valid training documents: 7825
Preprocessing test data...
Number of valid test documents: 1957
Building vocabulary...
Vocabulary size (DM): 27470
Vocabulary size (DBOW): 27469
Training models...
Generating training data...
Generating testing data...
Evaluation Metrics:
Accuracy: 0.7379
Precision | Recall | F-Score
(0.7195068326466643, 0.7378640776699029, 0.7278383787618734, None)
Tue Dec 10 21:42:54 2024
TOTAL RUNTIME:  287.48146080970764 s



In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
from pprint import pprint
import gensim
import multiprocessing

# Helper functions for data loading and preprocessing
def load_data(dataset, train=True, percent=0.8):
    '''Load (short_description, Severity) rows from the CSV and return one side of a split.

    Rows missing either column, or whose short description is blank once
    stripped, are discarded. The survivors are split with a fixed seed;
    `percent` is the training share, and `train` selects which side of
    the split is returned.
    '''
    frame = pd.read_csv(dataset.path, sep=',', encoding='ISO-8859-1')
    frame = frame.dropna(subset=["Severity", "short_description"])

    # Normalise both columns, then keep only the rows that still have text.
    severity = frame['Severity'].astype(int)
    text = frame['short_description'].astype(str).str.strip()
    has_text = text != ''
    pairs = pd.DataFrame(
        {'short_description': text[has_text], 'Severity': severity[has_text]}
    ).to_numpy()

    # The seed is fixed, so every call on the same file yields the same partition.
    partitions = train_test_split(pairs, test_size=(1 - percent), random_state=42)
    return partitions[0] if train else partitions[1]

def preprocess(train_data, test_data):
    '''Generate paragraph vectors using Doc2Vec.

    Two Doc2Vec models (dm=1 with dm_concat=1, and dm=0) are trained on the
    training texts. Every document is represented by the concatenation of
    its vector from each model (2 x 200 values).

    Args:
        train_data: rows indexable as (text, severity).
        test_data: rows indexable as (text, severity).

    Returns:
        Tuple (X_train, Y_train, X_test, Y_test) where each X is a list of
        feature lists and each Y is the list of matching severities.

    Raises:
        ValueError: if either split yields no document, or the vocabulary
            built from the training corpus is empty.
    '''
    # _read_corpus skips rows whose text is blank. Drop the same rows up
    # front so labels stay aligned one-to-one with the documents and the
    # integer tags (row positions) remain contiguous from 0.
    train_rows = [row for row in train_data if str(row[0]).strip()]
    test_rows = [row for row in test_data if str(row[0]).strip()]

    print("Preprocessing training data...")
    train_corpus = list(_read_corpus(train_rows))
    print(f"Number of valid training documents: {len(train_corpus)}")

    print("Preprocessing test data...")
    test_corpus = list(_read_corpus(test_rows, tokens_only=True))
    print(f"Number of valid test documents: {len(test_corpus)}")

    if not train_corpus or not test_corpus:
        raise ValueError("No valid data found after preprocessing. Check the 'Description' column.")

    cores = max(1, multiprocessing.cpu_count() // 2)  # Use half the cores

    # Initialize Doc2Vec models
    model_DM = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=1, dm_concat=1)
    model_DBOW = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=0)

    # Build vocabulary
    print("Building vocabulary...")
    model_DM.build_vocab(train_corpus)
    model_DBOW.build_vocab(train_corpus)

    print(f"Vocabulary size (DM): {len(model_DM.wv.key_to_index)}")
    print(f"Vocabulary size (DBOW): {len(model_DBOW.wv.key_to_index)}")

    if not model_DM.wv.key_to_index or not model_DBOW.wv.key_to_index:
        raise ValueError("Vocabulary is empty. Check your input data or preprocessing.")

    # Train Doc2Vec models
    print("Training models...")
    model_DM.train(train_corpus, total_examples=model_DM.corpus_count, epochs=model_DM.epochs)
    model_DBOW.train(train_corpus, total_examples=model_DBOW.corpus_count, epochs=model_DBOW.epochs)

    # Training vectors are looked up by tag, i.e. the document's position.
    print("Generating training data...")
    X_train = [list(model_DM.dv[i]) + list(model_DBOW.dv[i]) for i in range(len(train_corpus))]
    Y_train = [row[1] for row in train_rows]

    # Test documents were never seen during training, so their vectors are inferred.
    print("Generating testing data...")
    X_test = [list(model_DM.infer_vector(tokens)) + list(model_DBOW.infer_vector(tokens)) for tokens in test_corpus]
    Y_test = [row[1] for row in test_rows]

    return X_train, Y_train, X_test, Y_test

def _read_corpus(data, tokens_only=False):
    '''Helper function to prepare data for Doc2Vec.

    Iterates over rows whose first element is the text, skips rows whose
    text is blank after stripping, and tokenizes the rest with gensim's
    simple_preprocess.

    Yields:
        The token list when tokens_only is true, otherwise a TaggedDocument
        whose single tag is the document's position among the yielded
        documents (0, 1, 2, ...).
    '''
    # Count only documents that are actually emitted: callers look vectors up
    # by position (dv[0] .. dv[n-1]), so tags must not have gaps when a blank
    # row is skipped.
    doc_index = 0
    for line in data:
        description = str(line[0]).strip()  # Ensure the description is a string and trimmed
        if not description:
            continue
        tokens = gensim.utils.simple_preprocess(description)
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_index])
        doc_index += 1

# Classifier
class ASP():
    '''MLP classifier bundled with the train/test splits it is evaluated on.'''

    def __init__(self, X_train, Y_train, X_test, Y_test):
        '''Store the four splits and create the (unfitted) MLP classifier.'''
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        self.classifier = MLPClassifier(alpha=0.7, max_iter=10000)

    def fit(self):
        '''Fit the classifier on the stored training split.'''
        self.classifier.fit(self.X_train, self.Y_train)

    def predict(self):
        '''Predict the stored test split and print accuracy and weighted P/R/F.'''
        prediction = self.classifier.predict(self.X_test)
        accuracy = accuracy_score(self.Y_test, prediction)
        # zero_division=0 yields the same scores as the default setting but
        # without the UndefinedMetricWarning emitted when some class is never
        # predicted.
        prf1 = precision_recall_fscore_support(
            y_true=self.Y_test, y_pred=prediction, average='weighted', zero_division=0
        )

        print('Evaluation Metrics:')
        print(f"Accuracy: {accuracy:.4f}")
        print('Precision | Recall | F-Score')
        pprint(prf1)

# Dataset class
class Dataset:
    '''Plain record pairing a dataset file path with a project identifier.'''

    def __init__(self, path, project_id):
        self.path, self.project_id = path, project_id

# Experiment class
class Experiment:
    '''One train/evaluate run over a fixed pair of train and test splits.'''

    def __init__(self, train_data, test_data):
        self.train_data, self.test_data = train_data, test_data

    def run(self):
        '''Vectorize both splits, fit the classifier, and print its metrics.'''
        X_train, Y_train, X_test, Y_test = preprocess(self.train_data, self.test_data)
        model = ASP(X_train, Y_train, X_test, Y_test)
        model.fit()
        model.predict()

# Run the experiment on an 80/20 train-test split
a = Dataset('/content/mozilla_bug_report_data.csv', project_id=1)  # Point this at your dataset
train_data = load_data(a, train=True, percent=0.8)
test_data = load_data(a, train=False, percent=0.8)

# Timestamps bracket the run; `start` feeds the elapsed-time report below.
print(time.ctime())
start = time.time()

experiment = Experiment(train_data, test_data)
experiment.run()

print(time.ctime())
print('TOTAL RUNTIME: ', time.time() - start, 's')
print()


Sun Dec 15 04:54:39 2024
Preprocessing training data...
Number of valid training documents: 7997
Preprocessing test data...
Number of valid test documents: 2000
Building vocabulary...
Vocabulary size (DM): 8948
Vocabulary size (DBOW): 8947
Training models...
Generating training data...
Generating testing data...
Evaluation Metrics:
Accuracy: 0.8235
Precision | Recall | F-Score
(0.7158100085230732, 0.8235, 0.7643457209738342, None)
Sun Dec 15 04:56:22 2024
TOTAL RUNTIME:  103.02941250801086 s



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Cross product

In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
from pprint import pprint
import gensim
import multiprocessing
from imblearn.over_sampling import RandomOverSampler

# Helper functions for data loading and preprocessing
def load_data(dataset, train=True, test_product='CORE'):
    '''Reads in and formats data from the dataset based on product_name split.

    Rows missing Severity, Description or product_name, and rows whose
    description is blank after stripping, are discarded.

    Args:
        dataset: object whose `path` attribute is the CSV file to read.
        train: when true return the training rows, otherwise the test rows.
        test_product: product_name value held out as the test set; every
            other product forms the training set.

    Returns:
        2-column array of (Description, Severity) rows for the requested split.
    '''
    df = pd.read_csv(dataset.path, sep=',', encoding='ISO-8859-1')
    df = df.dropna(subset=["Severity", "Description", "product_name"])  # Drop rows with missing columns
    df['Severity'] = df['Severity'].astype(int)  # Convert Severity to int
    df['Description'] = df['Description'].astype(str).str.strip()  # Ensure Description is a string

    # Filter out invalid rows
    df = df[df['Description'] != '']

    # Split based on product_name; only the requested side is materialised.
    is_test = df['product_name'] == test_product
    selected = df[~is_test] if train else df[is_test]
    return selected[['Description', 'Severity']].to_numpy()

def preprocess(train_data, test_data):
    '''Generate paragraph vectors using Doc2Vec.

    Two Doc2Vec models (dm=1 with dm_concat=1, and dm=0) are trained on the
    training descriptions. Every document is represented by the
    concatenation of its vector from each model (2 x 200 values). The
    training set is then randomly oversampled (fixed seed); the test set
    is left as is.

    Args:
        train_data: rows indexable as (description, severity).
        test_data: rows indexable as (description, severity).

    Returns:
        Tuple (X_train_balanced, Y_train_balanced, X_test, Y_test).

    Raises:
        ValueError: if either split yields no document, or the vocabulary
            built from the training corpus is empty.
    '''
    # _read_corpus skips rows whose description is blank. Drop the same rows
    # up front so labels stay aligned one-to-one with the documents and the
    # integer tags (row positions) remain contiguous from 0.
    train_rows = [row for row in train_data if str(row[0]).strip()]
    test_rows = [row for row in test_data if str(row[0]).strip()]

    print("Preprocessing training data...")
    train_corpus = list(_read_corpus(train_rows))
    print(f"Number of valid training documents: {len(train_corpus)}")

    print("Preprocessing test data...")
    test_corpus = list(_read_corpus(test_rows, tokens_only=True))
    print(f"Number of valid test documents: {len(test_corpus)}")

    if not train_corpus or not test_corpus:
        raise ValueError("No valid data found after preprocessing. Check the input data.")

    cores = max(1, multiprocessing.cpu_count() // 2)  # Use half the cores

    # Initialize Doc2Vec models
    model_DM = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=1, dm_concat=1)
    model_DBOW = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=0)

    # Build vocabulary
    print("Building vocabulary...")
    model_DM.build_vocab(train_corpus)
    model_DBOW.build_vocab(train_corpus)

    print(f"Vocabulary size (DM): {len(model_DM.wv.key_to_index)}")
    print(f"Vocabulary size (DBOW): {len(model_DBOW.wv.key_to_index)}")

    if not model_DM.wv.key_to_index or not model_DBOW.wv.key_to_index:
        raise ValueError("Vocabulary is empty. Check your input data or preprocessing.")

    # Train Doc2Vec models
    print("Training models...")
    model_DM.train(train_corpus, total_examples=model_DM.corpus_count, epochs=model_DM.epochs)
    model_DBOW.train(train_corpus, total_examples=model_DBOW.corpus_count, epochs=model_DBOW.epochs)

    # Training vectors are looked up by tag, i.e. the document's position.
    print("Generating training data...")
    X_train = [list(model_DM.dv[i]) + list(model_DBOW.dv[i]) for i in range(len(train_corpus))]
    Y_train = [row[1] for row in train_rows]

    # Oversample minority classes in training data
    print("Balancing training data...")
    ros = RandomOverSampler(random_state=42)
    X_train_balanced, Y_train_balanced = ros.fit_resample(X_train, Y_train)

    # Test documents were never seen during training, so their vectors are inferred.
    print("Generating testing data...")
    X_test = [list(model_DM.infer_vector(tokens)) + list(model_DBOW.infer_vector(tokens)) for tokens in test_corpus]
    Y_test = [row[1] for row in test_rows]

    return X_train_balanced, Y_train_balanced, X_test, Y_test

def _read_corpus(data, tokens_only=False):
    '''Helper function to prepare data for Doc2Vec.

    Iterates over rows whose first element is the description text, skips
    rows whose description is blank after stripping, and tokenizes the rest
    with gensim's simple_preprocess.

    Yields:
        The token list when tokens_only is true, otherwise a TaggedDocument
        whose single tag is the document's position among the yielded
        documents (0, 1, 2, ...).
    '''
    # Count only documents that are actually emitted: callers look vectors up
    # by position (dv[0] .. dv[n-1]), so tags must not have gaps when a blank
    # row is skipped.
    doc_index = 0
    for line in data:
        description = str(line[0]).strip()  # Ensure the description is a string and trimmed
        if not description:
            continue
        tokens = gensim.utils.simple_preprocess(description)
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_index])
        doc_index += 1

# Classifier
class ASP():
    '''MLP classifier bundled with the train/test splits it is evaluated on.'''

    def __init__(self, X_train, Y_train, X_test, Y_test):
        '''Store the four splits and create the (unfitted) MLP classifier.'''
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        self.classifier = MLPClassifier(alpha=0.7, max_iter=10000)

    def fit(self):
        '''Fit the classifier on the stored training split.'''
        self.classifier.fit(self.X_train, self.Y_train)

    def predict(self):
        '''Predict the stored test split and print accuracy and weighted P/R/F.'''
        prediction = self.classifier.predict(self.X_test)
        accuracy = accuracy_score(self.Y_test, prediction)
        # zero_division=0 yields the same scores as the default setting but
        # without the UndefinedMetricWarning emitted when some class is never
        # predicted.
        prf1 = precision_recall_fscore_support(
            y_true=self.Y_test, y_pred=prediction, average='weighted', zero_division=0
        )

        print('Evaluation Metrics:')
        print(f"Accuracy: {accuracy:.4f}")
        print('Precision | Recall | F-Score')
        pprint(prf1)

# Dataset class
class Dataset:
    '''Plain record pairing a dataset file path with a project identifier.'''

    def __init__(self, path, project_id):
        self.path, self.project_id = path, project_id

# Experiment class
class Experiment:
    '''One train/evaluate run over a fixed pair of train and test splits.'''

    def __init__(self, train_data, test_data):
        self.train_data, self.test_data = train_data, test_data

    def run(self):
        '''Vectorize both splits, fit the classifier, and print its metrics.'''
        X_train, Y_train, X_test, Y_test = preprocess(self.train_data, self.test_data)
        model = ASP(X_train, Y_train, X_test, Y_test)
        model.fit()
        model.predict()

# Run the experiment with the CORE product held out as the test set
a = Dataset('/content/mozilla_bug_report_data.csv', project_id=1)  # Point this at your dataset
train_data = load_data(a, train=True)
test_data = load_data(a, train=False)

# Timestamps bracket the run; `start` feeds the elapsed-time report below.
print(time.ctime())
start = time.time()

experiment = Experiment(train_data, test_data)
experiment.run()

print(time.ctime())
print('TOTAL RUNTIME: ', time.time() - start, 's')
print()


Mon Jan 27 19:01:35 2025
Preprocessing training data...
Number of valid training documents: 6893
Preprocessing test data...
Number of valid test documents: 2889
Building vocabulary...
Vocabulary size (DM): 24295
Vocabulary size (DBOW): 24294
Training models...
Generating training data...
Balancing training data...
Generating testing data...
Evaluation Metrics:
Accuracy: 0.6383
Precision | Recall | F-Score
(0.7320354587508161, 0.6382831429560402, 0.6807283470053496, None)
Mon Jan 27 19:07:20 2025
TOTAL RUNTIME:  344.5559241771698 s



# Firefox

In [None]:
# Sanity check: does any description text appear in both splits?
train_descriptions = set(train_data[:, 0])  # distinct training descriptions
test_descriptions = set(test_data[:, 0])    # distinct test descriptions

# Texts common to both sets
overlap = train_descriptions & test_descriptions
print(f"Number of overlapping descriptions: {len(overlap)}")
if overlap:
    print("Overlapping Descriptions:", overlap)
else:
    print("No overlap between training and test data.")


Number of overlapping descriptions: 0
No overlap between training and test data.


In [None]:
# Per-class row counts for each split (column 1 of every row is the severity).
print("Training set Severity distribution:", pd.Series([row[1] for row in train_data]).value_counts(), sep="\n")
print("Test set Severity distribution:", pd.Series([row[1] for row in test_data]).value_counts(), sep="\n")


Training set Severity distribution:
2    7514
4     690
5     583
1     259
6     194
Name: count, dtype: int64
Test set Severity distribution:
2    439
4     45
1     32
5     18
6      8
Name: count, dtype: int64


In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
from pprint import pprint
import gensim
import multiprocessing
from imblearn.over_sampling import RandomOverSampler

# Helper functions for data loading and preprocessing
def load_data(dataset, train=True, test_product='FIREFOX'):
    '''Reads in and formats data from the dataset based on product_name split.

    Rows missing Severity, Description or product_name, and rows whose
    description is blank after stripping, are discarded.

    Args:
        dataset: object whose `path` attribute is the CSV file to read.
        train: when true return the training rows, otherwise the test rows.
        test_product: product_name value held out as the test set; every
            other product forms the training set.

    Returns:
        2-column array of (Description, Severity) rows for the requested split.
    '''
    df = pd.read_csv(dataset.path, sep=',', encoding='ISO-8859-1')
    df = df.dropna(subset=["Severity", "Description", "product_name"])  # Drop rows with missing columns
    df['Severity'] = df['Severity'].astype(int)  # Convert Severity to int
    df['Description'] = df['Description'].astype(str).str.strip()  # Ensure Description is a string

    # Filter out invalid rows
    df = df[df['Description'] != '']

    # Split based on product_name; only the requested side is materialised.
    is_test = df['product_name'] == test_product
    selected = df[~is_test] if train else df[is_test]
    return selected[['Description', 'Severity']].to_numpy()

def preprocess(train_data, test_data):
    '''Generate paragraph vectors using Doc2Vec.

    Two Doc2Vec models (dm=1 with dm_concat=1, and dm=0) are trained on the
    training descriptions. Every document is represented by the
    concatenation of its vector from each model (2 x 200 values). The
    training set is then randomly oversampled (fixed seed); the test set
    is left as is.

    Args:
        train_data: rows indexable as (description, severity).
        test_data: rows indexable as (description, severity).

    Returns:
        Tuple (X_train_balanced, Y_train_balanced, X_test, Y_test).

    Raises:
        ValueError: if either split yields no document, or the vocabulary
            built from the training corpus is empty.
    '''
    # _read_corpus skips rows whose description is blank. Drop the same rows
    # up front so labels stay aligned one-to-one with the documents and the
    # integer tags (row positions) remain contiguous from 0.
    train_rows = [row for row in train_data if str(row[0]).strip()]
    test_rows = [row for row in test_data if str(row[0]).strip()]

    print("Preprocessing training data...")
    train_corpus = list(_read_corpus(train_rows))
    print(f"Number of valid training documents: {len(train_corpus)}")

    print("Preprocessing test data...")
    test_corpus = list(_read_corpus(test_rows, tokens_only=True))
    print(f"Number of valid test documents: {len(test_corpus)}")

    if not train_corpus or not test_corpus:
        raise ValueError("No valid data found after preprocessing. Check the input data.")

    cores = max(1, multiprocessing.cpu_count() // 2)  # Use half the cores

    # Initialize Doc2Vec models
    model_DM = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=1, dm_concat=1)
    model_DBOW = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=100, workers=cores, dm=0)

    # Build vocabulary
    print("Building vocabulary...")
    model_DM.build_vocab(train_corpus)
    model_DBOW.build_vocab(train_corpus)

    print(f"Vocabulary size (DM): {len(model_DM.wv.key_to_index)}")
    print(f"Vocabulary size (DBOW): {len(model_DBOW.wv.key_to_index)}")

    if not model_DM.wv.key_to_index or not model_DBOW.wv.key_to_index:
        raise ValueError("Vocabulary is empty. Check your input data or preprocessing.")

    # Train Doc2Vec models
    print("Training models...")
    model_DM.train(train_corpus, total_examples=model_DM.corpus_count, epochs=model_DM.epochs)
    model_DBOW.train(train_corpus, total_examples=model_DBOW.corpus_count, epochs=model_DBOW.epochs)

    # Training vectors are looked up by tag, i.e. the document's position.
    print("Generating training data...")
    X_train = [list(model_DM.dv[i]) + list(model_DBOW.dv[i]) for i in range(len(train_corpus))]
    Y_train = [row[1] for row in train_rows]

    # Oversample minority classes in training data
    print("Balancing training data...")
    ros = RandomOverSampler(random_state=42)
    X_train_balanced, Y_train_balanced = ros.fit_resample(X_train, Y_train)

    # Test documents were never seen during training, so their vectors are inferred.
    print("Generating testing data...")
    X_test = [list(model_DM.infer_vector(tokens)) + list(model_DBOW.infer_vector(tokens)) for tokens in test_corpus]
    Y_test = [row[1] for row in test_rows]

    return X_train_balanced, Y_train_balanced, X_test, Y_test

def _read_corpus(data, tokens_only=False):
    '''Helper function to prepare data for Doc2Vec.

    Iterates over rows whose first element is the description text, skips
    rows whose description is blank after stripping, and tokenizes the rest
    with gensim's simple_preprocess.

    Yields:
        The token list when tokens_only is true, otherwise a TaggedDocument
        whose single tag is the document's position among the yielded
        documents (0, 1, 2, ...).
    '''
    # Count only documents that are actually emitted: callers look vectors up
    # by position (dv[0] .. dv[n-1]), so tags must not have gaps when a blank
    # row is skipped.
    doc_index = 0
    for line in data:
        description = str(line[0]).strip()  # Ensure the description is a string and trimmed
        if not description:
            continue
        tokens = gensim.utils.simple_preprocess(description)
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_index])
        doc_index += 1

# Classifier
class ASP():
    '''MLP classifier bundled with the train/test splits it is evaluated on.'''

    def __init__(self, X_train, Y_train, X_test, Y_test):
        '''Store the four splits and create the (unfitted) MLP classifier.'''
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        self.classifier = MLPClassifier(alpha=0.7, max_iter=10000)

    def fit(self):
        '''Fit the classifier on the stored training split.'''
        self.classifier.fit(self.X_train, self.Y_train)

    def predict(self):
        '''Predict the stored test split and print accuracy and weighted P/R/F.'''
        prediction = self.classifier.predict(self.X_test)
        accuracy = accuracy_score(self.Y_test, prediction)
        # zero_division=0 yields the same scores as the default setting but
        # without the UndefinedMetricWarning emitted when some class is never
        # predicted.
        prf1 = precision_recall_fscore_support(
            y_true=self.Y_test, y_pred=prediction, average='weighted', zero_division=0
        )

        print('Evaluation Metrics:')
        print(f"Accuracy: {accuracy:.4f}")
        print('Precision | Recall | F-Score')
        pprint(prf1)

# Dataset class
class Dataset:
    '''Plain record pairing a dataset file path with a project identifier.'''

    def __init__(self, path, project_id):
        self.path, self.project_id = path, project_id

# Experiment class
class Experiment:
    '''One train/evaluate run over a fixed pair of train and test splits.'''

    def __init__(self, train_data, test_data):
        self.train_data, self.test_data = train_data, test_data

    def run(self):
        '''Vectorize both splits, fit the classifier, and print its metrics.'''
        X_train, Y_train, X_test, Y_test = preprocess(self.train_data, self.test_data)
        model = ASP(X_train, Y_train, X_test, Y_test)
        model.fit()
        model.predict()

# Run the experiment with the FIREFOX product held out as the test set
a = Dataset('/content/mozilla_bug_report_data.csv', project_id=1)  # Point this at your dataset
train_data = load_data(a, train=True)
test_data = load_data(a, train=False)

# Timestamps bracket the run; `start` feeds the elapsed-time report below.
print(time.ctime())
start = time.time()

experiment = Experiment(train_data, test_data)
experiment.run()

print(time.ctime())
print('TOTAL RUNTIME: ', time.time() - start, 's')
print()


Mon Jan 27 18:54:02 2025
Preprocessing training data...
Number of valid training documents: 9240
Preprocessing test data...
Number of valid test documents: 542
Building vocabulary...
Vocabulary size (DM): 30768
Vocabulary size (DBOW): 30767
Training models...
Generating training data...
Balancing training data...
Generating testing data...
Evaluation Metrics:
Accuracy: 0.6716
Precision | Recall | F-Score
(0.7316051156879086, 0.6715867158671587, 0.6973252146490013, None)
Mon Jan 27 18:58:36 2025
TOTAL RUNTIME:  273.7646908760071 s

