# **Installing the packages**

In [1]:
! pip install pandas numpy plotly scikit-learn matplotlib



# **Importing libraries**

In [2]:
# Mount the drive if not mounted
from google.colab import drive
drive.mount("/content/drive/")

from joblib import parallel_backend

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report, accuracy_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive/


# **Loading and Pre-processing data**

In [3]:
df = pd.read_csv("/content/drive/MyDrive/AA-Tutorial/data/Agora.csv", encoding='ISO-8859-1')
# Renaming all the features of the dataframe
df = df.rename(str.strip, axis='columns')
# Merging the Item and Item Description using a [SEP] token
separator = ' [SEP] '
df['TEXT'] = df.apply(lambda row: f"{row['Item']}{separator}{row['Item Description']}", axis=1)
# dropping Unncessary columns
df.drop(columns=["Item", "Item Description", "Category", "Price", "Origin", "Destination", "Rating", "Remarks"], inplace=True)
# Assuming that vendors Amsterdam100 and amsterdam100 are the same vendors
df.Vendor = df.Vendor.apply(lambda x: x.lower())
# Getting all unique vendor handles from the 'Vendor' column.
unique_vendors = df['Vendor'].unique()

# Assigning vendor IDs to vendor handles using a dictionary comprehension.
# This approach eliminates the need for checking if a vendor already exists in the dictionary,
# as each unique vendor will be processed once. The enumerate function provides a counter (idx),
# which is used to assign IDs, starting from 1 for the first vendor.
vendor_to_idx_dict = {vendor: idx + 1 for idx, vendor in enumerate(unique_vendors)}

# Updating the 'Vendor' column in the DataFrame to reflect the vendor IDs.
# The 'map' function is used to replace each vendor handle with its corresponding vendor ID
# based on the 'vendor_to_idx_dict'. This operation is vectorized and efficient.
df['Vendor'] = df['Vendor'].map(vendor_to_idx_dict)

In [4]:
df.shape

(109689, 2)

Due to the extensive time required to train on over 100K+ samples, we have decided to limit our analysis to a subset of 5K samples. To get these samples, we look into vendors that have 5+ advertisements and then allocate all the vendors that have less than 5 ads into a new class, "others".  

In [5]:
df = df.iloc[:5000]

In [6]:
# Assigning a vendor ID to "others" class
vendor_to_idx_dict["others"] = len(vendor_to_idx_dict) + 1
# Calculate advertisement frequency for each vendor
ad_freq = df['Vendor'].value_counts()
# Filter vendors with ad frequency less than 5
vendors_to_replace = ad_freq[ad_freq < 5].index
# Update DataFrame: Replace vendor names with 'others' where ad frequency is less than 5
df['Vendor'] = df['Vendor'].apply(lambda x: vendor_to_idx_dict['others'] if x in vendors_to_replace else x)

# **Visualizing data**

In [7]:
def generate_freq_distribution(df, column_name):
    """
    This function takes a DataFrame and a column name as inputs and generates a bar plot of the normalized frequency distribution of the values in the specified column using Plotly.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the data.
    - column_name (str): The name of the column to analyze for value frequencies.

    The function calculates the normalized frequencies, converts them to a DataFrame, and then plots them in a bar chart. The title of the chart includes the total number of unique entries in the column.
    """

    # Calculate the normalized frequency of the specified column
    # and count the number of unique entries
    freq = df[column_name].value_counts(normalize=True)
    nr_entries = df[column_name].nunique()

    # Convert the frequency Series to a DataFrame for plotting
    # Setting the column names to the name of the analyzed column and 'frequency'
    freq_df = freq.reset_index()
    freq_df.columns = [column_name, 'frequency']

    # Create the plot using Plotly Express
    # The x-axis represents the unique values in the column
    # The y-axis represents the normalized frequency of each value
    # The plot includes a title that states the total number of unique entries
    fig = px.bar(freq_df, x=column_name, y='frequency',
                 labels={'frequency': 'Normalized Frequency'},
                 title=f'Total number of Entries: {nr_entries}')
    # Display the plot in the output
    fig.show()

In [9]:
generate_freq_distribution(df, "Vendor")

In [10]:
# Calculating the sentence length by splitting tokens by ' '
df['SENT_LEN'] = df['TEXT'].apply(lambda x: len(x.split(" ")))

In [11]:
# Visualizing sentence length
fig = px.violin(df, y="SENT_LEN", box=True, # draw box plot inside the violin
                points='all', # can be 'outliers', or False
               )
fig.show()

# **Splitting data**

In [None]:
train_df, temp_df = train_test_split(df, test_size=0.25, random_state=1111)
# Calculate the proportion of test size in the temporary dataset
test_size_in_temp = 0.20 / (0.20 + 0.05)
# Now split the temporary set into test and validation sets
test_df, val_df = train_test_split(temp_df, test_size=test_size_in_temp, random_state=1111)

print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Validation set size: {len(val_df)}")

Training set size: 3750
Test set size: 250
Validation set size: 1000


# **Model Evaluation**

In [None]:
# This function evaluates the performance of a trained model on a test dataset.
def evaluateBoWModels(trained_model, test_data):
  # Extract the 'Vendor' column from the test dataset and convert it to a list.
  # This list contains the actual labels/classes for each test instance.
  vendors = test_data['Vendor'].to_list()
  # Extract the 'TEXT' column from the test dataset and convert it to a list.
  # This list contains the input data (text) for making predictions.
  text = test_data['TEXT'].to_list()

  # Use the trained model to make predictions on the input text from the test dataset.
  predictions = trained_model.predict(text)

  # Calculate the macro-averaged F1 score, which calculates metrics for each label,
  # and finds their unweighted mean. This does not take label imbalance into account.
  macro_f1 = f1_score(vendors, predictions, average='macro')
  # Calculate the micro-averaged F1 score, which aggregates the contributions of all classes
  # to compute the average metric. This accounts for label imbalance.
  micro_f1 = f1_score(vendors, predictions, average='micro')
  # Calculate the weighted-averaged F1 score, which calculates metrics for each label,
  # and finds their average weighted by support (the number of true instances for each label).
  # This alters 'macro' to account for label imbalance.
  weighted_f1 = f1_score(vendors, predictions, average='weighted')
  # Calculate the balanced accuracy, which is the average of recall obtained on each class.
  # This metric is useful for dealing with imbalanced datasets.
  accuracy = balanced_accuracy_score(vendors, predictions)

  # Return the calculated metrics: accuracy, weighted F1 score, micro F1 score, and macro F1 score.
  return accuracy, weighted_f1, micro_f1, macro_f1

# This function generates and prints a detailed classification report
# for the predictions made by a trained model on the test dataset.
def generate_classification_report(trained_model, test_data):
  # Extract the 'Vendor' column from the test dataset and convert it to a list.
  # This list contains the actual labels/classes for each test instance.
  vendors = test_data['Vendor'].to_list()

  # Extract the 'TEXT' column from the test dataset and convert it to a list.
  # This list contains the input data (text) for making predictions.
  text = test_data['TEXT'].to_list()

  # Use the trained model to make predictions on the input text from the test dataset.
  predictions = trained_model.predict(text)

  # Generate a classification report comparing the actual labels to the predicted labels.
  # The report includes main classification metrics: precision, recall, f1-score for each class,
  # along with a weighted average of these metrics. The 'digits=4' parameter formats the output
  # to display numbers with four decimal places for precision.
  print(classification_report(np.array(vendors), np.array(predictions), digits=4))

# **Model Architecture**

In [None]:
class BoWModel(object):

    def __init__(self, train_data, test_data, stats_model_type, n_splits=2):
        self.train_data = train_data
        self.test_data = test_data

        # Initializing a KFold cross-validator with a specified number of splits,
        # enabling shuffling to randomize the splits and setting a random state for reproducibility.
        # Defining a scoring dictionary with accuracy as the metric to be used for model evaluation.
        self.cv = KFold(n_splits=n_splits, shuffle=True, random_state=1111)
        self.scoring = {'Accuracy': make_scorer(accuracy_score)}

        # Specifying the parameters to be tuned during the hyperparameter optimization process.
        # Includes the n-gram range for the CountVectorizer and the use of idf for the TfidfTransformer.
        self.parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False)}
        # Storing the type of statistical model to be used, as specified during instantiation.
        self.stats_model_type = stats_model_type

        # Extracting the text corpus (titles) and the target variable (vendors) from the training data.
        self.train_corpus = list(self.train_data['TEXT'])
        self.train_vendor = list(self.train_data['Vendor'])

    def train_models(self):
      # Conditional blocks to handle different types of statistical models.
      # Each block initializes a model pipeline with a CountVectorizer, TfidfTransformer,
      # and the specified classifier, followed by a GridSearchCV for hyperparameter optimization.
      # Controlling the backend that joblib will use
      with parallel_backend('threading', n_jobs=-1):
          # Depending on the stats_model_type, different classifiers are used:
          # MultinomialNB, SVC, RandomForestClassifier, LogisticRegression, or MLPClassifier.
          # For each classifier, a GridSearchCV object is created with the specified parameters,
          # cross-validator, scoring metrics, and verbosity. The GridSearchCV is then fitted with
          # the training data.

          # After fitting, model evaluation metrics (accuracy, weighted F1, micro F1, macro F1) are calculated
          # using a separate function, `evaluateBoWModels`, which is not defined in this snippet but is assumed
          # to take the fitted model and test data as input to compute the metrics.

          if self.stats_model_type == 'MultinomialNB':
            # Multinomial-NB
            clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
            gs_clf = GridSearchCV(clf, self.parameters, n_jobs=10, cv=self.cv, return_train_score=True, verbose=1, scoring=self.scoring, refit='Accuracy')
            gs_clf = gs_clf.fit(self.train_corpus, self.train_vendor)
            accuracy, weighted_f1, micro_f1, macro_f1 = evaluateBoWModels(gs_clf, self.test_data)

          elif self.stats_model_type == 'SVC':
            # Support Vector Machine
            clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(class_weight='balanced', kernel="linear"))])
            gs_clf = GridSearchCV(clf, self.parameters, n_jobs=10, cv=self.cv, return_train_score=True, verbose=1, scoring=self.scoring, refit='Accuracy')
            gs_clf = gs_clf.fit(self.train_corpus, self.train_vendor)
            accuracy, weighted_f1, micro_f1, macro_f1 = evaluateBoWModels(gs_clf, self.test_data)

          elif self.stats_model_type == 'RandomForestClassifier':
            # Random Forest
            clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier(class_weight='balanced', n_estimators=100, max_depth=5, random_state=1))])
            gs_clf = GridSearchCV(clf, self.parameters, n_jobs=10, cv=self.cv, return_train_score=True, verbose=1, scoring=self.scoring, refit='Accuracy')
            gs_clf = gs_clf.fit(self.train_corpus, self.train_vendor)
            accuracy, weighted_f1, micro_f1, macro_f1 = evaluateBoWModels(gs_clf, self.test_data)

          elif self.stats_model_type == 'LogisticRegression':
            # Logistic Regression
            clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(class_weight='balanced', random_state=1))])
            gs_clf = GridSearchCV(clf, self.parameters, n_jobs=10, cv=self.cv, return_train_score=True, verbose=1, scoring=self.scoring, refit='Accuracy')
            gs_clf = gs_clf.fit(self.train_corpus, self.train_vendor)
            accuracy, weighted_f1, micro_f1, macro_f1 = evaluateBoWModels(gs_clf, self.test_data)

          elif self.stats_model_type == 'MLPClassifier':
            # Multilayer Perceptron Model
            clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MLPClassifier(hidden_layer_sizes=(100,100,100), random_state=1))])
            gs_clf = GridSearchCV(clf, self.parameters, n_jobs=10, cv=self.cv, return_train_score=True, verbose=1, scoring=self.scoring, refit='Accuracy')
            gs_clf = gs_clf.fit(self.train_corpus, self.train_vendor)
            accuracy, weighted_f1, micro_f1, macro_f1 = evaluateBoWModels(gs_clf, self.test_data)

          else:
            # If an unrecognized stats_model_type is specified, an exception is raised.
            raise Exception("--stats_model_type can only be one amongst MultinomialNB, MLPClassifier, LogisticRegression, RandomForestClassifier, and SVC")

      return clf, accuracy, weighted_f1, micro_f1, macro_f1

# **Training Models**

In [None]:
scores_df = pd.DataFrame()
scores_df["Metrics"] = ['Accuracy','Weighted-F1','Micro-F1','Macro-F1']

[Multinomial Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)

In [None]:
_, accuracy, weighted_f1, micro_f1, macro_f1 = BoWModel(train_df, test_df, "MultinomialNB").train_models()
scores_df["MultinomialNB"] = [accuracy, weighted_f1, micro_f1, macro_f1]

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Suport Vector Machine](https://en.wikipedia.org/wiki/Support_vector_machine)

---



In [None]:
_, accuracy, weighted_f1, micro_f1, macro_f1 = BoWModel(train_df, test_df, "SVC").train_models()
scores_df["SVC"] = [accuracy, weighted_f1, micro_f1, macro_f1]

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Random Forest](https://en.wikipedia.org/wiki/Random_forest)

In [None]:
_, accuracy, weighted_f1, micro_f1, macro_f1 = BoWModel(train_df, test_df, "RandomForestClassifier").train_models()
scores_df["RandomForest"] = [accuracy, weighted_f1, micro_f1, macro_f1]

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Logistic Regression](https://en.wikipedia.org/wiki/Logistic_regression)

In [None]:
_, accuracy, weighted_f1, micro_f1, macro_f1 = BoWModel(train_df, test_df, "LogisticRegression").train_models()
scores_df["LogisticRegression"] = [accuracy, weighted_f1, micro_f1, macro_f1]

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Multilayer perceptron](https://en.wikipedia.org/wiki/Multilayer_perceptron)

In [None]:
_, accuracy, weighted_f1, micro_f1, macro_f1 = BoWModel(train_df, test_df, "MLPClassifier").train_models()
scores_df["MLP"] = [accuracy, weighted_f1, micro_f1, macro_f1]

Fitting 2 folds for each of 4 candidates, totalling 8 fits


Checking Model Performance

In [None]:
scores_df

Unnamed: 0,Metrics,MultinomialNB,SVC,RandomForest,LogisticRegression,MLP
0,Accuracy,0.331297,0.682322,0.412122,0.775756,0.676682
1,Weighted-F1,0.524405,0.708481,0.34429,0.662284,0.752288
2,Micro-F1,0.596,0.7,0.344,0.668,0.756
3,Macro-F1,0.337436,0.671945,0.283832,0.590027,0.646615


# **Storing results**

In [None]:
scores_df.to_csv('/content/drive/MyDrive/AA-Tutorial/data/results.csv', encoding='utf-8', index=False)