## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Run if you run on colab
%pip install transformers
%pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
^C
Traceback (most recent call last):
  File "/Users/marianluca/anaconda3/envs/env_pytorch/lib/python3.9/runpy.py", line 188, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/Users/marianluca/anaconda3/envs/env_pytorch/lib/python3.9/runpy.py", line 147, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "/Users/marianluca/anaconda3/envs/env_pytorch/lib/python3.9/runpy.py", line 111, in _get_module_details
    __import__(pkg_name)
  File "/Users/marianluca/anaconda3/envs/env_pytorch/lib/python3.9/site-packages/spacy/__init__.py", line 6, in <module>
  File "/Users/marianluca/anaconda3/envs/env_pytorch/lib/python3.9/site-packages/spacy/errors.py", line 3, in <module>
    from .compat import Literal
  File "/Users/marianluca/anaconda3/envs/env_pytorch/lib/python3.9/site-packag

In [5]:
from transformers import AutoTokenizer, AutoModel, BertModel
from torch import nn
import torch
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
import torch.nn.functional as F

from sklearn.preprocessing import QuantileTransformer

In [6]:
# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.compose import ColumnTransformer

from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
)
from sklearn.metrics import balanced_accuracy_score

import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [8]:
import spacy, re

nlp = spacy.load("en_core_web_sm")

def remove_mentions_and_hashtags(text):
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    return text


def remove_numbers(text):
    text = re.sub(r"\d+", "", text)
    return text


def remove_punctuation(text):

    translator = str.maketrans("", "", string.punctuation)

    # Remove punctuation using the translation table
    text_without_punct = text.translate(translator)

    return text_without_punct


def remove_stopwords(text):
    filtered_sentence = []
    doc = nlp(text)
    for token in doc:
        if token.is_stop == False:
            filtered_sentence.append(token.text)
    return " ".join(filtered_sentence)


def clean_text(text,to_lemmatize:bool = True):

    # Standardize text
    # text = standardize_accented_chars(text)

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Remove mentions and hashtags
    text = remove_mentions_and_hashtags(text)

    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = remove_punctuation(text)

    # Remove numbers
    text = remove_numbers(text)

    # Remove all the special characters
    text = re.sub(r"\W", " ", text)

    # Remove stopwords
    text = remove_stopwords(text)

    # Substituting multiple spaces with single space
    text = re.sub(r"\s+", " ", text, flags=re.I)

    if to_lemmatize:
        text = lemmatize(text)

    return text


def lemmatize(text):
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    return text

## Utils functions

In [9]:
def print_metrics(y_pred, y_test, title: str = "Confusion Matrix"):
    from sklearn.metrics import matthews_corrcoef

    print(f"Reports for {title}")
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred,average='weighted')}")
    print(f"Recall: {recall_score(y_test, y_pred,average='weighted')}")
    print(f"F1: {f1_score(y_test, y_pred,average='weighted')}")
    print(f"Metthew corr: {matthews_corrcoef(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    # cm = confusion_matrix(y_test, y_pred)
    # sns.heatmap(cm, annot=True, fmt="g")
    # plt.title(title)
    # plt.xlabel("Predicted")
    # plt.ylabel("True")
    # plt.show()

In [35]:
def split_dataset(data):
    """Function to split dataset into train, val and test"""
    # np.random.seed(112)
    # df_train, df_val, df_test = np.split(
    #     data.sample(frac=1, random_state=42),
    #     [int(0.8 * len(data)), int(0.9 * len(data))],
    # )

    # # Print dimensions
    # print(len(df_train), len(df_val), len(df_test))

    # Split data into train (80%) and temp_test (20%)
    df_train, temp_test = train_test_split(
        data, test_size=0.2, random_state=42, stratify=data["target"]
    )

    # Split temp_test into validation (50% of temp_test, 10% of total) and test (50% of temp_test, 10% of total)
    df_val, df_test = train_test_split(
        temp_test, test_size=0.5, random_state=42, stratify=temp_test["target"]
    )

    # Print dimensions to verify
    print(
        f"Train size: {len(df_train)}, Validation size: {len(df_val)}, Test size: {len(df_test)}"
    )

    return df_train, df_val, df_test

## Traditional algorithms

### Data Colab

In [11]:
current_directory = os.getcwd()

parent_directory = os.path.dirname(current_directory)

DATA_PATH = os.path.join(parent_directory, "data")
DATA_TOURNAMENT = os.path.join(DATA_PATH, "tournament_hints_data.parquet")

### Data


In [12]:
current_directory = os.getcwd()

parent_directory = os.path.dirname(current_directory)

DATA_PATH = os.path.join(parent_directory,"data")
DATA_TOURNAMENT = os.path.join(DATA_PATH,"tournament_hints_data.parquet")

print("Current Directory:", current_directory)
print("Parent Directory:", parent_directory)

Current Directory: /Users/marianluca/Projects/HackingBigNumbers/Tournament/marian
Parent Directory: /Users/marianluca/Projects/HackingBigNumbers/Tournament


In [13]:
df = pd.read_parquet(DATA_TOURNAMENT)
df

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies
...,...,...,...,...,...
626241,Global Golf Tech Solutions,Manufacturing | Golf Academy | G Launch Monito...,The best personal golf launch monitor screen p...,Global Golf Tech Solutions is a company that s...,Golf Courses & Country Clubs
626242,Renko,Latest Processing Technologies | E Gaskets for...,EPDM Rubber products from Renko can be found o...,RENKO is a company that has been producing hig...,Fabricated Rubber Products
626243,Norstal,Residential Buildings | Custom Project Service...,Norstal produces a broad range of steel struct...,Norstal is a steel structure producer that spe...,Metal Fabrication Services
626244,Acoustic,Wood-based Acoustic Products Manufacturer | De...,"We are designed and manufactured in UAE, Acous...",Acoustic.ae is a member of a UAE-based group o...,Building Material Manufacturers


In [14]:
columns_round = set(df.columns)
columns_round.remove("main_business_category")
print(columns_round)

{'business_tags', 'description', 'commercial_name', 'short_description'}


In [36]:
from sklearn.preprocessing import LabelEncoder

# Creating an instance of LabelEncoder
encoder = LabelEncoder()

# Fitting the encoder
encoder.fit(df["main_business_category"].unique())

# Transforming the data
df["target"] = encoder.transform(df["main_business_category"])
# print("Encoded data:", encoded_data)

# Inverse transforming the data
# decoded_data = encoder.inverse_transform(encoded_data)
# print("Decoded data:", decoded_data)

In [49]:
def remove_rare_classes(df, target_column, more_than:int = 1):
    """Remove rows where the target class has only one occurrence.

    Args:
        df (DataFrame): The dataset to be filtered.
        target_column (str): The column name of the target variable.

    Returns:
        DataFrame: Filtered dataset without rare classes.
    """
    # Calculate the count of each class in the target column
    value_counts = df[target_column].value_counts()

    # Identify classes where the count is more than 1
    classes_to_keep = value_counts[value_counts > more_than].index

    # Filter the DataFrame to keep only rows with classes that have more than one occurrence
    filtered_df = df[df[target_column].isin(classes_to_keep)]

    print(
        f"From a total of {len(df[target_column].unique())} it remains {len(filtered_df[target_column].unique())} classes. So {len(df[target_column].unique()) - len(filtered_df[target_column].unique())} was deleted"
    )

    return filtered_df


# Example usage:
df_filtered = remove_rare_classes(df, 'target',5)

From a total of 569 it remains 551 classes. So 18 was deleted


In [50]:
df_train, df_val, df_test = split_dataset(df_filtered)

Train size: 500955, Validation size: 62619, Test size: 62620


In [None]:
# df_filtered["clean_text"] = df_filtered["Tweet"].apply(lambda x: clean_text(x))

### Algorithms


In [53]:
def apply_bow(X_train, X_test):
    # count_vect = CountVectorizer()

    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf_transformer = TfidfVectorizer()

    # X_train = count_vect.fit_transform(X_train)
    X_train = tfidf_transformer.fit_transform(X_train)

    # X_test = count_vect.transform(X_test)
    X_test = tfidf_transformer.transform(X_test)

    return X_train, X_test


def apply_traditional_algorithms(df_train, df_test, pipeline_cls, threshold=0.5):

    X_train, y_train = df_train["clean_text"], df_train["target"]
    X_test, y_test = df_test["clean_text"], df_test["target"]

    X_train, X_test = apply_bow(X_train, X_test)

    best_model, best_model_name = None, None
    base_f1_score = 0
    for Name, cls in pipeline_cls.items():
        cls.fit(X_train, y_train)
        y_proba = cls.predict_proba(X_test)
        y_pred = (y_proba[:, 1] >= threshold).astype(int)

        if f1_score(y_test, y_pred, average="macro") > base_f1_score:
            base_f1_score = f1_score(y_test, y_pred, average="macro")
            best_model = cls
            best_model_name = Name


        print(Name)
        print(cls)
        print_metrics(y_pred, y_test, Name + " - Text")
        
        print(f"Test set F1-score: {f1_score(y_test, y_pred, average='macro')}")
        print(f"Test set Precision {precision_score(y_test,y_pred,average='macro')}")
        print(f"Test set Recall {recall_score(y_test, y_pred,average='macro')}")
        print("\n\n")
        print(
            "-----------------------------------------------------------------------------------------------------------------------"
        )

    return best_model, best_model_name, base_f1_score

In [54]:
pipeline_classifiers = {
    "RandomForestClassifier": RandomForestClassifier(class_weight="balanced"),
    "XGBClassifier": XGBClassifier(scale_pos_weight=1, use_label_encoder=False, eval_metric='mlogloss'),
    "SVC": SVC(class_weight="balanced", probability=True),
    "Logistic Regression": LogisticRegression(),
}

In [58]:
print(columns_round)

{'business_tags', 'description', 'commercial_name', 'short_description'}


In [59]:
for i, col_name in enumerate(columns_round):
    print("#####################################################################")
    print(f"######################## Round {i} #################################")
    print(f"######################## {col_name} #################################")

    df_tmp_train, df_tmp_test = (
        df_val[[col_name, "target"]],
        df_test[[col_name, "target"]],
    )
    # df_tmp_train.rename(columns={col_name: "clean_text"}, inplace=True)
    # df_tmp_test.rename(columns={col_name:"clean_text"},inplace=True)

    df_tmp_train["clean_text"] = df_filtered[col_name].apply(lambda x: clean_text(x))
    df_tmp_test["clean_text"] = df_filtered[col_name].apply(lambda x: clean_text(x))

    apply_traditional_algorithms(df_tmp_train, df_tmp_test, pipeline_classifiers)

#####################################################################
######################## Round 0 #################################
######################## business_tags #################################


## Bert

In [None]:
import torch.nn as nn
from transformers import BertTokenizer, BertModel


class BERTClass(nn.Module):

    def __init__(
        self,
        bert_model: str = "bert-base-uncased",
        num_classes: int = 2,
        droput_rate: int = 0.3,
    ):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained(bert_model, return_dict=True)
        self.dropout = nn.Dropout(droput_rate)
        self.linear = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)

        return output