In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


import xgboost as xgb
from joblib import dump
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import threading
import statistics
import re
from nltk.corpus import stopwords
import scipy.stats as stats
import matplotlib.pyplot as plt
import threading
import numpy as np

In [None]:
def tokenization(input):
    """Split every entry of ``input`` on whitespace and flatten the pieces.

    Args:
        input: A ``list`` or ``numpy.ndarray`` whose entries expose ``.split()``
            (strings in practice).

    Returns:
        A one-dimensional ``numpy.ndarray`` containing every token, in the
        order the entries (and the tokens inside them) were encountered.
    """
    assert type(input) in (list, np.ndarray)

    # Split first, flatten second, so a failing ``.split()`` surfaces before
    # any token is collected.
    pieces_per_entry = [entry.split() for entry in input]
    flattened = [
        token
        for pieces in pieces_per_entry
        if type(pieces) == list
        for token in pieces
    ]

    return np.array(flattened)

def stop_words_purge(stored_tokens):
    """Lower-case each token group and drop English and Spanish stop words.

    Args:
        stored_tokens: A ``list`` or ``numpy.ndarray`` whose entries are
            iterables of string tokens. Each entry is iterated item by item,
            so a plain string entry would be processed character by character.

    Returns:
        A list with one filtered token list per entry of ``stored_tokens``.
    """
    def remove_stopwords_from_text(tokens, stop_words_applied):
        # Keep the tokens whose lower-cased form is not in the stop-word set.
        assert type(tokens) in (list, np.ndarray)
        token_array = tokens if type(tokens) == np.ndarray else np.array(tokens)
        return [tok for tok in token_array if tok.lower() not in stop_words_applied]

    assert type(stored_tokens) in (list, np.ndarray)
    # Local debug switch; set to 1 to print the stop-word lists and progress.
    dev_mode = 0
    verbose = dev_mode == 1

    english_stop_words = set(stopwords.words("English"))
    spanish_stop_words = set(stopwords.words("Spanish"))
    if verbose:
        print("<------------->")
        print("Purging of spanish and english stop words in progress...")
        print(f"English stop words(length: {len(english_stop_words)}): {english_stop_words}")
        print(f"Spanish stop words(length: {len(spanish_stop_words)}): {spanish_stop_words}")

    post_purge = []
    for tokens in stored_tokens:
        lowered = [token.lower() for token in tokens]
        # English pass first, then the Spanish pass on what survived.
        without_english = remove_stopwords_from_text(lowered, english_stop_words)
        post_purge.append(remove_stopwords_from_text(without_english, spanish_stop_words))

    if verbose:
        print("Purge of Spanish and english stop words completed...")
        print("<------------->")

    return post_purge

def replace_additional_dollar_signs(text):
    """Keep the first ``$`` in ``text`` and delete every later one.

    Args:
        text: The string to clean; must be exactly of type ``str``.

    Returns:
        ``text`` unchanged when it holds no ``$``; otherwise the text up to
        and including the first ``$`` followed by the remainder with all
        further ``$`` characters removed.
    """
    assert type(text) is str

    head, first_dollar, tail = text.partition('$')
    if not first_dollar:
        # No dollar sign at all: nothing to strip.
        return text

    return head + first_dollar + re.sub(r"\$", "", tail)

def dataframe_generator(description_column, results_column, c1, c2, column_name, developer_mode):
    """Build a two-column DataFrame and save it as a CSV file.

    The file is written relative to the current working directory and is named
    ``"<column_name.capitalize()> data analysis results.csv"``. The DataFrame
    index is written as the first CSV column.

    Args:
        description_column: ``list`` or ``numpy.ndarray`` with the values of
            the first column.
        results_column: ``list`` or ``numpy.ndarray`` with the values of the
            second column; must have the same length as ``description_column``.
        c1: Header of the first column.
        c2: Header of the second column.
        column_name: Label used to build the output file name.
        developer_mode: ``1`` prints the generated file name; any other int is
            silent.

    Returns:
        None. The only effect is the CSV file (and the optional print).

    Raises:
        AssertionError: If an argument has an unexpected type or the two
            columns differ in length.
    """
    def data_cataloging(mutation_col_1, mutation_col_2, data_logging_01, data_logging_02):
        # Assemble the DataFrame from two header names and their value sequences.
        assert(type(mutation_col_1) == str and type(mutation_col_2) == str)
        assert(type(data_logging_01) == list or type(data_logging_01) == np.ndarray)
        # Fixed: this check used to test data_logging_01 on its left-hand side,
        # so an ndarray first column paired with a list second column was
        # rejected even though both are supported.
        assert(type(data_logging_02) == list or type(data_logging_02) == np.ndarray)

        return pd.DataFrame({
            mutation_col_1: data_logging_01,
            mutation_col_2: data_logging_02
        })

    assert(type(description_column) == list or type(description_column) == np.ndarray)
    assert(type(results_column) == list or type(results_column) == np.ndarray)
    assert(type(c1) == str)
    assert(type(c2) == str)
    assert(type(column_name) == str)
    assert(type(developer_mode) == int)
    assert(len(description_column) == len(results_column))

    dataframe = data_cataloging(c1, c2, description_column, results_column)
    file_name = f"{column_name.capitalize()} data analysis results.csv"

    # Save the result as a CSV file (index included).
    dataframe.to_csv(file_name, index=True)

    if developer_mode == 1:
        print(f"File name: {file_name}")

def enhanced_diagnostics(column_name, input_data, developer_mode):
    """Summarise sign and uniqueness counts of ``input_data`` and export them.

    The summary (unique count, unique percentage, zeros, negatives, positives,
    total) is handed to ``dataframe_generator`` under the name
    ``"<column_name> enhanced diagnostics"`` in upper case.

    Args:
        column_name: Label of the analysed column.
        input_data: ``list`` or ``numpy.ndarray`` of values comparable to ``0``.
        developer_mode: ``1`` additionally prints every metric after export.

    Returns:
        None.
    """
    assert type(column_name) == str
    assert type(input_data) in (list, np.ndarray)
    assert type(developer_mode) == int
    samples = input_data if type(input_data) == np.ndarray else np.array(input_data)

    # A value lands in at most one bucket; values that are neither ==, > nor < 0
    # (e.g. NaN) are counted in none of them.
    zeros = sum(1 for sample in samples if sample == 0)
    positives = sum(1 for sample in samples if sample > 0)
    negatives = sum(1 for sample in samples if sample < 0)

    total = len(samples)
    unique_total = len(set(samples))
    percentage_of_unique = (unique_total / total)*100

    updated_name = column_name + " enhanced diagnostics"
    description = [
        "Number of unique values",
        "Percentage of unique values",
        "Zeros",
        "Negatives",
        "Positives",
        "Total number of raw input values",
    ]
    outputs = [unique_total, f"{percentage_of_unique}%", zeros, negatives, positives, total]
    dataframe_generator(description, outputs, "Analysis metric", "Result", updated_name.upper(), developer_mode)

    if developer_mode == 1:
        for label, value in zip(description, outputs):
            print(f"{label}: {value}")

def numeric_analysis_arm(column_name, input_data, developer_mode):
    """Compute descriptive statistics for ``input_data`` and export them.

    Every statistic except the skew is rounded to four decimals. The results
    are passed to ``dataframe_generator`` under ``column_name`` in upper case.

    Args:
        column_name: Label of the analysed column.
        input_data: ``list`` or ``numpy.ndarray`` of numeric values.
        developer_mode: ``1`` prints every statistic before the export.

    Returns:
        None.
    """
    assert type(column_name) == str
    assert type(input_data) in (list, np.ndarray)
    assert type(developer_mode) == int
    samples = input_data if type(input_data) == np.ndarray else np.array(input_data)

    analysis_description = ["Maximum", "Minimum", "Mean", "Median", "Mode", "Standard deviation", "Range", "Skew", "Kurtosis", "Variance"]
    highest = np.max(samples)
    lowest = np.min(samples)
    analysis_results = [
        round(highest, 4),
        round(lowest, 4),
        round(np.mean(samples), 4),
        round(np.median(samples), 4),
        round(statistics.mode(samples), 4),
        round(np.std(samples), 4),
        round(highest - lowest, 4),
        stats.skew(samples),  # reported unrounded, unlike its neighbours
        round(stats.kurtosis(samples), 4),
        round(statistics.variance(samples), 4),
    ]

    if developer_mode == 1:
        for label, value in zip(analysis_description, analysis_results):
            print(f"{label}: {value}")

    dataframe_generator(analysis_description, analysis_results, "Analysis metric", "Result", column_name.upper(), developer_mode)

def remove_emojis(text):
    """Return ``text`` with every character matched by the emoji pattern removed.

    Args:
        text: The string to clean. ``str`` subclasses such as ``numpy.str_``
            (produced when iterating a NumPy string array) are accepted.

    Returns:
        The text with all matched characters deleted.

    Raises:
        AssertionError: If ``text`` is not a string.
    """
    # Fixed: the previous ``assert(type(text))`` was always true (a type object
    # is truthy), so non-string input was only caught later by ``re``.
    assert isinstance(text, str)
    # Regex pattern to match all emojis
    # NOTE(review): the last range (U+24C2-U+1F251) is far wider than emoji and
    # also covers e.g. CJK ideographs (U+4E00-U+9FFF), so such characters are
    # removed too. Left unchanged here; confirm whether that is intended.
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U0001F700-\U0001F77F"  # alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                           u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                           u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                           u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                           u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                           u"\U00002702-\U000027B0"  # Dingbats
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def link_mention_purge(text):
    """Strip URLs, @mentions and every non-letter character from ``text``.

    Args:
        text: The string to clean. ``str`` subclasses such as ``numpy.str_``
            are accepted.

    Returns:
        The cleaned text: only ASCII letters remain, separated by single
        spaces, with no leading or trailing whitespace.

    Raises:
        AssertionError: If ``text`` is not a string.
    """
    # Fixed: ``type(text) == str`` rejected ``numpy.str_`` values, which is what
    # iterating a NumPy string array yields (e.g. when a plain list of tweets
    # is converted with ``np.array`` before being cleaned).
    assert isinstance(text, str)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove Mentions
    text = re.sub(r'@\w+', '', text)
    # Remove everything except letters and necessary whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def inpurity_purging_protocol(input_storage):
    """Clean every text in ``input_storage`` and return the results as an array.

    Each entry goes through ``link_mention_purge`` and ``remove_emojis``,
    followed by a fixed, ordered series of literal and regex replacements.

    Args:
        input_storage: ``list`` or ``numpy.ndarray`` of texts.

    Returns:
        A ``numpy.ndarray`` with one cleaned string per input entry.
    """
    assert type(input_storage) in (list, np.ndarray)
    texts = input_storage if type(input_storage) == np.ndarray else np.array(input_storage)

    # Literal replacements, applied strictly in this order.
    # NOTE(review): link_mention_purge already drops every non-letter character,
    # so most of these probably never match - confirm before pruning.
    first_pass = (("..", ""), ('"', ""), (";)", ""), ("*", ""))
    second_pass = (("  ", " "), ("!!", "!"), ("!!!", "!"))
    third_pass = ((",,", ""), ("=(", ""), ("=>", ""), (" .", ""), ("!", ""))

    cleared = []
    for raw_text in texts:
        text = link_mention_purge(raw_text)
        text = remove_emojis(text).replace("(", "").replace(")", "").strip('"')
        # Put a space between a word character and trailing punctuation.
        text = re.sub(r"(\w)([,.!?;:()-])", r"\1 \2", text)
        for old, new in first_pass:
            text = text.replace(old, new)
        text = re.sub(r"@\w+", " ", text)
        for old, new in second_pass:
            text = text.replace(old, new)
        text = replace_additional_dollar_signs(text)
        for old, new in third_pass:
            text = text.replace(old, new)
        cleared.append(text)

    return np.array(cleared)

In [None]:
def remove_stopwords_from_text(tokens, stop_words_applied):
    """Return the tokens whose lower-cased form is not a stop word.

    Args:
        tokens: ``list`` or ``numpy.ndarray`` of string tokens.
        stop_words_applied: Container of lower-case stop words to drop.

    Returns:
        A list with the surviving tokens in their original order and casing.
    """
    assert type(tokens) in (list, np.ndarray)
    # Lists are converted to an array first, as elsewhere in this file.
    token_array = tokens if type(tokens) == np.ndarray else np.array(tokens)

    return [tok for tok in token_array if tok.lower() not in stop_words_applied]

def ml_diagnostics(y_test, predictions):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: ``list`` or ``numpy.ndarray`` with the true labels.
        predictions: ``list`` or ``numpy.ndarray`` with the predicted labels.

    Returns:
        None. All output goes to stdout; extra remarks are printed when
        accuracy and precision both exceed 0.9 (and again when recall and F1
        both exceed 0.8 on top of that).
    """
    for labels in (y_test, predictions):
        assert type(labels) in (list, np.ndarray)

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    details = ["Accuracy", "Precision", "Recall", "F1"]
    results = [round(metric, 4) for metric in (accuracy, precision, recall, f1)]
    print("Diagnostic metrics")
    for label, score in zip(details, results):
        print(f"{label}: {score}")

    # Nothing more to say unless both headline metrics clear 0.9.
    if not (accuracy > 0.9 and precision > 0.9):
        return

    print()
    print(f"Go touch come grass. You got accuracy to reach {round(accuracy, 4)} and precision to reach {round(precision, 4)}")
    if recall > 0.8 and f1 > 0.8:
        print("Bruv. Go live life outside. This is already accurate like you had OCD writing this.")
        print(f"Recall is {round(recall, 4)} and F1 score is {round(f1, 4)}. Go touch some grass. Seriously.")

def gradient_booster(X, y):
    """Train an XGBoost classifier on an 80/20 split of ``X`` and ``y``.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and XGBoost.
        y: Labels aligned with the rows of ``X``.

    Returns:
        A tuple ``(y_test, predictions, clf)``: the held-out labels, the
        model's predictions for the held-out rows, and the fitted classifier.
    """
    # Fixed seed keeps the split reproducible between runs.
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    booster = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100)
    booster.fit(X_train, y_train)
    held_out_predictions = booster.predict(X_test)

    return y_test, held_out_predictions, booster

def upgraded_gradient_booster(X, y):
    """Train AdaBoost over decision stumps on an 85/15 split of ``X`` and ``y``.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and scikit-learn.
        y: Labels aligned with the rows of ``X``; converted to a NumPy array
            when given as anything else.

    Returns:
        A tuple ``(y_test, predictions, ada_clf)``: the held-out labels, the
        model's predictions for the held-out rows, and the fitted classifier.
    """
    import inspect

    if type(y) != np.ndarray:
        y = np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    # Baseline classifier: a depth-1 decision tree (stump).
    base_clf = DecisionTreeClassifier(max_depth=1)

    # Newer scikit-learn releases renamed ``base_estimator`` to ``estimator``
    # and later dropped the old name; pick whichever keyword the installed
    # version understands so the call works on both sides of the rename.
    ada_parameters = inspect.signature(AdaBoostClassifier).parameters
    estimator_keyword = "estimator" if "estimator" in ada_parameters else "base_estimator"

    # AdaBoost paired with the baseline classifier.
    # NOTE(review): recent scikit-learn releases also deprecate/remove the
    # "SAMME.R" algorithm option - verify against the installed version. It is
    # kept here so results stay comparable with the recorded benchmark.
    ada_clf = AdaBoostClassifier(
        n_estimators=50,
        algorithm="SAMME.R",
        learning_rate=0.666,
        **{estimator_keyword: base_clf},
    )
    ada_clf.fit(X_train, y_train)
    predictions = ada_clf.predict(X_test)

    return y_test, predictions, ada_clf

Purification and analysis

In [2]:
# Run switch: the purification/analysis cell below only executes when this equals 1.
purification_analysis_deploy = 0

In [4]:
if purification_analysis_deploy == 1:
    # Purification and analysis stage: load the translated tweets, clean the
    # text, summarise the label balance and token frequencies, and optionally
    # export the results as CSV files.
    # Uses the english translations of the tweets
    file_path = r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\tranlated_tweets.csv"
    source_data = pd.read_csv(file_path)
    developer_mode = 1
    loop_viewer = 0

    print("<------------------>")
    column_names = list(source_data.columns)
    column_data_types_detected = []
    object_data_columns = []
    analysis_compatible_columns = []

    # Record the dtype of every column; int64 columns are treated as
    # analysis-compatible, object columns are collected separately.
    for c_names in column_names:
        column_data = source_data[c_names].to_numpy()
        types_detected = column_data.dtype
        column_data_types_detected.append(types_detected)
        if types_detected == "int64":
            analysis_compatible_columns.append(c_names)
        elif types_detected == "object":
            object_data_columns.append(c_names)

    if developer_mode == 1:
        for detected_name, detected_type in zip(column_names, column_data_types_detected):
            print(f"Column name: {detected_name} | Data type detected: {detected_type}")
    elif developer_mode == 0:
        print("Developer mode inactive")
    print("<------------------>")

    text_isolation = source_data["Description Cleaned Translated"].to_numpy()
    text_isolation = inpurity_purging_protocol(text_isolation)
    # Order-preserving de-duplication in one pass (was a quadratic list scan).
    unique_locations = list(dict.fromkeys(text_isolation))

    target_isolation = source_data["Analysis results"].to_numpy()
    numbers_detected = list(dict.fromkeys(target_isolation))

    if len(target_isolation) == len(text_isolation):
        print(f"Target isolated data: {len(target_isolation)}")
        print(f"Text isolated data: {len(text_isolation)}")

    # Pair every cleaned tweet with its tag and count the two label values.
    zero = 0
    one = 0
    asociation_text = []
    asociation_target = []
    if len(target_isolation) == len(text_isolation):
        asociation_text = list(text_isolation)
        asociation_target = list(target_isolation)
        zero = sum(1 for tag in target_isolation if tag == 0)
        one = sum(1 for tag in target_isolation if tag == 1)

    full_set = one + zero
    if developer_mode == 1:
        print(f"Flagged: {one}")
        print(f"Cleared: {zero}")
        print(f"Full set: {full_set}")

    # Fixed: with no counted labels the percentages are the sentinel string
    # "Infinity", and the old code then called round() on that string, which
    # raises TypeError. The display text is now built per case.
    if full_set != 0:
        zero_percentage = (zero / full_set) * 100
        one_percentage = (one / full_set) * 100
        zero_percentage_text = f"{round(zero_percentage, 2)} %"
        one_percentage_text = f"{round(one_percentage, 2)} %"
    else:
        zero_percentage = "Infinity"
        one_percentage = "Infinity"
        zero_percentage_text = zero_percentage
        one_percentage_text = one_percentage

    if one > zero:
        description = ["Cleared", "Flagged", "Percentage ratio of Cleared", "Percentage ratio of Flagged"]
        results = [zero, one, zero_percentage_text, one_percentage_text]
    else:
        description = ["Cleared", "Flagged", "Percentage ratio of flagged", "Percentage ratio of Cleared"]
        results = [zero, one, one_percentage_text, zero_percentage_text]

    # Token analysis. NOTE: no stop-word purge is applied in this cell, so the
    # counts below include stop words.
    text_isolation = text_isolation.flatten()
    tokenized_conversion = tokenization(text_isolation)

    # Emoji removal, token by token
    post_purge_storage = [remove_emojis(pre_purge) for pre_purge in tokenized_conversion]

    # Unique tokens and how often each one occurs
    unique_tokens, counts = np.unique(post_purge_storage, return_counts=True)

    max_value = max(counts)
    # Position of the first token that reaches the maximum count.
    index_coordinates = int(np.argmax(counts))

    # Out of the box random analysis
    print(f"Max value: {max_value}")
    print(f"Index coordinate of max value: {index_coordinates}")
    # Fixed: the index refers to unique_tokens/counts, not to the raw token
    # list, so the most frequent token has to be read from unique_tokens.
    print(f"Token at corresponding coordinate: {unique_tokens[index_coordinates]}")

    # Zips everything into a dictionary
    token_counts = dict(zip(unique_tokens, counts))

    # Prepare for dataframe generation
    token_keys = list(token_counts.keys())
    token_values = list(token_counts.values())
    token_des_column = ["Unique tokens", "Total tokens", "Percentage of unique tokens"]
    token_res_column = [len(unique_tokens), sum(counts), f"{(len(unique_tokens) / sum(counts) * 100)} %"]

    # Enable this multithread deployment to get the csv files
    # I disabled it because its not required anymore
    multithreading_deployment = 0
    if multithreading_deployment == 1:
        if __name__ == "__main__":
            # Fixed: the exports used to be *called* while building the Thread
            # objects (target=dataframe_generator(...)), so they ran serially
            # and every thread received target=None. The callable and its
            # arguments are now passed separately.
            export_jobs = [
                (asociation_text, asociation_target, "Cleaned tweet", "Tags", "Purification verification"),
                (description, results, "Cleared/Flagged quantity", "Analysis output", "Tags analysis"),
                (token_des_column, token_res_column, "Token details", "Token count", "Token analysis"),
                (token_keys, token_values, "Individual words", "Occurrences", "Frequency of token usage"),
            ]
            threads = [
                threading.Thread(
                    target=dataframe_generator,
                    args=(job_description, job_results),
                    kwargs={"c1": job_c1, "c2": job_c2, "column_name": job_name, "developer_mode": developer_mode},
                )
                for job_description, job_results, job_c1, job_c2, job_name in export_jobs
            ]
            for individual_threads in threads:
                individual_threads.start()
            for initiated_threads in threads:
                initiated_threads.join()

Gradient boosting experiment

In [5]:
# Run switch: the gradient boosting cell below only executes when this equals 1.
Gradient_boosting_test = 0

In [None]:
if Gradient_boosting_test == 1:
    # Gradient boosting experiment: tokenize the purified tweets, drop stop
    # words, lemmatize, vectorize with TF-IDF, then train and save a model.
    purified_data = pd.read_csv(r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\Results\Token based\Post_purging\38\Purification verification data analysis results.csv")
    # Tweets that were cleaned down to an empty string are read back from the
    # CSV as NaN, which word_tokenize cannot process; drop those rows so the
    # texts and tags stay aligned.
    purified_data = purified_data.dropna(subset=["Cleaned tweet"])

    cleaned_tweet = purified_data["Cleaned tweet"].to_numpy()
    tags = purified_data["Tags"].to_numpy()

    # Tokenization
    token_conversion = [word_tokenize(tweets) for tweets in cleaned_tweet]

    print("Tokenization complete")
    # Preview at most three tweets (slicing avoids an IndexError on tiny files).
    for preview_tokens in token_conversion[:3]:
        print(preview_tokens)

    # Stopwords purge
    english_stop_words = set(stopwords.words("English"))
    spanish_stop_words = set(stopwords.words("Spanish"))
    print("<------------->")
    print("Purging of spanish and english stop words in progress...")
    print(f"English stop words(length: {len(english_stop_words)}): {english_stop_words}")
    print(f"Spanish stop words(length: {len(spanish_stop_words)}): {spanish_stop_words}")

    # English stopwords removal
    stage_0 = [
        remove_stopwords_from_text(stage_0_element, english_stop_words)
        for stage_0_element in token_conversion
    ]

    # Spanish stop words removal
    stage_1 = [
        remove_stopwords_from_text(stage_1_elements, spanish_stop_words)
        for stage_1_elements in stage_0
    ]
    print("Purge of Spanish and english stop words completed...")
    print("<------------->")

    # Lemmatization: keep alphabetic, non-stop-word tokens in lemmatized form
    lemmatizer = WordNetLemmatizer()
    lamminized_tokens = []
    for post_processed_tokens in stage_1:
        processed_tokens = []
        for token in post_processed_tokens:
            if token not in english_stop_words and token not in spanish_stop_words and token.isalpha():
                # Lemmatize the token and append to the result list
                lemmatized_token = lemmatizer.lemmatize(token)
                processed_tokens.append(lemmatized_token)
        lamminized_tokens.append(processed_tokens)

    # TF-IDF works on strings, so re-join every token list into one document.
    preprocessed_texts = [" ".join(tokens) for tokens in lamminized_tokens]

    tfidf_vectorizer = TfidfVectorizer()
    X = tfidf_vectorizer.fit_transform(preprocessed_texts)
    y = tags
    print("...")
    print(f"X type: {type(X)}")
    print(f"y type: {type(y)}")
    print("...")

    # standard == 1 trains the XGBoost baseline, standard == 0 the AdaBoost model.
    standard = 0
    if standard == 1:
        print("Baseline deployed")
        y_test_output, predicted_results, clf = gradient_booster(X, y)
        ml_diagnostics(y_test_output, predicted_results)
        dump(clf, "Standard_baseline_boosted_model.joblib")
    elif standard == 0:
        print("Upgraded baseline deployed")
        y_test_2_output, predicted_2, ada_clf = upgraded_gradient_booster(X, y)
        ml_diagnostics(y_test_2_output, predicted_2)
        dump(ada_clf,"ada_boosted_model.joblib")


"""
Benchmark(label encoder = True)
--------------
Diagnostic metrics
Accuracy: 0.74
Precision: 0.7405405405405405
Recall: 0.7098445595854922
F1: 0.7248677248677249

Alternative alteration
n_estimators=100, learning_rate=0.1, max_depth=5
------------------
Diagnostic metrics
Accuracy: 0.76
Precision: 0.7975460122699386
Recall: 0.6735751295336787
F1: 0.7303370786516852

Benchmark to beat
----------
Upgraded baseline deployed
Diagnostic metrics
Accuracy: 0.9133333333333333
Precision: 0.9466666666666667
Recall: 0.8875
F1: 0.9161290322580645
"""

Random forest test

In [6]:
import time

import nltk
from sklearn.ensemble import RandomForestClassifier
start_time = time.time()

nltk.download('stopwords')
nltk.download('wordnet')

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, recall_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA, TruncatedSVD
from nltk.tokenize import word_tokenize
import pandas as pd

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
def binary_classifier(X, y):
    """Train a random forest on an 85/15 split and score it on the held-out part.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and scikit-learn.
        y: Binary labels aligned with the rows of ``X``; converted to a NumPy
            array when given as anything else.

    Returns:
        A tuple ``(accuracy, precision, recall, f1, prediction, model)`` where
        the four scores are computed on the held-out rows, ``prediction`` holds
        the predicted labels for those rows and ``model`` is the fitted forest.
    """
    labels = y if type(y) == np.ndarray else np.array(y)
    features = X

    # Local switch: set to 1 to reduce the features to 78 SVD components first.
    upgrade_deployment = 0
    if upgrade_deployment == 1:
        reducer = TruncatedSVD(n_components=78)
        features = reducer.fit_transform(features)

    # Split the data into training and testing sets (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.15, random_state=42)

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    held_out_prediction = forest.predict(X_test)

    # Same four metrics, in the same order, as the original nested helper.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    accuracy_metric, precision_metric, recall_metric, f1_metric = (
        scorer(y_test, held_out_prediction) for scorer in scorers
    )
    return accuracy_metric, precision_metric, recall_metric, f1_metric, held_out_prediction, forest

def remove_stopwords_from_text(tokens, stop_words_applied):
    """Return the tokens whose lower-cased form is not in ``stop_words_applied``.

    Fixed: this version used to consult the module-level name
    ``spanish_stop_words`` as well, which raised ``NameError`` whenever the
    function was called before a pipeline cell had defined that global. It now
    depends only on its arguments. The pipeline in this file runs a separate
    Spanish pass right after the English one, so its end result is unchanged.

    Args:
        tokens: ``list`` or ``numpy.ndarray`` of string tokens.
        stop_words_applied: Container of lower-case stop words to drop.

    Returns:
        A list with the surviving tokens in their original order and casing.

    Raises:
        AssertionError: If ``tokens`` is neither a list nor a NumPy array.
    """
    assert(type(tokens) == list or type(tokens) == np.ndarray)
    if type(tokens) != np.ndarray:
        tokens = np.array(tokens)

    return [token for token in tokens if token.lower() not in stop_words_applied]

In [8]:
# Run switch: the random forest cell below only executes when this equals 1.
random_forrest_classifier = 0

In [None]:
if random_forrest_classifier == 1:
    # Random forest experiment: tokenize the purified tweets, drop stop words,
    # lemmatize, vectorize with TF-IDF, train, save the model and report.
    # This section re-imports its dependencies, but joblib's dump was missing.
    from joblib import dump

    purified_data = pd.read_csv(r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\Results\Token based\Post_purging\32\Purification verification data analysis results.csv")
    # Tweets that were cleaned down to an empty string are read back from the
    # CSV as NaN, which word_tokenize cannot process; drop those rows so the
    # texts and tags stay aligned.
    purified_data = purified_data.dropna(subset=["Cleaned tweet"])

    cleaned_tweet = purified_data["Cleaned tweet"].to_numpy()
    # The export step in this file writes the label column as "Tags", while
    # this cell originally read "Tag"; accept either header.
    tag_column = "Tags" if "Tags" in purified_data.columns else "Tag"
    tags = purified_data[tag_column].to_numpy()

    # Tokenization
    token_conversion = [word_tokenize(tweets) for tweets in cleaned_tweet]

    print("Tokenization complete")
    # Preview at most three tweets (slicing avoids an IndexError on tiny files).
    for preview_tokens in token_conversion[:3]:
        print(preview_tokens)

    # Stopwords purge
    english_stop_words = set(stopwords.words("English"))
    spanish_stop_words = set(stopwords.words("Spanish"))
    print("<------------->")
    print("Purging of spanish and english stop words in progress...")
    print(f"English stop words(length: {len(english_stop_words)}): {english_stop_words}")
    print(f"Spanish stop words(length: {len(spanish_stop_words)}): {spanish_stop_words}")

    # English stopwords removal
    stage_0 = [
        remove_stopwords_from_text(stage_0_element, english_stop_words)
        for stage_0_element in token_conversion
    ]

    # Spanish stop words removal
    stage_1 = [
        remove_stopwords_from_text(stage_1_elements, spanish_stop_words)
        for stage_1_elements in stage_0
    ]
    print("Purge of Spanish and english stop words completed...")
    print("<------------->")

    # Lemmatization: keep alphabetic, non-stop-word tokens in lemmatized form
    lemmatizer = WordNetLemmatizer()
    lamminized_tokens = []
    for post_processed_tokens in stage_1:
        processed_tokens = []
        for token in post_processed_tokens:
            if token not in english_stop_words and token not in spanish_stop_words and token.isalpha():
                # Lemmatize the token and append to the result list
                lemmatized_token = lemmatizer.lemmatize(token)
                processed_tokens.append(lemmatized_token)
        lamminized_tokens.append(processed_tokens)

    # TF-IDF works on strings, so re-join every token list into one document.
    preprocessed_texts = [" ".join(tokens) for tokens in lamminized_tokens]
    tfidf_vectorizer = TfidfVectorizer()
    X = tfidf_vectorizer.fit_transform(preprocessed_texts)
    y = tags
    print(f"X type: {type(X)}")
    print(f"y type: {type(y)}")
    accuracy_metric, precision_metric, recall_metric, f1_metric, predictions, model = binary_classifier(X, y)
    dump(model, "Random_forest_model.joblib")
    end_time = time.time()
    print(f"Accuracy: {round(accuracy_metric, 4)}")
    print(f"Precision: {round(precision_metric, 4)}")
    print(f"Recall: {round(recall_metric, 4)}")
    print(f"F1: {round(f1_metric, 4)}")
    # NOTE(review): start_time is set in the import cell, so this also counts
    # any time that passed between running that cell and this one.
    processing_time = end_time - start_time
    if processing_time < 60:
        print(f"Processing time: {processing_time} seconds")
    else:
        # Fixed: seconds / 60 yields minutes; the old code printed it as hours.
        minutes_conversion = processing_time / 60
        print(f"Processing time: {minutes_conversion} minutes")
"""
Benchmark to beat:
------------------
It meets the minimum standard of 0.76 <= X <= 1.0 on every evaluation metric
Accuracy: 0.79
Precision: 0.7943262411347518
Recall: 0.7671232876712328
F1: 0.7804878048780488
Predictions:
[0 1 1 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0
 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0
 1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 0 1 1
 0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1
 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1 1
 1 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 1
 0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1
 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1
 1 0 0 1]
Processing time: 4.091778993606567 seconds

Observation: Any attempt to increase accuracy makes these metrics decrease. Any assistance is welcome.
Note: The processing time fluctuates considerably, from 3.45 seconds to sometimes 8 seconds. This is not a concern as long as the processing time stays in the range of seconds.
"""