In [101]:
# Set up for google drive
from google.colab import drive
import os
gdrive_path='/content/gdrive/MyDrive/nlp/'

# This will mount your google drive under 'MyDrive'
drive.mount('/content/gdrive', force_remount=True)

os.chdir(gdrive_path)


Mounted at /content/gdrive


In [97]:
import pandas as pd
from pathlib import Path
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    f1_score,
    precision_score,
)
from scipy.spatial.distance import cdist

# Rule based Approach to Text Classification

In [27]:
base_path = '/content/gdrive/MyDrive/nlp'
project_dir = Path(base_path)

In [99]:
def load_data(embedding_type: str,data_set_type: str,set_type:str ) -> pd.DataFrame:
    training_data = load_pickle(
        project_dir / f"embeddings/{data_set_type}/{embedding_type}_{set_type}_{data_set_type}.pkl")

    return training_data

In [29]:
def load_pickle(file_path:Path) -> pd.DataFrame:
    with open(file_path, "rb") as file:
        return pickle.load(file)

## 1. Rule-based Approach - Cosinus similarity approach between process descripton and legal text

In [224]:
def calculate_cosine_similarity(vector1: np.ndarray, vector2:np.ndarray):
    vector1 = vector1.reshape(1, -1)
    vector2 = vector2.reshape(1, -1)

    return cosine_similarity(vector1, vector2)[0][0]

In [225]:
embedding_dims = {
            "glove": 300,
            "fasttext": 100,
            "word2vec": 100,
            "tfidf": 1000,
            "gpt": 1536,
            "bert": 768,
        }

In [226]:
columns_to_drop = [
                "Text",
                "Label",
                "Process",
                "Process_description",
                "Combined_Text",
            ]


In [297]:
for emb_type in [ "tfidf"]:
    training_data = load_data(emb_type,"separate", "train")
    result_data = training_data[["Text", "Label", "Process", "Process_description"]].copy()
    columns_to_drop = [
                "Text",
                "Label",
                "Process",
                "Process_description",
                "Combined_Text",
            ]

    embedding_df = training_data.drop(columns=columns_to_drop)

    dim = embedding_dims[emb_type]

    # Slice the DataFrame for process description embeddings
    embedding_proc_desc = embedding_df.iloc[:, :dim]

    # Slice the DataFrame for legal text embeddings
    embedding_legal_text = embedding_df.iloc[:, dim : dim * 2]


    result_data['Cosine_Similarity'] = training_data.apply(
        lambda row: calculate_cosine_similarity(
            row[embedding_proc_desc.columns].values,
            row[embedding_legal_text.columns].values
        ), axis=1
    )


In [228]:
# Group the data by 'Label' and 'Process' to calculate statistics for cosine similarity
grouped_label = result_data.groupby('Label')['Cosine_Similarity'].agg(['mean', 'min', 'max'])
grouped_process = result_data.groupby(['Process', 'Label'])['Cosine_Similarity'].agg(['mean', 'min', 'max' , 'count'])

# Display the aggregated statistics
grouped_label, grouped_process.reset_index()

(           mean  min       max
 Label                         
 0      0.051819  0.0  0.547664
 1      0.189265  0.0  0.825904,
                    Process  Label      mean       min       max  count
 0                   GDPR_1      0  0.045437  0.000000  0.149925     71
 1                   GDPR_1      1  0.255885  0.005897  0.382947      8
 2                   GDPR_2      0  0.102582  0.000000  0.319013    144
 3                   GDPR_2      1  0.181062  0.000000  0.362563     16
 4                   GDPR_3      0  0.049445  0.000000  0.232776     99
 5                   GDPR_3      1  0.202652  0.107737  0.257083     11
 6                   GDPR_4      0  0.069038  0.000000  0.436148     36
 7                   GDPR_4      1  0.220458  0.046173  0.496624      4
 8                   GDPR_5      0  0.031303  0.000000  0.120781     36
 9                   GDPR_5      1  0.154785  0.032809  0.361616      4
 10                  GDPR_6      0  0.076496  0.000000  0.240425     18
 11    

In [229]:
means_by_process = result_data.groupby(['Process', 'Label'])['Cosine_Similarity'].mean().unstack()

In [230]:
means_by_process

Label,0,1
Process,Unnamed: 1_level_1,Unnamed: 2_level_1
GDPR_1,0.045437,0.255885
GDPR_2,0.102582,0.181062
GDPR_3,0.049445,0.202652
GDPR_4,0.069038,0.220458
GDPR_5,0.031303,0.154785
GDPR_6,0.076496,0.334806
GDPR_7,0.043093,0.11864
Hiring Employee,0.034281,0.026895
Know Your Customer,0.02552,0.106151
SM2_1,0.052884,0.249147


Findings: Mean of unrelevant cosinus similarity is = 0.051819, while relevant is 0.189265. Also for almost each process the mean cosinus similarity is a higher for label 1. Only for Hiring Employee the cosinus is lower.

In [300]:
def classify_test_set(test_set, means_by_process:pd.DataFrame) -> pd.DataFrame:
    # Add a new column for predictions, default to 0
    test_set['Predicted_Label'] = 0

    # Loop over each row in the test set
    for index, row in test_set.iterrows():
        process = row['Process']
        # Corrected the dynamic column name
        cosine_similarity = row["Cosine_Similarity"]

        # Check if the mean of label 1 is greater than the mean of label 0 for the process
        if means_by_process.loc[process, 1] > means_by_process.loc[process, 0]:
            # If so, apply the rule that if the cosine similarity is greater than the mean of label 0, predict 1
            threshold = means_by_process.loc[process, 0]
            test_set.at[index, 'Predicted_Label'] = int(cosine_similarity > threshold)
        else:
            # Otherwise, apply the rule that if the cosine similarity is less than or equal to the mean of label 1, predict 1
            threshold = means_by_process.loc[process, 1]
            test_set.at[index, 'Predicted_Label'] = int(cosine_similarity <= threshold)

    return test_set

In [299]:
classified_test_set = classify_test_set(result_data, means_by_process)
print(classified_test_set[['Process', 'Cosine_Similarity', 'Predicted_Label']])

KeyError: 'GDPR_1'

In [235]:
classified_test_set

Unnamed: 0,Text,Label,Process,Process_description,Cosine_Similarity,Predicted_Label
0,we will have policy and procedure in place to ...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,0.098884,1
1,a service supplier must first get our approval...,0,Travel Insurance Claim,the process for a travel insurance claim invol...,0.000000,0
2,if we be contact by an uninsured person who wi...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,0.418910,1
3,our decision will be make within 4 month of re...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,0.137660,1
4,we must comply with the timeframe in this part...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,0.038904,0
...,...,...,...,...,...,...
1664,a mixture of two or more coffee and/or roast p...,0,SM6_3,this process involve a load switch device whic...,0.000000,0
1665,a combination of espresso steamed milk honey a...,0,SM6_3,this process involve a load switch device whic...,0.000000,0
1666,coffee ground be steep in cold water for about...,0,SM6_3,this process involve a load switch device whic...,0.006339,0
1667,almost all the coffee produce in the world be ...,0,SM6_3,this process involve a load switch device whic...,0.025975,0


In [92]:
true_labels = classified_test_set['Label'].tolist()
predicted_labels = classified_test_set['Predicted_Label'].tolist()

# Calculate metrics using sklearn
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average="weighted")
recall = recall_score(true_labels, predicted_labels, average="weighted")
f1 = f1_score(true_labels, predicted_labels, average="weighted")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Accuracy: 0.6656680647094069
Precision: 0.9092421446155924
Recall: 0.6656680647094069
F1: 0.737748767161893


### Testing this approach on Test set

In [301]:
results_list = []
predicted_labels_dfs= pd.DataFrame()
for emb_type in ["gpt", "fasttext", "word2vec", "glove", "bert", "tfidf"]:
    test_data = load_data(emb_type,"separate", "test")
    result_data = test_data[["Text", "Label", "Process", "Process_description"]].copy()
    columns_to_drop = [
                "Text",
                "Label",
                "Process",
                "Process_description",
                "Combined_Text",
            ]

    embedding_df = test_data.drop(columns=columns_to_drop)

    dim = embedding_dims[emb_type]

    # Slice the DataFrame for process description embeddings
    embedding_proc_desc = embedding_df.iloc[:, :dim]

    # Slice the DataFrame for legal text embeddings
    embedding_legal_text = embedding_df.iloc[:, dim : dim * 2]



    result_data["Cosine_Similarity"] = test_data.apply(
        lambda row: calculate_cosine_similarity(
            row[embedding_proc_desc.columns].values,
            row[embedding_legal_text.columns].values
        ), axis=1
    )
    means_by_process = result_data.groupby(['Process', 'Label'])["Cosine_Similarity"].mean().unstack()
    classified_test_set = classify_test_set(result_data, means_by_process)
    true_labels = classified_test_set['Label'].tolist()
    predicted_labels = classified_test_set['Predicted_Label'].tolist()

    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(
        true_labels, predicted_labels, average="weighted"
    )
    recall = recall_score(true_labels, predicted_labels, average="weighted")
    f1 = f1_score(true_labels, predicted_labels, average="weighted")

    results_list.append(
        {
            "model": "Rule-Based Cosine Similarity",
            "Embedding": emb_type,
            "accuracy_oos": accuracy,
            "precision_oos": precision,
            "recall_oos": recall,
            "f1_oos": f1,
        }
    )

    predicted_label_column_name = f'{emb_type}_Cosine_Similarity_Prediction'

    classified_test_set.rename(columns={'Predicted_Label': predicted_label_column_name}, inplace=True)

    if not predicted_labels_dfs.empty:
        predicted_labels_dfs = pd.concat(
            [predicted_labels_dfs, classified_test_set[[predicted_label_column_name]]], axis=1
        )
    else:
        predicted_labels_dfs = classified_test_set



In [302]:
results = pd.DataFrame(results_list)

In [303]:
results

Unnamed: 0,model,Embedding,accuracy_oos,precision_oos,recall_oos,f1_oos
0,Rule-Based Cosine Similarity,gpt,0.578947,0.882216,0.578947,0.652955
1,Rule-Based Cosine Similarity,fasttext,0.552632,0.86571,0.552632,0.630377
2,Rule-Based Cosine Similarity,word2vec,0.518797,0.861397,0.518797,0.598977
3,Rule-Based Cosine Similarity,glove,0.432331,0.831981,0.432331,0.515129
4,Rule-Based Cosine Similarity,bert,0.571429,0.84898,0.571429,0.648585
5,Rule-Based Cosine Similarity,tfidf,0.635338,0.869582,0.635338,0.702263


In [304]:
predicted_labels_dfs.drop("Cosine_Similarity",axis=1)

Unnamed: 0,Text,Label,Process,Process_description,gpt_Cosine_Similarity_Prediction,fasttext_Cosine_Similarity_Prediction,word2vec_Cosine_Similarity_Prediction,glove_Cosine_Similarity_Prediction,bert_Cosine_Similarity_Prediction,tfidf_Cosine_Similarity_Prediction
0,we encourage you or your representative to tel...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,1,0,1,0
1,you must co - operate at all time in relation ...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,0,1,1,0
2,once we have all relevant information and have...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,1,1,1,1
3,we comply with the principles of the privacy a...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,0,1,1,1,1,0
4,we will tell you about the progress of your cl...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...
261,the active power limit value of the meter must...,0,SM2_2,this process involve the smart meter be turn o...,1,1,1,0,0,1
262,roasting software : with the quest link to a c...,0,SM2_2,this process involve the smart meter be turn o...,0,0,0,0,0,0
263,for a plan push operation the readout plan ( t...,0,SM2_2,this process involve the smart meter be turn o...,1,0,1,0,0,1
264,if there be a time difference between 2 and 9 ...,0,SM2_2,this process involve the smart meter be turn o...,0,0,1,0,0,0


In [307]:
predicted_labels_dfs.to_csv("test_data_cosine_similarity_predictions.csv", index=False)

## 2.Rule-based Approach - Generate mean embeddings centroid and calculate vector distance to new text

In [270]:
def calculate_centroids(embedding_df: pd.DataFrame, process_column: str, label_column: str) -> pd.DataFrame:
    # Calculate centroids for each combination of process and label
    centroids = embedding_df.groupby([process_column, label_column]).mean()

    # Create a new DataFrame with Process as the index
    # and columns for each label's centroid
    unique_processes = embedding_df[process_column].unique()
    centroid_columns = ['centroid_0', 'centroid_1']
    centroids_df = pd.DataFrame(index=unique_processes, columns=centroid_columns)

    # Populate the new DataFrame with centroid values
    for process in unique_processes:
        for label in [0, 1]:
            if (process, label) in centroids.index:
                # Assign the mean centroid value for this process and label
                centroids_df.at[process, f'centroid_{label}'] = centroids.loc[(process, label)].values

    return centroids_df.reset_index().rename(columns={'index': process_column})

In [271]:
def predict_label_based_on_distance(row, centroids_df: pd.DataFrame, process_column: str):
    # Extract the current process's centroids for label 0 and 1
    process_centroids = centroids_df[centroids_df['Process'] == row[process_column]]
    print(process_centroids)

    if process_centroids.empty:
        # If there is no centroid for this process, we cannot predict
        return None

    # Ensure centroids are in the correct numpy array format
    centroid_0 = np.array(process_centroids['centroid_0'].tolist()[0], dtype=float).reshape(1, -1)
    centroid_1 = np.array(process_centroids['centroid_1'].tolist()[0], dtype=float).reshape(1, -1)

    print(centroid_0.shape)
    print(type(centroid_0))
    print(type(centroid_1))

    # The row embedding needs to be converted to a numpy array excluding non-embedding columns
    row_embedding = np.array(row[5:], dtype=float).reshape(1, -1)  # Adjust the index 5 based on where embedding starts  # Adjust the index 5 based on where embedding starts
    print(row_embedding.shape)
    print(type(row_embedding))



    # Calculate distances to both centroids
    distances = cdist(row_embedding, np.vstack([centroid_0, centroid_1]), 'euclidean')

    # Predict the label based on the nearest centroid
    predicted_label = np.argmin(distances)
    return predicted_label


In [272]:
trainings_data = load_data("tfidf","combined", "train")
process_label_cols = ['Process', 'Label']
numeric_embedding_cols = trainings_data.columns[4:]  # Assuming embeddings start from 5th column

# Keep only 'Process', 'Label', and the numeric embedding columns
embedding_df = trainings_data[process_label_cols + numeric_embedding_cols.tolist()]

# Now you can calculate the centroids
centroids_df = calculate_centroids(embedding_df, 'Process', 'Label')

# Apply the function to each row in your test set
trainings_data['Predicted_Label'] = trainings_data.apply(lambda row: predict_label_based_on_distance(row, centroids_df, 'Process'), axis=1)

  centroids = embedding_df.groupby([process_column, label_column]).mean()


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
  Process                                         centroid_0  \
9  GDPR_7  [0.0, 0.0, 0.004961691803292059, 0.0, 0.0, 0.0...   

                                          centroid_1  
9  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008...  
(1, 1000)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 1000)
<class 'numpy.ndarray'>
  Process                                         centroid_0  \
9  GDPR_7  [0.0, 0.0, 0.004961691803292059, 0.0, 0.0, 0.0...   

                                          centroid_1  
9  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008...  
(1, 1000)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 1000)
<class 'numpy.ndarray'>
  Process                                         centroid_0  \
9  GDPR_7  [0.0, 0.0, 0.004961691803292059, 0.0, 0.0, 0.0...   

                                          centroid_1  
9  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008...  
(1, 1000)
<class '

In [273]:
# Apply the function to each row in your test se
trainings_data['Predicted_Label']

0       1
1       1
2       1
3       1
4       1
       ..
1664    0
1665    0
1666    0
1667    0
1668    0
Name: Predicted_Label, Length: 1669, dtype: int64

In [274]:
true_labels = trainings_data['Label'].tolist()
predicted_labels = trainings_data['Predicted_Label'].tolist()

# Calculate metrics using sklearn
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average="weighted")
recall = recall_score(true_labels, predicted_labels, average="weighted")
f1 = f1_score(true_labels, predicted_labels, average="weighted")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Accuracy: 0.8987417615338527
Precision: 0.9452099146163176
Recall: 0.8987417615338527
F1: 0.912971138292915


In [275]:
centroids_df

Unnamed: 0,Process,centroid_0,centroid_1
0,Travel Insurance Claim,"[0.0, 0.0, 0.0010613721220414118, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Know Your Customer,"[0.07171538230483233, 0.07171538230483233, 0.0...","[0.06511524690711619, 0.06511524690711619, 0.0..."
2,Hiring Employee,"[0.0, 0.0, 0.0018379722593074355, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,GDPR_1,"[0.0, 0.0, 0.022648323348814565, 0.04110485750...","[0.0, 0.0, 0.021112675073108454, 0.03994444315..."
4,GDPR_2,"[0.0, 0.0, 0.0005127173455302673, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,GDPR_3,"[0.0, 0.0, 0.1716578585224608, 0.0, 0.09892727...","[0.0, 0.0, 0.19353010799782086, 0.0, 0.0905740..."
6,GDPR_4,"[0.0, 0.0, 0.0014839941695523361, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,GDPR_5,"[0.0, 0.0, 0.005043424730291155, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,GDPR_6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,GDPR_7,"[0.0, 0.0, 0.004961691803292059, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008..."


In [276]:
for emb_type in ["gpt", "fasttext", "word2vec", "glove", "bert", "tfidf"]:
    trainings_data = load_data(emb_type,"combined", "train")
    result_data = trainings_data[["Text", "Label", "Process", "Process_description"]].copy()
    columns_to_drop = [
                "Text",
                "Process_description",
                "Combined_Text",
            ]

    embedding_df = test_data.drop(columns=columns_to_drop)
    centroids_df = calculate_centroids(embedding_df,"Process","Label")
    print(centroids_df)

    #result_data['Predicted_Label'] = result_data.apply(predict_label_based_on_distance, axis=1, centroids_df=centroids_df, process_column='Process')

                  Process                                         centroid_0  \
0  Travel Insurance Claim  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1      Know Your Customer  [0.07826646416822636, 0.07826646416822636, 0.0...   
2         Hiring Employee  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3                  GDPR_2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
4                  GDPR_3  [0.0, 0.0, 0.1892058414183068, 0.0, 0.10964714...   
5                   SM2_1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
6                   SM2_2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   

                                          centroid_1  
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
1  [0.07826646416822634, 0.07826646416822634, 0.0...  
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
4  [0.0, 0.0, 0.1892058414183068, 0.0, 0.10964714...  
5  [0.0, 0.0, 0.0, 0.0, 0.0, 

### Apply to test data

In [277]:
trainings_data = load_data("tfidf","combined", "train")
process_label_cols = ['Process', 'Label']
numeric_embedding_cols = trainings_data.columns[4:]  # Assuming embeddings start from 5th column

# Keep only 'Process', 'Label', and the numeric embedding columns
embedding_df = trainings_data[process_label_cols + numeric_embedding_cols.tolist()]

# Now you can calculate the centroids
centroids_df = calculate_centroids(embedding_df, 'Process', 'Label')

# Apply the function to each row in your test set
trainings_data['Predicted_Label'] = trainings_data.apply(lambda row: predict_label_based_on_distance(row, centroids_df, 'Process'), axis=1)

  centroids = embedding_df.groupby([process_column, label_column]).mean()


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
  Process                                         centroid_0  \
9  GDPR_7  [0.0, 0.0, 0.004961691803292059, 0.0, 0.0, 0.0...   

                                          centroid_1  
9  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008...  
(1, 1000)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 1000)
<class 'numpy.ndarray'>
  Process                                         centroid_0  \
9  GDPR_7  [0.0, 0.0, 0.004961691803292059, 0.0, 0.0, 0.0...   

                                          centroid_1  
9  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008...  
(1, 1000)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 1000)
<class 'numpy.ndarray'>
  Process                                         centroid_0  \
9  GDPR_7  [0.0, 0.0, 0.004961691803292059, 0.0, 0.0, 0.0...   

                                          centroid_1  
9  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008...  
(1, 1000)
<class '

In [292]:
results_list = []
predicted_labels_dfs_cenroid = pd.DataFrame()

for emb_type in ["gpt", "fasttext", "word2vec", "glove", "bert", "tfidf"]:
    test_data = load_data(emb_type,"combined", "test")
    result_predictions = test_data[["Text", "Label", "Process", "Process_description"]].copy()
    columns_to_drop = [
                "Text",
                "Process_description",
                "Combined_Text",
            ]

    embedding_df = test_data.drop(columns=columns_to_drop)
    centroids_df = calculate_centroids(embedding_df,"Process","Label")

    prediction_column_name = f'{emb_type}_Mean_Centroid_Prediction'
    result_predictions[prediction_column_name] = test_data.apply(
        lambda row: predict_label_based_on_distance(row, centroids_df, 'Process'), axis=1
    )

    true_labels = result_predictions['Label'].tolist()
    predicted_labels = result_predictions[prediction_column_name].tolist()

    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="weighted")
    recall = recall_score(true_labels, predicted_labels, average="weighted")
    f1 = f1_score(true_labels, predicted_labels, average="weighted")

    results_list.append({
        "model": "Rule-Based Mean Centroid",
        "Embedding": emb_type,
        "accuracy_oos": accuracy,
        "precision_oos": precision,
        "recall_oos": recall,
        "f1_oos": f1,
    })

    if not predicted_labels_dfs_cenroid.empty:
        # After the first iteration, only add the new prediction column to the existing DataFrame
        predicted_labels_dfs_cenroid = pd.concat(
            [predicted_labels_dfs_cenroid, result_predictions[prediction_column_name]], axis=1
        )
    else:
        # On the first iteration, set the DataFrame with all necessary columns
        predicted_labels_dfs_cenroid = result_predictions






[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
                  Process                                         centroid_0  \
0  Travel Insurance Claim  [-0.15290129, 0.13292967, 0.49699935, -0.14329...   

                                          centroid_1  
0  [-0.1336693, 0.12211438, 0.5491598, -0.1286308...  
(1, 768)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 768)
<class 'numpy.ndarray'>
                  Process                                         centroid_0  \
0  Travel Insurance Claim  [-0.15290129, 0.13292967, 0.49699935, -0.14329...   

                                          centroid_1  
0  [-0.1336693, 0.12211438, 0.5491598, -0.1286308...  
(1, 768)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1, 768)
<class 'numpy.ndarray'>
                  Process                                         centroid_0  \
0  Travel Insurance Claim  [-0.15290129, 0.13292967, 0.49699935, -0.14329...   

                                    

In [294]:
performance_df = pd.DataFrame(results_list)


In [295]:
performance_df

Unnamed: 0,model,Embedding,accuracy_oos,precision_oos,recall_oos,f1_oos
0,Rule-Based Mean Centroid,gpt,0.894737,0.937328,0.894737,0.906955
1,Rule-Based Mean Centroid,fasttext,0.703008,0.878204,0.703008,0.75651
2,Rule-Based Mean Centroid,word2vec,0.706767,0.878706,0.706767,0.759441
3,Rule-Based Mean Centroid,glove,0.823308,0.911701,0.823308,0.850154
4,Rule-Based Mean Centroid,bert,0.827068,0.91714,0.827068,0.853648
5,Rule-Based Mean Centroid,tfidf,0.943609,0.956353,0.943609,0.947305


In [296]:
predicted_labels_dfs_cenroid

Unnamed: 0,Text,Label,Process,Process_description,gpt_Mean_Centroid_Prediction,fasttext_Mean_Centroid_Prediction,word2vec_Mean_Centroid_Prediction,glove_Mean_Centroid_Prediction,bert_Mean_Centroid_Prediction,tfidf_Mean_Centroid_Prediction
0,we encourage you or your representative to tel...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,1,1,1,1
1,you must co - operate at all time in relation ...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,1,1,1,0
2,once we have all relevant information and have...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,1,1,1,1
3,we comply with the principles of the privacy a...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,0,1,1,1,0,1
4,we will tell you about the progress of your cl...,1,Travel Insurance Claim,the process for a travel insurance claim invol...,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
261,the active power limit value of the meter must...,0,SM2_2,this process involve the smart meter be turn o...,0,0,0,0,0,0
262,roasting software : with the quest link to a c...,0,SM2_2,this process involve the smart meter be turn o...,0,0,0,0,0,0
263,for a plan push operation the readout plan ( t...,0,SM2_2,this process involve the smart meter be turn o...,0,0,0,0,0,0
264,if there be a time difference between 2 and 9 ...,0,SM2_2,this process involve the smart meter be turn o...,0,0,0,0,0,0


In [290]:
predicted_labels_dfs_cenroid.to_csv("test_data_mean_cenroid_predictions.csv", index=False)