In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

np.random.seed(42) # For reprodcubility

In [2]:
""" df = pd.read_csv("output_train.csv")
df_spacy = df.drop(columns=["A_raw_entities"]).copy()
df_spacy_sample = df_spacy.head(5) """

' df = pd.read_csv("output_train.csv")\ndf_spacy = df.drop(columns=["A_raw_entities"]).copy()\ndf_spacy_sample = df_spacy.head(5) '

In [3]:
""" df_spacy_sample """

' df_spacy_sample '

In [4]:
""" df_spacy_sample["B_raw_entities"][0] """

' df_spacy_sample["B_raw_entities"][0] '

In [5]:
# Load datasets (SpaCy only)
df_train = pd.read_csv("output_train.csv").drop(columns=['A_raw_entities']).copy()
df_test = pd.read_csv("output_test.csv").drop(columns=['A_raw_entities']).copy()

In [6]:
# Function to extract entity types
def extract_entities(entity_list_str):
    entity_list = eval(entity_list_str)  # Convert string to list of dicts
    return [entity['entity'] for entity in entity_list]  # Extract entity types

In [7]:
# Apply entity extraction to both datasets
df_train['entities'] = df_train['B_raw_entities'].apply(extract_entities)
df_test['entities'] = df_test['B_raw_entities'].apply(extract_entities)

# One-hot encode entity types
entity_dummies_train = df_train['entities'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum()
entity_dummies_test = df_test['entities'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum()

# Ensure test has same columns as train
entity_dummies_test = entity_dummies_test.reindex(columns=entity_dummies_train.columns, fill_value=0)

# Merge with main dataset
df_train_final = df_train[['label_binary']].join(entity_dummies_train)
df_test_final = df_test[['label_binary']].join(entity_dummies_test)

# Fill NaNs (just in case)
df_train_final = df_train_final.fillna(0)
df_test_final = df_test_final.fillna(0)

In [8]:
df_train_final.head(3)

Unnamed: 0,label_binary,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [9]:
df_test_final.head(3)

Unnamed: 0,label_binary,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


In [10]:
# Define features and target
X_train, y_train = df_train_final.drop(columns=['label_binary']), df_train_final['label_binary']
X_test, y_test = df_test_final.drop(columns=['label_binary']), df_test_final['label_binary']


In [11]:
# Train and evaluate multiple models
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(), # SVM
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=10),
    "Decision Tree": DecisionTreeClassifier(),
}

results = {} #  # Store model evaluation metrics

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True) # Classification report

    # Store evaluation metrics
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-Score": report["weighted avg"]["f1-score"],
    }

# Random Baseline
p_0 = (y_train == 0).mean() # Probability of class 0 in data
p_1 = (y_train == 1).mean() # Probability of class 1 in data

# Generate random predictions based on these probabilities
y_pred_random = np.random.choice([0, 1], size=len(y_test), p=[p_0, p_1])

# # Evaluate random baseline
accuracy_rand = accuracy_score(y_test, y_pred_random)
precision_rand = precision_score(y_test, y_pred_random, zero_division=0)
recall_rand = recall_score(y_test, y_pred_random)
f1_rand = f1_score(y_test, y_pred_random)

# Store baseline results
results["Random Baseline"] = {
    "Accuracy": accuracy_rand,
    "Precision": precision_rand,
    "Recall": recall_rand,
    "F1-Score": f1_rand,
}

# Convert results to DataFrame and display
df_results = pd.DataFrame(results).T
df_results

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
Logistic Regression,0.617596,0.612027,0.617596,0.584108
SVM,0.619774,0.612205,0.619774,0.595651
KNN (k=5),0.591463,0.580238,0.591463,0.577625
Decision Tree,0.608885,0.598689,0.608885,0.586382
Random Baseline,0.514373,0.428571,0.437821,0.433147
