In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

np.random.seed(42) # For reprodcubility

# Load data

In [32]:
df_train = pd.read_csv('../data/output_train.csv').drop(columns=['B_raw_entities', 'label'])
df_valid = pd.read_csv('../data/output_valid.csv').drop(columns=['B_raw_entities', 'label'])
# Combine train and valid data for training
df_train = pd.concat([df_train, df_valid], axis=0)

df_test = pd.read_csv('../data/output_test.csv').drop(columns=['B_raw_entities','label'])

In [33]:
df_train.head(3)

Unnamed: 0,statement,label_binary,A_raw_entities
0,"90 percent of Americans ""support universal bac...",1,"[{'entity': 'MISC', 'score': 0.99866974, 'inde..."
1,Last year was one of the deadliest years ever ...,0,[]
2,"Bernie Sanders's plan is ""to raise your taxes ...",0,"[{'entity': 'PER', 'score': 0.9983652, 'index'..."


## Extract entity types

In [34]:
# Function to extract entity types
def extract_entities(entity_list_str):
    entity_list = eval(entity_list_str)  # Convert string to list of dicts
    return [entity['entity'] for entity in entity_list]  # Extract entity types

### Extract entities for train & test data

In [35]:
# Apply entity extraction to both datasets
df_train['entities'] = df_train['A_raw_entities'].apply(extract_entities)
df_test['entities'] = df_test['A_raw_entities'].apply(extract_entities)

In [36]:
df_train.head(3)

Unnamed: 0,statement,label_binary,A_raw_entities,entities
0,"90 percent of Americans ""support universal bac...",1,"[{'entity': 'MISC', 'score': 0.99866974, 'inde...",[MISC]
1,Last year was one of the deadliest years ever ...,0,[],[]
2,"Bernie Sanders's plan is ""to raise your taxes ...",0,"[{'entity': 'PER', 'score': 0.9983652, 'index'...","[PER, PER]"


## Encoding entity types as features

In [37]:
entity_dummies_train = df_train['entities'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum()
entity_dummies_test = df_test['entities'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum()

In [38]:
entity_dummies_train.head(3)

Unnamed: 0,LOC,MISC,ORG,PER
0,0,1,0,2
1,1,1,1,3
2,0,0,0,4


In [39]:
# Ensure test has same columns as train
entity_dummies_test = entity_dummies_test.reindex(columns=entity_dummies_train.columns, fill_value=0)

# Merge with main dataset
df_train_final = df_train[['label_binary']].join(entity_dummies_train)
df_test_final = df_test[['label_binary']].join(entity_dummies_test)

In [40]:
df_train_final.head(3)

Unnamed: 0,label_binary,LOC,MISC,ORG,PER
0,1,0.0,1.0,0.0,2.0
1,0,1.0,1.0,1.0,3.0
2,0,0.0,0.0,0.0,4.0


In [41]:
df_test_final.head(3)

Unnamed: 0,label_binary,LOC,MISC,ORG,PER
0,0,0.0,4.0,0.0,0.0
1,0,0.0,0.0,0.0,2.0
2,0,0.0,1.0,0.0,10.0


# Model training

In [42]:
# Fill null values
df_train_final = df_train_final.fillna(0)
df_test_final = df_test_final.fillna(0)

# Define features and target
X_train, y_train = df_train_final.drop(columns=['label_binary']), df_train_final['label_binary']
X_test, y_test = df_test_final.drop(columns=['label_binary']), df_test_final['label_binary']

In [43]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(), # SVM
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=10),
    "Decision Tree": DecisionTreeClassifier(),
}
results = {}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True) 

    # Store evaluation metrics
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-Score": report["weighted avg"]["f1-score"],
    }

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Random baseline

In [44]:
# Random Baseline
p_0 = (y_train == 0).mean() # Probability of class 0 in data
p_1 = (y_train == 1).mean() # Probability of class 1 in data

# Generate random predictions
y_pred_random = np.random.choice([0, 1], size=len(y_test), p=[p_0, p_1])

# Evaluate random baseline
accuracy_rand = accuracy_score(y_test, y_pred_random)
precision_rand = precision_score(y_test, y_pred_random, zero_division=0)
recall_rand = recall_score(y_test, y_pred_random)
f1_rand = f1_score(y_test, y_pred_random)

# Store baseline results
results["Random Baseline"] = {
    "Accuracy": accuracy_rand,
    "Precision": precision_rand,
    "Recall": recall_rand,
    "F1-Score": f1_rand,
}

## Results

In [45]:
# Convert results to DataFrame and display
df_results = pd.DataFrame(results).T
df_results

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
Logistic Regression,0.57622,0.332029,0.57622,0.421298
SVM,0.566638,0.552086,0.566638,0.5507
KNN (k=5),0.546603,0.50473,0.546603,0.489115
Decision Tree,0.563589,0.554701,0.563589,0.556363
Random Baseline,0.533101,0.448916,0.447071,0.447992
