# Setup

In [None]:
# Install-on-demand bootstrap: try to import each Hugging Face package and,
# only when the import fails, install it with pip and import it again.
# The leading "!" is IPython/Colab shell magic, so this cell only runs inside
# a notebook, not as a plain Python script.
try:
    import transformers
except ModuleNotFoundError:
    # Only a fresh install is pinned to 4.28.0; a transformers version that is
    # already importable is used as-is.
    !pip install transformers==4.28.0
    import transformers

try:
    import datasets
except ModuleNotFoundError:
    !pip install datasets 
    import datasets
    
try:
    import evaluate
except ModuleNotFoundError:
    !pip install evaluate
    import evaluate
        
try:
    import accelerate
except ModuleNotFoundError:
    !pip install accelerate
    import accelerate

from huggingface_hub import notebook_login
from google.colab import drive
import pandas as pd
from datasets import Dataset
from transformers import pipeline
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from IPython.display import clear_output
# Wipe whatever this cell printed so far (e.g. pip install logs) so that only
# the confirmation message remains in the cell output.
clear_output()
print("Setup complete")

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the fine-tuning
# cell sets push_to_hub=True, which needs an authenticated session.
notebook_login()

# Load dataset

In [None]:
# Mount Google Drive into the Colab filesystem; every data file used in this
# notebook is read from the project folder defined below.
drive.mount("/content/gdrive")
project_path = "/content/gdrive/MyDrive/IR"  # the dataset will be loaded from Google Drive

In [None]:
# All three TSV splits share one column layout. Because the names are passed
# explicitly via `names`, pandas treats the first line of each file as data
# rather than as a header.
column_names = [
    'ID', 'LABEL', 'STATEMENT', 'TOPICS', 'SPEAKER', 'ROLE', 'CITY', 'PARTY',
    'H1', 'H2', 'H3', 'H4', 'H5', 'CONTEXT',
]

train = pd.read_table(f"{project_path}/train.tsv", names=column_names)
val = pd.read_table(f"{project_path}/valid.tsv", names=column_names)
test = pd.read_table(f"{project_path}/test.tsv", names=column_names)

In [None]:
# The six original classes, listed from least to most truthful.
labels = ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']


def _to_binary(label):
    """Return 0 for the first three entries of `labels`, 1 for anything else."""
    return 0 if label in labels[:3] else 1


# Collapse the six classes into a binary target, overwriting LABEL in every
# split. Values that are not one of the first three labels (including anything
# unexpected) end up as 1, so this cell must not be run twice on the same
# frames.
for split_frame in (train, test, val):
    split_frame["LABEL"] = split_frame["LABEL"].apply(_to_binary)

In [None]:
def _as_text_dataset(frame):
    """Build a `Dataset` holding only `label` and `text` from one split frame."""
    selected = frame[["LABEL", "STATEMENT"]]
    renamed = selected.rename(columns={"LABEL": "label", "STATEMENT": "text"})
    return Dataset.from_dict(renamed)


train_dataset = _as_text_dataset(train)
valid_dataset = _as_text_dataset(val)
test_dataset = _as_text_dataset(test)

# Bundle the three splits, keyed by split name.
dataset_dict = datasets.DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset,
    "test": test_dataset,
})

# Load BERT

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the same "bert-base-uncased" checkpoint that the
# classification model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the `text` field of `examples` with the notebook's tokenizer.

    The tokenizer is called with `truncation=True`; no padding is requested
    here. Whatever the tokenizer call returns is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of
# examples per call instead of one example at a time.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch when the batch is assembled;
# the tokenization step only truncates and does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() method.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` is unpacked into (model outputs, reference labels). The
    predicted class of each example is the argmax of its outputs along axis 1,
    and the result of `accuracy.compute` is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Class-id <-> class-name mappings that get stored in the model config.
# The two dictionaries have to be exact inverses of each other, so label2id is
# derived from id2label rather than written out by hand: class 0 is the
# negative class and class 1 the positive one.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pre-trained "bert-base-uncased" checkpoint with a two-class
# sequence-classification head; the id/name mappings are passed through to the
# model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id #distilbert-base-uncased
)

# Fine Tuning

In [None]:
training_args = TrainingArguments(
    output_dir="bert-base-uncased-fine-tuned",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
trainer.evaluate()

# Feature Extraction

In [None]:
# Feature extractor built from the fine-tuned checkpoint on the Hub; device=0
# selects the first GPU, so a GPU runtime is required.
# The factory is called as transformers.pipeline because this assignment
# rebinds the imported name `pipeline` to the extractor object; going through
# the module keeps the cell re-runnable after the name has been rebound.
pipeline = transformers.pipeline('feature-extraction', model='MicheleFiori/bert-base-uncased-fine-tuned', device=0) #MicheleFiori/liar_model MicheleFiori/distilbert-base-uncased-fine-tuned

In [None]:
# Embed every training statement: the extractor output is averaged over
# axis 1 (presumably the token axis - TODO confirm), giving one vector per
# statement.
train_vectors = [
    np.array(pipeline(statement)).mean(axis=1)
    for statement in train.STATEMENT
]
# squeeze() removes the size-1 axes left over after stacking the means.
X_train = np.stack(train_vectors).squeeze()
Y_train = np.stack(train.LABEL)

In [None]:
# Embed every test statement exactly like the training statements: average
# the extractor output over axis 1 (presumably the token axis - TODO confirm).
test_vectors = [
    np.array(pipeline(statement)).mean(axis=1)
    for statement in test.STATEMENT
]
# squeeze() removes the size-1 axes left over after stacking the means.
X_test = np.stack(test_vectors).squeeze()
Y_test = np.stack(test.LABEL)

# SVM

In [None]:
# Classifier on the averaged statement embeddings: standardise every feature,
# then fit a support vector machine with an RBF kernel.
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
clf.fit(X_train, Y_train)

In [None]:
# Mean accuracy on the test features.
clf.score(X_test, Y_test)

In [None]:
preds = clf.predict(X_test)

In [None]:
# normalize="true" divides every row by the number of samples of that true
# class, so the diagonal holds the per-class recall.
confusion_matrix(Y_test, preds, normalize="true")

In [None]:
# Class 0 is reported as "fake" and class 1 as "true", matching the binary
# encoding applied to LABEL when the data was loaded.
print(classification_report(Y_test, preds, target_names=["fake","true"]))

# Random Forest

In [None]:
# Random forest of 2000 trees with no depth limit, trained on the same
# features. No random_state is passed, so the fitted model and the scores
# below can differ from run to run.
clf = RandomForestClassifier(max_depth=None, n_estimators=2000)
clf.fit(X_train, Y_train)

In [None]:
# Mean accuracy on the test features.
clf.score(X_test, Y_test)

In [None]:
preds = clf.predict(X_test)

In [None]:
# normalize="true" divides every row by the number of samples of that true
# class, so the diagonal holds the per-class recall.
confusion_matrix(Y_test, preds, normalize="true")

In [None]:
# Class 0 is reported as "fake" and class 1 as "true", matching the binary
# encoding applied to LABEL when the data was loaded.
print(classification_report(Y_test, preds, target_names=["fake","true"]))

# CONTEXT DATA

In [None]:
# Stack the three splits in the order train, val, test, discard the H1-H5
# columns and renumber the rows 0..n-1 so that positions identify rows across
# the combined frame.
unused_columns = ["H1", "H2", "H3", "H4", "H5"]
whole_dataset = (
    pd.concat([train, val, test], axis=0)
    .drop(columns=unused_columns)
    .reset_index(drop=True)
)

In [None]:
# Switches selecting which metadata columns are one-hot encoded by the
# following cells; only the enabled ones add indicator columns to
# whole_dataset.
topic = False
city = True
party = True

# Topic

In [None]:
if topic:
  # Vocabulary: every distinct comma-separated tag occurring in TOPICS.
  # Values go through str() first, so a missing (NaN) entry, if any, turns
  # into the tag "nan".
  distinct_raw_topics = list(set(whole_dataset.TOPICS))
  single_topics = []
  for raw in distinct_raw_topics:
    single_topics.extend(str(raw).split(','))
  topics = list(set(single_topics))

  # One indicator dict per row: 0 for every tag, 1 for the tags of that row.
  topics_dicts = []
  for raw_topics in whole_dataset.TOPICS:
    topics_values = dict.fromkeys(topics, 0)
    for t in str(raw_topics).split(","):
      topics_values[t] = 1
    topics_dicts.append(topics_values)

  # Append the indicator columns to the right of the existing ones.
  whole_dataset = pd.concat([whole_dataset, pd.DataFrame(topics_dicts)], axis=1)

# City

In [None]:
if city:
  # Vocabulary: every distinct comma-separated value occurring in CITY.
  # Values go through str() first, so a missing (NaN) entry, if any, turns
  # into the value "nan".
  distinct_raw_cities = list(set(whole_dataset.CITY))
  single_cities = []
  for raw in distinct_raw_cities:
    single_cities.extend(str(raw).split(','))
  cities = list(set(single_cities))

  # One indicator dict per row: 0 for every value, 1 for the values of that row.
  cities_dicts = []
  for raw_city in whole_dataset.CITY:
    cities_values = dict.fromkeys(cities, 0)
    for c in str(raw_city).split(","):
      cities_values[c] = 1
    cities_dicts.append(cities_values)

  # Append the indicator columns to the right of the existing ones.
  whole_dataset = pd.concat([whole_dataset, pd.DataFrame(cities_dicts)], axis=1)

# Party

In [None]:
if party:
  # Vocabulary: every distinct comma-separated value occurring in PARTY.
  # Values go through str() first, so a missing (NaN) entry, if any, turns
  # into the value "nan".
  parties = list(set(whole_dataset.PARTY))
  parties = [str(p).split(',') for p in parties]
  single_parties = [p for part in parties for p in part]
  parties = list(set(single_parties))

  # One indicator dict per row: 0 for every party, 1 for the parties of that row.
  parties_dicts = []
  for i, row in whole_dataset.iterrows():
    parties_values = {k: 0 for k in parties}
    current_parties = str(row.PARTY).split(",")
    # The key must be the party being visited (`p`); using any other variable
    # here would flag an unrelated column or fail when that variable is unset.
    for p in current_parties:
      parties_values[p] = 1
    parties_dicts.append(parties_values)

  # Append the indicator columns to the right of the existing ones.
  whole_dataset = pd.concat([whole_dataset, pd.DataFrame(parties_dicts)], axis=1)

# USE OF CONTEXT DATA

In [None]:
#statement_embeddings = whole_dataset.STATEMENT.apply(lambda s: np.array(pipeline(s)).mean(axis=1))

In [None]:
import pickle

In [None]:
#with open(f"{project_path}/statement_embeddings.pickle", 'wb') as f:
#    pickle.dump(statement_embeddings, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the cached statement embeddings from Drive instead of recomputing them.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# this must only be pointed at a pickle you created yourself.
# The following cells pair these embeddings with the rows of whole_dataset by
# position - TODO confirm the cache was produced from the current data, in
# the same row order.
with open(f"{project_path}/statement_embeddings.pickle", "rb") as f:
    statement_embeddings = pickle.load(f)

In [None]:
# Drop every original column so that only the indicator columns appended by
# the one-hot cells remain, then turn the frame into a list of per-row lists.
original_columns = ["ID", "LABEL", "STATEMENT", "TOPICS", "SPEAKER", "ROLE", "CITY", "PARTY", "CONTEXT"]
indicator_frame = whole_dataset.drop(columns=original_columns)
context_data = indicator_frame.values.tolist()

In [None]:
# Per-row feature vector: the first row of each stored embedding array,
# followed by that row's context indicators. zip() pairs the two sequences by
# position and stops at the shorter one.
X = [
    list(embedding[0]) + context_row
    for embedding, context_row in zip(statement_embeddings, context_data)
]

In [None]:
# Convert the list of per-row feature lists into a NumPy array so that it can
# be sliced into splits and passed to scikit-learn.
X = np.array(X)

In [None]:
X_train = X[:10251]
X_valid = X[10251:11535]
X_test = X[11535:]

In [None]:
Y_train = whole_dataset.LABEL[:10251]
Y_valid = whole_dataset.LABEL[10251:11535]
Y_test = whole_dataset.LABEL[11535:]

# SVM

In [None]:
# Same standardise + RBF-kernel SVM pipeline as before, now fitted on the
# statement embeddings concatenated with the context indicators.
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
clf.fit(X_train, Y_train)

In [None]:
# Mean accuracy on the test features.
clf.score(X_test, Y_test)

In [None]:
preds = clf.predict(X_test)

In [None]:
# normalize="true" divides every row by the number of samples of that true
# class, so the diagonal holds the per-class recall.
confusion_matrix(Y_test, preds, normalize="true")

In [None]:
# Class 0 is reported as "fake" and class 1 as "true", matching the binary
# encoding applied to LABEL when the data was loaded.
print(classification_report(Y_test, preds, target_names=["fake","true"]))

# Random Forest

In [None]:
# Random forest of 2000 trees with no depth limit, fitted on the statement
# embeddings concatenated with the context indicators. No random_state is
# passed, so the fitted model and the scores below can differ from run to run.
clf = RandomForestClassifier(max_depth=None, n_estimators=2000)
clf.fit(X_train, Y_train)

In [None]:
# Mean accuracy on the test features.
clf.score(X_test, Y_test)

In [None]:
preds = clf.predict(X_test)

In [None]:
# normalize="true" divides every row by the number of samples of that true
# class, so the diagonal holds the per-class recall.
confusion_matrix(Y_test, preds, normalize="true")

In [None]:
# Class 0 is reported as "fake" and class 1 as "true", matching the binary
# encoding applied to LABEL when the data was loaded.
print(classification_report(Y_test, preds, target_names=["fake","true"]))