# Data Ingestion - experiment 1

In [None]:
import pandas as pd

# Location of the PolitiFact CSV; kept in one constant so it is easy to change.
DATA_PATH = '/content/drive/MyDrive/NLP project dataset/DATASET-3_politifact.csv'  # Replace with your file path

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,statement,source,link,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,/web/20180705082623/https://www.politifact.com...,Pants on Fire!
1,"Says Ronald Reagan said immigrants ""brought wi...",Becoming American Initiative,/web/20180705082623/https://www.politifact.com...,Mostly True
2,"Says Democratic Senators ""demand Supreme Court...",Viral image,/web/20180705082623/https://www.politifact.com...,Pants on Fire!
3,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,/web/20180705082623/https://www.politifact.com...,Pants on Fire!
4,"""There are a lot of private charters. And in f...",Kelda Helen Roys,/web/20180705082623/https://www.politifact.com...,Half-True


In [None]:
# The 'link' column is not used by any later cell, so remove it.
# Reassigning (rather than inplace=True) with errors='ignore' makes this cell
# safe to re-run: a second execution no longer raises KeyError for 'link'.
data = data.drop(columns=['link'], errors='ignore')
data.head()

Unnamed: 0,statement,source,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,Pants on Fire!
1,"Says Ronald Reagan said immigrants ""brought wi...",Becoming American Initiative,Mostly True
2,"Says Democratic Senators ""demand Supreme Court...",Viral image,Pants on Fire!
3,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,Pants on Fire!
4,"""There are a lot of private charters. And in f...",Kelda Helen Roys,Half-True


In [None]:
pip install ftfy


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


# Data pre-processing - experiment 1

In [None]:
import pandas as pd
import re
import ftfy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from textblob import TextBlob

# NLTK corpora needed by the preprocessing function: stop-word list and WordNet.
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

# spaCy pipeline used for entity tagging and lemmatisation during preprocessing.
nlp = spacy.load('en_core_web_sm')

# Built once instead of on every call: re-reading the NLTK stop-word list and
# constructing a lemmatizer per row is wasted work when the function is applied
# to a whole column. The negations "not", "no" and "never" are excluded from
# the stop list so that they survive into the cleaned text.
_STOP_WORDS = frozenset(stopwords.words("english")) - {"not", "no", "never"}
_LEMMATIZER = WordNetLemmatizer()


def preprocess_statement(statement):
    """Turn one raw statement into a cleaned, space-separated token string.

    Steps, in order:
      1. Pass the text through ``ftfy.fix_text``.
      2. If the text contains double-quoted spans, keep only those spans
         (joined by spaces); otherwise keep the whole text.
      3. Strip a leading "says " (case-insensitive) and lower-case the text.
      4. Remove every character that is neither a word character nor
         whitespace, then remove all digit runs.
      5. Drop NLTK English stop words (negations are kept).
      6. Lemmatize each word with WordNet.
      7. Run the module-level spaCy pipeline ``nlp``: tokens that carry an
         entity type keep their text, all others are replaced by their lemma.

    Parameters
    ----------
    statement : str
        Raw statement text. Any non-string value (for example a NaN from an
        empty CSV cell) yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned statement.
    """
    # Guard against missing values so a single bad row cannot abort .apply().
    if not isinstance(statement, str):
        return ''

    statement = ftfy.fix_text(statement)

    # Non-greedy match so each quoted span is captured separately.
    quoted_text = re.findall(r'"(.*?)"', statement)
    if quoted_text:
        statement = ' '.join(quoted_text)

    statement = re.sub(r'^says\s+', '', statement, flags=re.IGNORECASE)
    statement = statement.lower()

    statement = re.sub(r'[^\w\s]', '', statement)
    statement = re.sub(r'\d+', '', statement)

    kept_words = [word for word in statement.split() if word not in _STOP_WORDS]
    statement = ' '.join(_LEMMATIZER.lemmatize(word) for word in kept_words)

    doc = nlp(statement)
    statement = ' '.join(
        token.text if token.ent_type_ else token.lemma_
        for token in doc
    )

    # The original also computed a TextBlob sentiment polarity here and then
    # discarded it; that dead computation has been removed.
    return statement



# Clean every raw statement and store the result next to the original text.
data['cleaned_statement'] = data['statement'].map(preprocess_statement)

# Raw vs. cleaned text, side by side.
print(data.loc[:, ['statement', 'cleaned_statement']])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                               statement  \
0      Sen. Kamala Harris is "supporting the animals ...   
1      Says Ronald Reagan said immigrants "brought wi...   
2      Says Democratic Senators "demand Supreme Court...   
3      "Tim Kaine doesn’t want a border at all. He wa...   
4      "There are a lot of private charters. And in f...   
...                                                  ...   
14204  "For every single scientist that tells you (gl...   
14205  "New Mexico was 46th in teacher pay (when he w...   
14206  "We now have the greatest income inequality si...   
14207  "Tommy Thompson is the father of welfare refor...   
14208  "Tommy Thompson created the first school choic...   

                                       cleaned_statement  
0                                       support animal m  
1      brought courage value family work freedom let ...  
2      demand supreme court nominee not unduly influe...  
3      tim kaine do not want border want ge

In [None]:
# Preview the first rows of the frame, which now includes cleaned_statement.
data.head()

Unnamed: 0,statement,source,veracity,cleaned_statement
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,Pants on Fire!,support animal m
1,"Says Ronald Reagan said immigrants ""brought wi...",Becoming American Initiative,Mostly True,brought courage value family work freedom let ...
2,"Says Democratic Senators ""demand Supreme Court...",Viral image,Pants on Fire!,demand supreme court nominee not unduly influe...
3,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,Pants on Fire!,tim kaine do not want border want get rid immi...
4,"""There are a lot of private charters. And in f...",Kelda Helen Roys,Half-True,lot private charter fact tony ever seek receiv...


In [None]:
# Spot-check the cleaned text of the row labelled 2.
data.loc[2, 'cleaned_statement']

'demand supreme court nominee not unduly influence u constitution'

# Tokenization - experiment 1

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenizer resources required by word_tokenize.
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)


def tokenize_text(text):
    """Return the list of word tokens that NLTK's word_tokenize finds in ``text``."""
    return word_tokenize(text)


# One token list per cleaned statement.
data['tokenized_statement'] = data['cleaned_statement'].map(tokenize_text)

print(data.loc[:, ['cleaned_statement', 'tokenized_statement']].head())
data.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                   cleaned_statement  \
0                                   support animal m   
1  brought courage value family work freedom let ...   
2  demand supreme court nominee not unduly influe...   
3  tim kaine do not want border want get rid immi...   
4  lot private charter fact tony ever seek receiv...   

                                 tokenized_statement  
0                               [support, animal, m]  
1  [brought, courage, value, family, work, freedo...  
2  [demand, supreme, court, nominee, not, unduly,...  
3  [tim, kaine, do, not, want, border, want, get,...  
4  [lot, private, charter, fact, tony, ever, seek...  


Unnamed: 0,statement,source,veracity,cleaned_statement,tokenized_statement
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,Pants on Fire!,support animal m,"[support, animal, m]"
1,"Says Ronald Reagan said immigrants ""brought wi...",Becoming American Initiative,Mostly True,brought courage value family work freedom let ...,"[brought, courage, value, family, work, freedo..."
2,"Says Democratic Senators ""demand Supreme Court...",Viral image,Pants on Fire!,demand supreme court nominee not unduly influe...,"[demand, supreme, court, nominee, not, unduly,..."
3,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,Pants on Fire!,tim kaine do not want border want get rid immi...,"[tim, kaine, do, not, want, border, want, get,..."
4,"""There are a lot of private charters. And in f...",Kelda Helen Roys,Half-True,lot private charter fact tony ever seek receiv...,"[lot, private, charter, fact, tony, ever, seek..."


# Feature engineering - experiment 1

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer

# TfidfVectorizer works on strings, so join each token list back together.
data['text_for_tfidf'] = data['tokenized_statement'].apply(lambda tokens: ' '.join(tokens))

# preprocess_statement deliberately keeps "not", "no" and "never", but
# stop_words='english' would remove them again because scikit-learn's built-in
# list contains all three. Use the built-in list minus those negations so the
# two stages agree. A sorted list is passed because stop_words expects a list.
NEGATION_WORDS = {"not", "no", "never"}
tfidf_stop_words = sorted(ENGLISH_STOP_WORDS - NEGATION_WORDS)

tfidf_vectorizer = TfidfVectorizer(max_features=3000, stop_words=tfidf_stop_words)

tfidf_matrix = tfidf_vectorizer.fit_transform(data['text_for_tfidf'])

# Dense copy with one named column per vocabulary term (used by later cells).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(tfidf_df.head())
data.head()

   abbott  ability  able  ablebodie  abolish  abort  abortion  abraham  \
0     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
1     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
2     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
3     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
4     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   

   absentee  absolutely  ...  yearold  yemen  yes  york  yorker  young  youth  \
0       0.0         0.0  ...      0.0    0.0  0.0   0.0     0.0    0.0    0.0   
1       0.0         0.0  ...      0.0    0.0  0.0   0.0     0.0    0.0    0.0   
2       0.0         0.0  ...      0.0    0.0  0.0   0.0     0.0    0.0    0.0   
3       0.0         0.0  ...      0.0    0.0  0.0   0.0     0.0    0.0    0.0   
4       0.0         0.0  ...      0.0    0.0  0.0   0.0     0.0    0.0    0.0   

   zero  zika  zone  
0   0.0   0.0   0.0  
1   0.0   0.0   0.0  
2 

Unnamed: 0,statement,source,veracity,cleaned_statement,tokenized_statement,text_for_tfidf
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,Pants on Fire!,support animal m,"[support, animal, m]",support animal m
1,"Says Ronald Reagan said immigrants ""brought wi...",Becoming American Initiative,Mostly True,brought courage value family work freedom let ...,"[brought, courage, value, family, work, freedo...",brought courage value family work freedom let ...
2,"Says Democratic Senators ""demand Supreme Court...",Viral image,Pants on Fire!,demand supreme court nominee not unduly influe...,"[demand, supreme, court, nominee, not, unduly,...",demand supreme court nominee not unduly influe...
3,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,Pants on Fire!,tim kaine do not want border want get rid immi...,"[tim, kaine, do, not, want, border, want, get,...",tim kaine do not want border want get rid immi...
4,"""There are a lot of private charters. And in f...",Kelda Helen Roys,Half-True,lot private charter fact tony ever seek receiv...,"[lot, private, charter, fact, tony, ever, seek...",lot private charter fact tony ever seek receiv...


In [None]:
# Sanity check: (number of rows, number of columns) of the working frame.
data.shape

(14209, 6)

In [None]:
# Collapse the fine-grained ratings in the veracity column into three coarse
# classes: 'False', 'Half-true' and 'True'.
# NOTE(review): 'Full Flop', 'Half Flip' and 'No Flip' look like
# position-consistency ratings rather than truth ratings - confirm that mapping
# them onto truth classes is intended.
replacement_dict = dict([
    ('Pants on Fire!', 'False'),
    ('Mostly True', 'True'),
    ('Half-True', 'Half-true'),
    ('False', 'False'),
    ('True', 'True'),
    ('Mostly False', 'False'),
    ('Full Flop', 'False'),
    ('Half Flip', 'Half-true'),
    ('No Flip', 'Half-true'),
])

data['veracity'] = data['veracity'].replace(to_replace=replacement_dict)

# Remaining distinct labels after the collapse.
data['veracity'].unique()

array(['False', 'True', 'Half-true'], dtype=object)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping, then write the integer codes to a new column.
label_encoder = LabelEncoder()
label_encoder.fit(data['veracity'])

data['veracity_encoded'] = label_encoder.transform(data['veracity'])

print(data)


                                               statement  \
0      Sen. Kamala Harris is "supporting the animals ...   
1      Says Ronald Reagan said immigrants "brought wi...   
2      Says Democratic Senators "demand Supreme Court...   
3      "Tim Kaine doesn’t want a border at all. He wa...   
4      "There are a lot of private charters. And in f...   
...                                                  ...   
14204  "For every single scientist that tells you (gl...   
14205  "New Mexico was 46th in teacher pay (when he w...   
14206  "We now have the greatest income inequality si...   
14207  "Tommy Thompson is the father of welfare refor...   
14208  "Tommy Thompson created the first school choic...   

                             source   veracity  \
0                      Donald Trump      False   
1      Becoming American Initiative       True   
2                       Viral image      False   
3                    Corey  Stewart      False   
4                  Kelda Hele

# Basic logistic regression model

In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split the text BEFORE vectorising. The earlier TF-IDF cell fits its
# vocabulary and IDF weights on every row, so training on tfidf_df leaks
# test-set statistics into the features. Same test_size/random_state as before,
# so the rows that land in each split are unchanged.
X = data['text_for_tfidf']
y = data['veracity_encoded']

text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# clone() returns an unfitted vectorizer with the same hyper-parameters as
# tfidf_vectorizer; it is fitted on the training text only.
log_reg_vectorizer = clone(tfidf_vectorizer)
X_train = log_reg_vectorizer.fit_transform(text_train)
X_test = log_reg_vectorizer.transform(text_test)

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.47783251231527096
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.71      0.60      1289
           1       0.24      0.05      0.09       586
           2       0.44      0.43      0.43       967

    accuracy                           0.48      2842
   macro avg       0.40      0.40      0.37      2842
weighted avg       0.43      0.48      0.44      2842

   abbott  ability  able  ablebodie  abolish  abort  abortion  abraham  \
0     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
1     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
2     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
3     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
4     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   

   absentee  absolutely  ...  yearold  yemen  yes  york  yorker  young  youth  \
0       0.0         0.0  ...      0.0   

# XGBoost model - experiment 1

In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Split the text BEFORE vectorising so the TF-IDF vocabulary and IDF weights
# are learned from the training rows only (the earlier tfidf_df was fitted on
# every row). Same test_size/random_state as before, so the split is unchanged.
X = data['text_for_tfidf']
y = data['veracity_encoded']

text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Unfitted copy of the earlier vectorizer (same hyper-parameters).
xgb_vectorizer = clone(tfidf_vectorizer)
# Densified so that zero weights remain real zeros, as with the original dense
# input; XGBoost treats entries absent from a sparse matrix as missing values.
X_train = xgb_vectorizer.fit_transform(text_train).toarray()
X_test = xgb_vectorizer.transform(text_test).toarray()

# use_label_encoder was dropped: the installed XGBoost ignores it and only
# emits a "Parameters ... are not used" warning.
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.48346235045742436
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.79      0.62      1289
           1       0.33      0.04      0.07       586
           2       0.44      0.34      0.39       967

    accuracy                           0.48      2842
   macro avg       0.43      0.39      0.36      2842
weighted avg       0.45      0.48      0.42      2842

   abbott  ability  able  ablebodie  abolish  abort  abortion  abraham  \
0     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
1     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
2     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
3     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   
4     0.0      0.0   0.0        0.0      0.0    0.0       0.0      0.0   

   absentee  absolutely  ...  yearold  yemen  yes  york  yorker  young  youth  \
0       0.0         0.0  ...      0.0   

# BERT model (FinBERT) - experiment 1

In [None]:
from transformers import pipeline

# Ready-made text-classification pipeline around the FinBERT checkpoint.
# NOTE(review): `pipe` is not referenced again in the visible cells - confirm
# it is still needed.
pipe = pipeline(task="text-classification", model="ProsusAI/finbert")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model for the FinBERT checkpoint.
# NOTE(review): both names are assigned again in later cells (`model` together
# with the optimizer, `tokenizer` together with the dataset class), so this
# load appears to be redundant.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [None]:
from torch.nn import CrossEntropyLoss
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification

num_labels = data['veracity_encoded'].nunique()  # Number of target classes
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", num_labels=num_labels)

# eps and weight_decay are set explicitly to the defaults of the deprecated
# transformers.AdamW so the optimisation behaves as before; torch.optim.AdamW
# would otherwise use eps=1e-8 and weight_decay=0.01.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Not used by the visible training loop (which reads outputs.loss from the
# model); kept so the name stays defined for any other cell that relies on it.
loss_fn = CrossEntropyLoss()




In [None]:
from sklearn.model_selection import train_test_split

# First cut: 70% training, 30% held out.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data['cleaned_statement'].tolist(),
    data['veracity_encoded'].tolist(),
    random_state=42,
    test_size=0.3,
)

# Second cut: the held-out part is halved into validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    random_state=42,
    test_size=0.5,
)


In [None]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

class VerbalLieDataset(Dataset):
    """Dataset that tokenizes one statement per item and pairs it with its label.

    Parameters
    ----------
    texts : sequence
        Statements, indexable by position.
    labels : sequence
        Integer class ids, aligned with ``texts``.
    tokenizer : callable
        Called as ``tokenizer(text, max_length=..., padding="max_length",
        truncation=True, return_tensors="pt")``; the result must expose
        ``'input_ids'`` and ``'attention_mask'`` tensors.
    max_len : int
        Length every encoded statement is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of statements in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return input_ids, attention_mask and labels tensors for one statement."""
        statement, target = self.texts[index], self.labels[index]

        encoded = self.tokenizer(
            statement,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop the leading dimension the tokenizer adds for a single text.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Tokenizer belonging to the FinBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

MAX_LEN = 128     # tokens per statement after padding/truncation
BATCH_SIZE = 16   # statements per batch

# 70% training; the remaining 30% is halved into validation and test.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data['cleaned_statement'],
    data['veracity_encoded'],
    random_state=42,
    test_size=0.3,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    random_state=42,
    test_size=0.5,
)

# One dataset per split, built from plain Python lists.
train_dataset, val_dataset, test_dataset = [
    VerbalLieDataset(split_texts.tolist(), split_labels.tolist(), tokenizer, max_len=MAX_LEN)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
]

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = [
    DataLoader(split_dataset, batch_size=BATCH_SIZE, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
]

print("Data Loaders Created Successfully")


Data Loaders Created Successfully


In [None]:
from tqdm import tqdm

# Fine-tune `model` for EPOCHS passes over train_loader, reporting the mean
# training loss, mean validation loss and validation accuracy after each epoch.
# Relies on `model`, `optimizer`, `train_loader` and `val_loader` from earlier
# cells, and on accuracy_score imported in an earlier cell.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

EPOCHS = 3
for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return its own loss in outputs.loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Clear old gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} Training Loss: {train_loss / len(train_loader)}")

    # ---- validation pass (no gradient tracking, no weight updates) ----
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Predicted class = index of the largest logit in each row.
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            true = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(true)

    val_accuracy = accuracy_score(true_labels, predictions)
    print(f"Epoch {epoch+1} Validation Loss: {val_loss / len(val_loader)}")
    print(f"Epoch {epoch+1} Validation Accuracy: {val_accuracy}")


Epoch 1: 100%|██████████| 622/622 [03:20<00:00,  3.11it/s, loss=1.14]


Epoch 1 Training Loss: 1.0347525547363368
Epoch 1 Validation Loss: 1.0288427734552925
Epoch 1 Validation Accuracy: 0.4941342092914125


Epoch 2: 100%|██████████| 622/622 [03:23<00:00,  3.06it/s, loss=0.943]


Epoch 2 Training Loss: 0.9709519856995709
Epoch 2 Validation Loss: 1.0458410551298911
Epoch 2 Validation Accuracy: 0.48146410136086343


Epoch 3: 100%|██████████| 622/622 [03:23<00:00,  3.06it/s, loss=0.788]


Epoch 3 Training Loss: 0.8297646692998923
Epoch 3 Validation Loss: 1.1102100383879534
Epoch 3 Validation Accuracy: 0.4936649460347255


# Model Evaluation (BERT model) - experiment 1

In [None]:
def evaluate(model, dataloader):
    """Score ``model`` on every batch of ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the module-level ``device``; each batch must provide
    'input_ids', 'attention_mask' and 'labels'.

    Returns
    -------
    tuple
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` over all collected predictions.
    """
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, attention_mask=mask).logits

            # Predicted class = index of the largest logit in each row.
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    return (
        accuracy_score(all_targets, all_preds),
        classification_report(all_targets, all_preds),
    )

# Test evaluation: score the model on the test_loader batches and print the
# accuracy and per-class classification report.
test_accuracy, test_report = evaluate(model, test_loader)
print("Test Accuracy:", test_accuracy)
print("Test Classification Report:\n", test_report)


Test Accuracy: 0.48686679174484054
Test Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.68      0.59       949
           1       0.28      0.11      0.15       442
           2       0.47      0.47      0.47       741

    accuracy                           0.49      2132
   macro avg       0.43      0.42      0.41      2132
weighted avg       0.46      0.49      0.46      2132

