#### **Create and train model**

In [4]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Read the CSV at `file_path` and discard every row that contains a missing value.
file_path = '/content/drive/MyDrive/notebooks/new_dataset.csv'
df = pd.read_csv(file_path).dropna()

In [5]:
# Overwrite the 'label' column with the codes produced by a fitted LabelEncoder.
label_encoder = preprocessing.LabelEncoder()
raw_labels = df['label'].tolist()
df['label'] = label_encoder.fit_transform(raw_labels)

# First split: 70% of the rows for training, the rest held out.
train_df, holdout_df = train_test_split(df, random_state=0, shuffle=True, train_size=0.7)
# Second split: the held-out rows are halved into validation and test.
val_df, test_df = train_test_split(holdout_df, random_state=0, shuffle=True, train_size=0.5)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
import tensorflow as tf


# Tokenizer loaded from the TinyBERT checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")

# Wrap each pandas split in a `datasets.Dataset`, keeping the train/val/test order.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

def tokenize_data(examples):
    """Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Truncation is enabled and padding uses the 'max_length' strategy with
    max_length=512. Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=512, padding='max_length', truncation=True)

# Apply `tokenize_data` in batched mode to every split, in train/val/test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_data, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [8]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Sequence-classification model with two labels, loaded from the TinyBERT checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(
    "huawei-noah/TinyBERT_General_4L_312D",
    num_labels=2,
)

# Collator built from the tokenizer and handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training hyperparameters; evaluation and logging are both configured per epoch.
training_options = dict(
    output_dir="/content/drive/MyDrive/results/tinybert2",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
)
training_args = TrainingArguments(**training_options)

# Train on the tokenized training split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

trainer.train()

# Save the trained model to Drive; a later cell reloads it from this path.
trainer.save_model('/content/drive/MyDrive/tinybert_second')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0281,0.010804
2,0.0045,0.005402
3,0.003,0.006454
4,0.0015,0.004894
5,0.0008,0.004558


#### **Evaluate on test set from the same dataset**

In [9]:
from sklearn.metrics import classification_report
import numpy as np

# Score the tokenized test split; the predicted class is the argmax along axis 1
# of the returned prediction array.
predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=1)

# Reference labels are read from the same tokenized test split.
labels = tokenized_test['label']

# The header names the split actually being scored (the test split).
# NOTE(review): target_names assumes encoded label 0 is "true" and label 1 is
# "Fake" — confirm against label_encoder.classes_.
print("Test Classification Report:")
print(classification_report(labels, preds, target_names=["true", "Fake"]))

Validation Classification Report:
              precision    recall  f1-score   support

        true       1.00      1.00      1.00      3168
        Fake       1.00      1.00      1.00      3567

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735



#### **Evaluate on a different dataset** (the one used for training all other models)

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import numpy as np


# Load the second dataset and keep only the two columns used below.
fakenews_file_path = '/content/drive/MyDrive/notebooks/fakenews.csv'
fakenews_df = pd.read_csv(fakenews_file_path)
fakenews_df = fakenews_df[['text', 'label']]
# Fill missing values only in the text column. Filling the whole frame with ''
# would turn a missing label into an empty string and make the label column
# non-numeric, so rows without a label are dropped and labels are cast to int.
# The index is reset so it stays contiguous after any rows are dropped.
fakenews_df = (
    fakenews_df
    .dropna(subset=['label'])
    .fillna({'text': ''})
    .astype({'label': int})
    .reset_index(drop=True)
)


# Tokenizer loaded from the TinyBERT checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")

def tokenize_data(examples):
    """Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Truncation is enabled and padding uses the 'max_length' strategy with
    max_length=512. Returns whatever the tokenizer call returns.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, max_length=512, padding='max_length', truncation=True)

# Convert the frame to a `datasets.Dataset` and tokenize it in batched mode.
fakenews_dataset = Dataset.from_pandas(fakenews_df)
tokenized_fakenews = fakenews_dataset.map(tokenize_data, batched=True)

# Reload the model saved earlier and wrap it in a Trainer so `predict` can be used.
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/tinybert_second')
trainer = Trainer(model=model, tokenizer=tokenizer)

# Predicted class = argmax along axis 1 of the returned prediction array.
predictions = trainer.predict(tokenized_fakenews)
preds = predictions.predictions.argmax(axis=1)

# Accuracy = matching predictions divided by the number of predictions.
true_labels = fakenews_df['label'].values
total_predictions = len(preds)
num_correct_predictions = (preds == true_labels).sum()
accuracy = num_correct_predictions / total_predictions

In [5]:
from sklearn.metrics import classification_report
# Display names passed to the report as `target_names`, in the order given.
class_names = ['True News', 'Fake News']
report = classification_report(true_labels, preds, target_names=class_names)
print(report)

              precision    recall  f1-score   support

   True News       0.62      0.95      0.75     10387
   Fake News       0.89      0.41      0.56     10413

    accuracy                           0.68     20800
   macro avg       0.76      0.68      0.66     20800
weighted avg       0.76      0.68      0.66     20800



#### Saving news articles the model predicted incorrectly for further analysis

In [7]:
# Select the rows where the prediction disagrees with the true label.
# NOTE(review): as written, "false positive" means predicted 0 while the label
# is 1, and "false negative" means predicted 1 while the label is 0. If class 1
# is meant to be the positive class, these two names are swapped — confirm.
false_positives = fakenews_df[(preds == 0) & (true_labels == 1)]
false_negatives = fakenews_df[(preds == 1) & (true_labels == 0)]


# Keep only the first ten texts of each kind for manual inspection.
false_positives_examples = false_positives['text'].head(10).tolist()
false_negatives_examples = false_negatives['text'].head(10).tolist()

# Write with an explicit UTF-8 encoding so non-ASCII characters in the article
# text cannot fail under a non-UTF-8 default locale. Examples are separated by
# five newlines.
with open('/content/drive/MyDrive/notebooks/false_positives.txt', 'w', encoding='utf-8') as f:
    f.write('\n\n\n\n\n'.join(false_positives_examples))

with open('/content/drive/MyDrive/notebooks/false_negatives.txt', 'w', encoding='utf-8') as f:
    f.write('\n\n\n\n\n'.join(false_negatives_examples))