In [None]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"
print("Device name:", device_label)


CUDA available: True
Device name: Tesla T4


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
# The model is not moved here: it does not exist yet at this point in the
# notebook, so `model.to(device)` would raise NameError on a fresh kernel.
# The move happens in the cell that loads the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
!pip install transformers
!pip install torch
!pip install scikit-learn




In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification


In [None]:
# Load the labelled social-media posts (replace with your own file path).
df = pd.read_csv("/content/social media.csv")

# The columns used below are `content` (the post text) and `is_rumor` (the label).
print(df.head())

# Drop rows that lack either the text or the label: the text is fed to the
# tokenizer later and the label must be castable to int. Building a new frame
# (instead of mutating in place) keeps this cell safe to re-run.
df = (
    df.dropna(subset=['content', 'is_rumor'])
      .assign(is_rumor=lambda frame: frame['is_rumor'].astype(int))
)

                                             content  is_rumor    user.handle  \
0  Charlie Hebdo became well known for publishing...       0.0     BBCDanielS   
1  Now 10 dead in a shooting there today RT "@BBC...       0.0      robbylevy   
2  @BBCDanielS @BBCWorld I'm guessing this is bei...       0.0  ModerateInAll   
3  @BBCDanielS @BBCWorld why would you mention th...       0.0    GabTarquini   
4            @BBCDanielS @BBCWorld perps identified?       0.0  freethought41   

          topic  
0  charliehebdo  
1  charliehebdo  
2  charliehebdo  
3  charliehebdo  
4  charliehebdo  


In [None]:
# Hold out 20% of the posts for evaluation. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed random_state makes the
# split repeatable.
texts = df['content']
targets = df['is_rumor']
X_train, X_test, y_train, y_test = train_test_split(
    texts, targets, test_size=0.2, random_state=42, stratify=targets
)

In [None]:
# Seed PyTorch's random number generators before the model is built. The
# classification head is newly (randomly) initialised on top of roberta-base,
# so without a fixed seed every run would start from different weights.
torch.manual_seed(42)

# Pretrained tokenizer and encoder, with a fresh 2-class classification head.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Compute 'balanced' per-class weights from the training labels and hand them
# to the loss, so the classes contribute according to those weights.
classes_present = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes_present,
    y=y_train,
)
weights = torch.tensor(class_weights, dtype=torch.float32, device=device)
loss_fn = CrossEntropyLoss(weight=weights)


In [None]:
# Tokenise the training texts: anything longer than 256 tokens is truncated,
# and padding is enabled so the encoded rows can be stacked into tensors.
train_encodings = tokenizer(
    X_train.tolist(),
    max_length=256,
    truncation=True,
    padding=True,
)

# Token ids, attention masks and labels as three row-aligned tensors.
train_tensors = [
    torch.tensor(train_encodings[field])
    for field in ('input_ids', 'attention_mask')
]
train_tensors.append(torch.tensor(y_train.values))
train_dataset = TensorDataset(*train_tensors)

# Mini-batches of 8, reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)


In [None]:
# AdamW over every parameter of the model, with the learning rate named.
LEARNING_RATE = 2e-5
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)


In [None]:
# Fine-tune for two passes over the training data.
model.train()
for epoch in range(2):
    # Accumulate the loss over the whole epoch. Reporting only the final
    # mini-batch's loss (8 samples) is noisy and says little about the epoch.
    epoch_loss = 0.0
    num_batches = 0

    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Use the class-weighted loss rather than the model's built-in one.
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # max(..., 1) avoids a division by zero if the loader yielded no batches.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} mean loss: {mean_loss}")


Epoch 1 loss: 0.3748469650745392
Epoch 2 loss: 0.11063259840011597


In [None]:
# Encode the held-out texts with the same tokenizer settings as the training set.
test_encodings = tokenizer(
    X_test.tolist(),
    max_length=256,
    truncation=True,
    padding=True,
)

test_dataset = TensorDataset(
    *(torch.tensor(test_encodings[field]) for field in ('input_ids', 'attention_mask')),
    torch.tensor(y_test.values),
)

# No shuffling for evaluation: batches are read in dataset order.
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)

y_true, y_pred = [], []

# Inference only: eval mode, and no gradient tracking.
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())


print("Accuracy:", accuracy_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred))

Accuracy: 0.856353591160221
Confusion Matrix:
 [[8364 1360]
 [ 434 2331]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.86      0.90      9724
           1       0.63      0.84      0.72      2765

    accuracy                           0.86     12489
   macro avg       0.79      0.85      0.81     12489
weighted avg       0.88      0.86      0.86     12489



In [None]:
def predict_social_news(text):
    """Classify one piece of text with the fine-tuned model.

    The text is tokenised (truncated to at most 256 tokens), run through the
    model without gradient tracking, and the class with the largest logit
    is taken as the prediction.

    Returns "❌ FAKE NEWS" when the predicted class index is 1, and
    "✅ REAL NEWS" otherwise.
    """
    model.eval()

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        pred = model(**encoded).logits.argmax(dim=1).item()

    return "❌ FAKE NEWS" if pred == 1 else "✅ REAL NEWS"

# Example:
for sample in (
    "Congratulations to the class of 2025! 🎓",
    "Breaking: NASA confirms the sun will explode next week! 😱",
):
    print(predict_social_news(sample))


✅ REAL NEWS
❌ FAKE NEWS


In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
save_dir = "./fine_tuned_roberta_social/"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine_tuned_roberta_social/tokenizer_config.json',
 './fine_tuned_roberta_social/special_tokens_map.json',
 './fine_tuned_roberta_social/vocab.json',
 './fine_tuned_roberta_social/merges.txt',
 './fine_tuned_roberta_social/added_tokens.json')

In [None]:
# Recursively (-r) pack the saved model directory into a single zip archive.
!zip -r roberta_social_model.zip fine_tuned_roberta_social/


  adding: fine_tuned_roberta_social/ (stored 0%)
  adding: fine_tuned_roberta_social/config.json (deflated 49%)
  adding: fine_tuned_roberta_social/vocab.json (deflated 68%)
  adding: fine_tuned_roberta_social/tokenizer_config.json (deflated 76%)
  adding: fine_tuned_roberta_social/model.safetensors (deflated 10%)
  adding: fine_tuned_roberta_social/special_tokens_map.json (deflated 84%)
  adding: fine_tuned_roberta_social/merges.txt (deflated 53%)


In [None]:
# Colab-specific helper module; this cell is meant to run inside a Colab runtime.
from google.colab import files
# Send the zip archive created above to the browser as a file download.
files.download("roberta_social_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>