In [32]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

import kagglehub

In [33]:
# settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

input_dim = 10000
learning_rate = 0.001
batch_size = 32
num_epochs = 500

cuda:0


In [34]:
# download fake news dataset

# path = kagglehub.dataset_download("saurabhshahane/fake-news-classification")

# print("Path to dataset files:", path)

In [35]:
df = pd.read_csv("C:/Users/J/.cache/kagglehub/datasets/saurabhshahane/fake-news-classification/versions/77/WELFake_Dataset.csv")
print(df.head(1))
print(df.shape)
print(df.info())

   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
(72134, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB
None


In [36]:
df = df.drop(df.columns[[0, 1]], axis=1)

In [37]:
# Check for NaN values
print(df.isna().sum())

df.reset_index(drop=True, inplace=True)

text     39
label     0
dtype: int64


In [38]:
print(df.head())
print(df.shape)
print(df.info())

                                                text  label
0  No comment is expected from Barack Obama Membe...      1
1     Did they post their votes for Hillary already?      1
2   Now, most of the demonstrators gathered last ...      1
3  A dozen politically active pastors came here f...      0
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1
(72134, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    72095 non-null  object
 1   label   72134 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB
None


In [39]:
# drops rows that has NaN values
df.dropna(subset=['text'], inplace=True)

In [40]:
train_dataset, test_dataset = train_test_split(df, test_size=0.30, random_state=42)
train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.10, random_state=42)

In [41]:
class FakeNewsDataset():
    def __init__(self, df, vectorizer):
        self.texts = df['text']
        self.labels = df['label']
        self.vectorizer = vectorizer
        self.X = self.vectorizer.transform(self.texts).toarray()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.labels.iloc[idx], dtype=torch.long)

In [42]:
vectorizer = TfidfVectorizer(max_features=10000)
vectorizer.fit(df['text'])

In [43]:
train_dataset = FakeNewsDataset(train_dataset, vectorizer)
val_dataset = FakeNewsDataset(val_dataset, vectorizer)
test_dataset = FakeNewsDataset(test_dataset, vectorizer)


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [44]:
for inputs, labels in train_loader:
    print(inputs.shape, labels.shape)
    break

torch.Size([32, 10000]) torch.Size([32])


In [45]:
# Check for NaN values 
print(df.isna().sum())

df.reset_index(drop=True, inplace=True)


text     0
label    0
dtype: int64


In [46]:
for batch_idx, (batch_data, batch_labels) in enumerate(train_loader):
    print(f"Batch {batch_idx}:")
    print(f"Data shape: {batch_data.shape}")
    print(f"Labels shape: {batch_labels.shape}")

Batch 0:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 1:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 2:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 3:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 4:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 5:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 6:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 7:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 8:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 9:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 10:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 11:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 12:
Data shape: torch.Size([32, 10000])
Labels shape: torch.Size([32])
Batch 13:

In [47]:
print(f"Dataset length: {len(train_loader.dataset)}")

Dataset length: 45419


In [48]:
for text, labels in train_loader:
    print('Batch dimensions:', text.shape)
    print('Label dimensions:', labels.shape)
    print(f"Sample text data: {text[0]}")
    print(f"Sample label: {labels[0]}")
    break


Batch dimensions: torch.Size([32, 10000])
Label dimensions: torch.Size([32])
Sample text data: tensor([0., 0., 0.,  ..., 0., 0., 0.])
Sample label: 1


In [51]:
class LogisticRegressionModel(torch.nn.Module):
    def __init__(self, input_dim, learning_rate, batch_size, num_epochs):
        super(LogisticRegressionModel, self).__init__()
        
        self.weights = torch.nn.Parameter(torch.zeros(input_dim))
        self.bias = torch.nn.Parameter(torch.zeros(1))
        self.lr = learning_rate
        self.batch_size = batch_size
        self.num_epochs = num_epochs

    def forward(self, x):
        # Logistic regression model forward pass
        return torch.matmul(x, self.weights) + self.bias

    def fit(self, train_loader):
        self.to(device)
        
        # Loss function (binary cross-entropy)
        criterion = nn.BCEWithLogitsLoss()
        
        # Optimizer (stochastic gradient descent)
        optimizer = optim.SGD(self.parameters(), lr=self.lr)
        
        for epoch in range(self.num_epochs):
            running_loss = 0.0

            for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                optimizer.zero_grad()

                # Forward pass
                logits = self(x_batch)
                loss = criterion(logits.squeeze(), y_batch.float())
                
                # Backward pass
                loss.backward()
                
                # Update weights and bias
                optimizer.step()

                running_loss += loss.item()

                if batch_idx % 1000 == 0:
                    print(f"[Epoch {epoch+1}, Batch {batch_idx+1}] Loss: {running_loss / (batch_idx+1):.3f}")
                    
        print('Model Training Finished')


    def predict(self, test_loader):
        self.eval()
        y_pred = []
        y_true = []

        device = next(self.parameters()).device

        with torch.no_grad():
            for x_batch, y_batch in test_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                logits = self(x_batch)

                probabilities = torch.sigmoid(logits)

                predictions = (probabilities >= 0.5).float()

                y_pred.extend(predictions.cpu().numpy())
                y_true.extend(y_batch.cpu().numpy())

        return torch.tensor(y_true, dtype=torch.float32), torch.tensor(y_pred, dtype=torch.float32)

    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [52]:
model = LogisticRegressionModel(input_dim, learning_rate, batch_size, num_epochs)

model.fit(train_loader)

[Epoch 1, Batch 1] Loss: 0.693
[Epoch 1, Batch 1001] Loss: 0.693
[Epoch 2, Batch 1] Loss: 0.692
[Epoch 2, Batch 1001] Loss: 0.691
[Epoch 3, Batch 1] Loss: 0.694
[Epoch 3, Batch 1001] Loss: 0.690
[Epoch 4, Batch 1] Loss: 0.691
[Epoch 4, Batch 1001] Loss: 0.688
[Epoch 5, Batch 1] Loss: 0.681
[Epoch 5, Batch 1001] Loss: 0.687
[Epoch 6, Batch 1] Loss: 0.694
[Epoch 6, Batch 1001] Loss: 0.686
[Epoch 7, Batch 1] Loss: 0.686
[Epoch 7, Batch 1001] Loss: 0.684
[Epoch 8, Batch 1] Loss: 0.685
[Epoch 8, Batch 1001] Loss: 0.683
[Epoch 9, Batch 1] Loss: 0.684
[Epoch 9, Batch 1001] Loss: 0.682
[Epoch 10, Batch 1] Loss: 0.682
[Epoch 10, Batch 1001] Loss: 0.680
[Epoch 11, Batch 1] Loss: 0.688
[Epoch 11, Batch 1001] Loss: 0.679
[Epoch 12, Batch 1] Loss: 0.680
[Epoch 12, Batch 1001] Loss: 0.678
[Epoch 13, Batch 1] Loss: 0.682
[Epoch 13, Batch 1001] Loss: 0.677
[Epoch 14, Batch 1] Loss: 0.668
[Epoch 14, Batch 1001] Loss: 0.675
[Epoch 15, Batch 1] Loss: 0.674
[Epoch 15, Batch 1001] Loss: 0.674
[Epoch 16, Ba

In [53]:
torch.save(model.state_dict(), "model_1_500_epochs.pth")
print("Model saved successfully.")

Model saved successfully.


In [54]:
loaded_model = LogisticRegressionModel(input_dim, learning_rate, batch_size, num_epochs).to(device)
loaded_model.load_state_dict(torch.load("model_1_500_epochs.pth"))
loaded_model.eval()

print("Model loaded successfully.")

Model loaded successfully.


In [55]:
y_true, y_pred = model.predict(test_loader)

accuracy = metrics.accuracy_score(y_true, y_pred)
precision = metrics.precision_score(y_true, y_pred)
recall = metrics.recall_score(y_true, y_pred)
f1 = metrics.f1_score(y_true, y_pred)
classification_report = metrics.classification_report(y_true, y_pred)

print(f"accuracy {accuracy*100:.3f}")

print(f"precision {precision*100:.3f}")

print(f"recall {recall*100:.3f}")

print(f"f1 {f1*100:.3f}")

print(classification_report)

accuracy 86.541
precision 86.114
recall 88.171
f1 87.130
              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86     10453
         1.0       0.86      0.88      0.87     11176

    accuracy                           0.87     21629
   macro avg       0.87      0.86      0.87     21629
weighted avg       0.87      0.87      0.87     21629

