In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import torch
from torch.utils.data import Dataset , DataLoader
from sklearn.model_selection import train_test_split


In [2]:
class SpamDetect(nn.Module):
  def __init__(self , input_features):
    super().__init__()
    self.linear_1 = nn.Sequential(
         nn.Linear(in_features= input_features , out_features = 128),
         nn.ReLU(),
         nn.Linear(128 , 64),
         nn.Linear(64 ,1)
    )
    self.layer_2 = nn.Sequential(
        nn.Sigmoid()
    )

  def forward(self, x):
    return self.layer_2(self.linear_1(x))


In [3]:

df = pd.read_csv("hf://datasets/thehamkercat/telegram-spam-ham/dataset.csv")

df.columns = ['label', 'text']

print(df.head())


  label                                               text
0  spam  naturally irresistible your corporate identity...
1  spam  the stock trading gunslinger fanny is merrill ...
2  spam  unbelievable new homes made easy im wanting to...
3  spam  4 color printing special request additional in...
4  spam  do not have money get software cds from here s...


In [4]:

# Convert labels to binary (spam = 1, ham = 0)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

print(df)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

print(f'Training samples: {len(X_train)}, Testing samples: {len(X_test)}')


       label                                               text
0          1  naturally irresistible your corporate identity...
1          1  the stock trading gunslinger fanny is merrill ...
2          1  unbelievable new homes made easy im wanting to...
3          1  4 color printing special request additional in...
4          1  do not have money get software cds from here s...
...      ...                                                ...
20343      0                                               /ban
20344      0                                               /ban
20345      0                                               /ban
20346      0                                          Kaisi hii
20347      0                                            Shock q

[20348 rows x 2 columns]
Training samples: 16278, Testing samples: 4070


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert text data to numerical data
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

print(f'Feature vector shape: {X_train.shape}')


Feature vector shape: (16278, 53450)


In [6]:
class SpamDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [7]:
# Create dataset objects
train_dataset = SpamDataset(X_train, y_train)
test_dataset = SpamDataset(X_test, y_test)

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [8]:
model = SpamDetect(input_features = X_train.shape[1])

In [9]:
loss_fn = nn.BCELoss()

optimizer = optim.SGD(model.parameters(),
                      lr = 0.1)


In [12]:
epochs = 3

for epoch in range(epochs):
  model.train();
  epoch_loss = 0
  for emails , labels in train_loader:
    y_pred = model(emails)
    labels = labels.view(-1,1)
    loss = loss_fn(y_pred , labels)

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()
    epoch_loss += loss.item()

    if(epoch):
      print(f'Epoch [{epoch}, Loss: {epoch_loss/len(train_loader):.4f}')


Epoch [1, Loss: 0.0001
Epoch [1, Loss: 0.0003
Epoch [1, Loss: 0.0005
Epoch [1, Loss: 0.0008
Epoch [1, Loss: 0.0009
Epoch [1, Loss: 0.0010
Epoch [1, Loss: 0.0011
Epoch [1, Loss: 0.0012
Epoch [1, Loss: 0.0013
Epoch [1, Loss: 0.0014
Epoch [1, Loss: 0.0015
Epoch [1, Loss: 0.0015
Epoch [1, Loss: 0.0015
Epoch [1, Loss: 0.0019
Epoch [1, Loss: 0.0021
Epoch [1, Loss: 0.0022
Epoch [1, Loss: 0.0022
Epoch [1, Loss: 0.0022
Epoch [1, Loss: 0.0023
Epoch [1, Loss: 0.0024
Epoch [1, Loss: 0.0026
Epoch [1, Loss: 0.0026
Epoch [1, Loss: 0.0027
Epoch [1, Loss: 0.0028
Epoch [1, Loss: 0.0030
Epoch [1, Loss: 0.0032
Epoch [1, Loss: 0.0033
Epoch [1, Loss: 0.0034
Epoch [1, Loss: 0.0035
Epoch [1, Loss: 0.0037
Epoch [1, Loss: 0.0037
Epoch [1, Loss: 0.0044
Epoch [1, Loss: 0.0044
Epoch [1, Loss: 0.0046
Epoch [1, Loss: 0.0050
Epoch [1, Loss: 0.0060
Epoch [1, Loss: 0.0061
Epoch [1, Loss: 0.0062
Epoch [1, Loss: 0.0062
Epoch [1, Loss: 0.0064
Epoch [1, Loss: 0.0064
Epoch [1, Loss: 0.0066
Epoch [1, Loss: 0.0067
Epoch [1, L

In [13]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()  # Set the model to evaluation mode
y_pred = []
y_true = []

with torch.no_grad():
    for emails, labels in test_loader:
        outputs = model(emails)
        predicted = (outputs > 0.5).float()
        y_pred.extend(predicted.squeeze().tolist())
        y_true.extend(labels.tolist())

print(f'Accuracy: {accuracy_score(y_true, y_pred):.4f}')
print(classification_report(y_true, y_pred))


Accuracy: 0.9565
              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97      2913
         1.0       0.95      0.90      0.92      1157

    accuracy                           0.96      4070
   macro avg       0.95      0.94      0.95      4070
weighted avg       0.96      0.96      0.96      4070



In [22]:
def predict(text, model, vectorizer):
    model.eval()
    with torch.no_grad():
        vectorized_text = vectorizer.transform([text]).toarray()
        vectorized_text = torch.tensor(vectorized_text, dtype=torch.float32)
        output = model(vectorized_text)
        prediction = (output > 0.5).float().item()
    return 'spam' if prediction == 1 else 'ham'

# Test with a custom message
test_message = "Congratulations! You've won a free ticket to the Bahamas. Reply with WIN to claim your prize."
test_message_1 = "Hi This is Manideep"
print(predict(test_message, model, vectorizer))


tensor([0])
None
