# 1D CNN text classifier

In [41]:
import csv, time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


## Dataset

In [42]:
def load_csv_dataset(path):
    """Read a labelled text dataset from a CSV file.

    The file must start with a header row that names at least the columns
    ``text`` and ``label``. Each ``label`` value is parsed with ``int``.

    Args:
        path: Location of the CSV file; it is opened as UTF-8.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the ``text``
        strings and the integer labels, in file order.

    Raises:
        KeyError: If the header has no ``text`` or ``label`` column.
        ValueError: If a ``label`` value is not an integer literal.
    """
    texts = []
    labels = []

    # newline="" is what the csv module asks for: it stops the file object
    # from translating line endings, so "\r" / "\r\n" inside quoted fields
    # reach the reader (and later the byte encoder) unchanged.
    with open(path, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            texts.append(row["text"])
            labels.append(int(row["label"]))

    return texts, labels

In [57]:
class TextDataset(Dataset):
    """Map-style dataset pairing byte-encoded strings with float labels.

    Every sample is turned into a fixed-length ``torch.long`` tensor of its
    UTF-8 byte values (0-255), cut off at ``max_len`` bytes and right-padded
    with zeros. Labels are returned as ``torch.float`` scalars.
    """

    def __init__(self, samples, labels, max_len=256):
        self.samples, self.labels = samples, labels
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(encoded_sample, label)`` for position ``idx``."""
        features = self.encode(self.samples[idx])
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return features, target

    def encode(self, s):
        """Convert ``s`` into a ``max_len``-long tensor of byte IDs."""
        # Truncate on the byte level, then zero-fill up to max_len.
        raw = s.encode("utf-8")[:self.max_len]
        padded = raw.ljust(self.max_len, b"\x00")
        return torch.tensor(list(padded), dtype=torch.long)

# Read both splits from disk first, then wrap each one in a DataLoader.
train_texts, train_labels = load_csv_dataset("dataset/train_dataset.csv")
val_texts, val_labels = load_csv_dataset("dataset/val_dataset.csv")

# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    dataset=TextDataset(samples=train_texts, labels=train_labels),
    batch_size=16,
    shuffle=True,
)

# Validation batches keep the order of the CSV file.
val_loader = DataLoader(
    dataset=TextDataset(samples=val_texts, labels=val_labels),
    batch_size=16,
    shuffle=False,
)

## Model

In [44]:
class CharCNN(nn.Module):
    """Character-level 1D CNN classifier.

    Pipeline: embedding -> one Conv1d branch per kernel size -> ReLU ->
    global max pool over the sequence -> concatenation -> dropout -> linear.
    The output is raw logits of shape ``(batch, num_classes)``.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim=32,
        num_filters=128,
        kernel_sizes=(3, 5, 7),
        dropout=0.2,
        num_classes=1  # binary classification
    ):
        super().__init__()

        # ID 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # One convolution branch per requested kernel width.
        branches = []
        for width in kernel_sizes:
            branches.append(nn.Conv1d(embedding_dim, num_filters, width))
        self.convs = nn.ModuleList(branches)

        # Classifier head over the concatenated branch features.
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        """
        x: (batch_size, seq_len) - tensor of character IDs
        Returns logits of shape (batch_size, num_classes).
        """
        # (batch, seq) -> (batch, seq, embed) -> (batch, embed, seq)
        channels_first = self.embedding(x).transpose(1, 2)

        # Each branch: conv -> ReLU, giving (batch, filters, seq')
        activations = [F.relu(conv(channels_first)) for conv in self.convs]

        # Global max pool collapses seq' to 1; squeeze leaves (batch, filters)
        pooled = [
            F.max_pool1d(act, kernel_size=act.size(2)).squeeze(2)
            for act in activations
        ]

        # Logits only: pair with BCEWithLogitsLoss, no sigmoid here.
        return self.fc(self.dropout(torch.cat(pooled, dim=1)))

## Training

In [73]:
# Number of full passes the training loop makes over train_loader.
epochs = 5

In [71]:
# 256 embedding rows: one per byte value (0-255) produced by the dataset encoder.
model = CharCNN(vocab_size=256)
# The model returns raw logits, so the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [74]:
# Train for `epochs` passes, reporting wall-clock time and summed batch loss.
for epoch in range(epochs):
    # Switch back to training mode on every pass: predict() and the
    # evaluation cell call model.eval(), so re-running this cell afterwards
    # would otherwise train with dropout disabled.
    model.train()
    total_loss = 0
    t0 = time.time()
    for x, y in train_loader:
        optimizer.zero_grad()
        logits = model(x)
        # (batch, 1) -> (batch,) so the logits line up with the float targets.
        loss = criterion(logits.squeeze(1), y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # total_loss is the sum of the per-batch losses, not an average.
    print(f"Epoch {epoch+1}: time {(time.time()-t0):.2f}s, loss = {total_loss:.4f}")

Epoch 1: time 25.36s, loss = 33.2913
Epoch 2: time 25.37s, loss = 0.4080
Epoch 3: time 27.16s, loss = 0.1158
Epoch 4: time 26.95s, loss = 0.0482
Epoch 5: time 30.80s, loss = 0.0224


### Visualization

In [78]:
# Show the module tree: each registered layer with its constructor settings.
print(model)

CharCNN(
  (embedding): Embedding(256, 32, padding_idx=0)
  (convs): ModuleList(
    (0): Conv1d(32, 128, kernel_size=(3,), stride=(1,))
    (1): Conv1d(32, 128, kernel_size=(5,), stride=(1,))
    (2): Conv1d(32, 128, kernel_size=(7,), stride=(1,))
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=384, out_features=1, bias=True)
)


## Evaluation

In [75]:
def predict(model, text, max_len=256):
    """Score a single string with a trained classifier.

    The text is encoded the same way as the training data: UTF-8 bytes,
    truncated to ``max_len`` bytes and right-padded with zeros, so the model
    always receives a ``(1, max_len)`` tensor.

    Args:
        model: Module mapping a ``(1, max_len)`` long tensor of byte IDs to a
            single logit.
        text: The string to classify.
        max_len: Fixed input length in bytes; defaults to 256.

    Returns:
        The sigmoid of the model's logit as a Python float.
    """
    model.eval()
    # Pad by the *byte* count: for non-ASCII text len(text) is smaller than
    # the encoded length, which would make the input longer than max_len.
    raw = text.encode("utf-8")[:max_len]
    ids = list(raw.ljust(max_len, b"\x00"))
    x = torch.tensor(ids, dtype=torch.long).unsqueeze(0)  # add batch dimension
    with torch.no_grad():
        p = torch.sigmoid(model(x)).item()
    return p  # close to 1 = real text, close to 0 = random

In [76]:
# Confusion-matrix counts accumulated over every batch of val_loader.
TP, TN, FP, FN = 0, 0, 0, 0
model.eval()
with torch.no_grad():
    for xs, ys in val_loader:
        logits = model(xs)
        probs = torch.sigmoid(logits.squeeze(1))
        # Threshold at 0.5 to get hard 0/1 predictions.
        preds = (probs > 0.5).long()
        targets = ys.long()

        TP += int(((preds == 1) & (targets == 1)).sum().item())
        TN += int(((preds == 0) & (targets == 0)).sum().item())
        FP += int(((preds == 1) & (targets == 0)).sum().item())
        FN += int(((preds == 0) & (targets == 1)).sum().item())


# Every ratio falls back to 0 when its denominator is empty, so an empty
# validation set no longer raises ZeroDivisionError on the accuracy line.
total = TP + TN + FP + FN
accuracy = (TP + TN) / total if total > 0 else 0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000


## Inference

In [52]:
# Smoke test: score one English sentence and one string of made-up words.
print(predict(model, "This is a sample of real text."))  # Expected: close to 1
print(predict(model, "gif uli afek sms"))  # Expected: close to 0

0.9866163730621338
0.1002049371600151
