In [16]:
import re
import requests
import pandas as pd
import torch
import fasttext
import fasttext.util
from torch.utils.data import Dataset
from tqdm import tqdm
import os
import urllib.request
from gensim.models import KeyedVectors



# Making the dataset and preproccessing it:

In [14]:
class SnappFoodDataset(Dataset):
    def __init__(
        self,
        csv_path,
        max_len=50,
        embed_size=300,
    ):
        self.max_len = max_len
        self.embed_size = embed_size

        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-fa/master/stopwords-fa.txt"
        self.stopwords = set(requests.get(url).text.splitlines())

        fixed_lines = []
        with open(csv_path, encoding="utf-8") as f:
            for line in f:
                fixed_lines.append(line.replace("\\t", "\t"))

        fixed_path = "train_fixed.csv"
        with open(fixed_path, "w", encoding="utf-8") as f:
            f.writelines(fixed_lines)

        df = pd.read_csv(
            fixed_path,
            sep="\t",
            names=["id", "text", "label", "label_id"],
            header=0,
            engine="python",
        )

        df["text"] = df["text"].astype(str).apply(self.clean_text)

        self.labels = torch.tensor(df["label_id"].values, dtype=torch.long)

        # print("⏳ Loading fastText vectors ...")
        # wv = KeyedVectors.load_word2vec_format(
        #     "cc.fa.300.vec",
        #     binary=False
        # )
        # print("✅ fastText loaded")

        print("⏳ Loading fastText vectors ...")

        wv = KeyedVectors.load_word2vec_format(
            "cc.fa.300.vec",
            binary=False,
            unicode_errors="ignore"
        )

        print("✅ fastText vectors loaded")

        all_vectors = []

        # ---------------- Vectorize (FASTTEXT .VEC) ----------------
        for text in tqdm(df["text"], desc="Vectorizing (fastText .vec)"):
            words = text.split()[: self.max_len]

            vecs = []

            for w in words:
                if w in wv: 
                    vecs.append(torch.tensor(wv[w], dtype=torch.float32))

            if len(vecs) < self.max_len:
                pad = torch.zeros(self.max_len - len(vecs), self.embed_size)
                mat = torch.cat([torch.stack(vecs), pad], dim=0) if vecs else pad
            else:
                mat = torch.stack(vecs[: self.max_len])

            mat = mat.t()  # [300, max_len]
            all_vectors.append(mat)

        self.data = torch.stack(all_vectors)  # [N, 300, 50]

        print("✅ Dataset ready:", self.data.shape)

    def clean_text(self, text):
        text = text.replace("ي", "ی").replace("ك", "ک")

        eng = "0123456789"
        fa = "۰۱۲۳۴۵۶۷۸۹"
        for e, f in zip(eng, fa):
            text = text.replace(e, f)

        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"www\S+", "", text)
        text = re.sub(r"@\S+", "", text)

        text = re.sub(r"\s+", " ", text).strip()

        tokens = [t for t in text.split() if t not in self.stopwords]
        return " ".join(tokens)

    def __len__(self):
        return len(self.labels)

    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


as you can see above we made a dataset class in which clean each comment and then implemented two methodes for vectorizing the datas:

1: 
    
    using mini multilingual LM from sentance transformers which encodes each comment to a 384d tensor . in this methode we dont just vectorize each word and then vectorize the whole sentance by concatanation , the LM make connections with each words in the sentance and make a much more meaningfull vector for each comment.

2: 
    
    the second methode is using the fasttext model and procceed exactly like explained in the projects description.

# Defining the CNN:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

class TextCNN(nn.Module):
    def __init__(self, embed_size=384, num_classes=2):
        super(TextCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=embed_size, out_channels=128, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(p=0.5)
        
        # Calculate Linear input size dynamically logic:
        # Sequence Length (50) -> Pool(2) -> becomes 25
        # Channels (128)
        # Flattened size = 128 * 25
        self.fc = nn.Linear(128 * 25, num_classes)

    def forward(self, x):
        # x shape: [Batch, 300, 50]
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = x.flatten(1) 
        x = self.fc(x) 
        return x

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = SnappFoodDataset('datas/train.csv', max_len=50)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


model = TextCNN(embed_size=300, num_classes=2).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

num_epochs = 10

for epoch in range(num_epochs):
    model.train() # Enable Dropout
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
    

model.eval() # Disable Dropout
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy on test set: {100 * correct / total:.2f}%")


⏳ Loading fastText vectors ...
✅ fastText vectors loaded


Vectorizing (fastText .vec): 100%|██████████| 56700/56700 [00:19<00:00, 2963.36it/s]


✅ Dataset ready: torch.Size([56700, 300, 50])


RuntimeError: Given groups=1, weight of size [128, 384, 3], expected input[32, 300, 50] to have 384 channels, but got 300 channels instead