# **Import the libraries**

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from torch.nn.utils.rnn import pad_sequence

In [None]:
### NLP resources shared by the preprocessing step

# NLTK ISRI stemmer instance used by preprocess() on every kept word.
stemmer = ISRIStemmer()

# Fetch the stop-word corpus, then keep the Arabic list as a set so the
# per-word membership test in preprocess() is a hash lookup.
nltk.download('stopwords')
arabic_stopwords = set(stopwords.words("arabic"))

### Preprocess function
def preprocess(text):
    """Clean a raw comment and return its list of stemmed tokens.

    Every character that is neither whitespace nor in the range
    U+0621..U+064A is removed, runs of whitespace are collapsed to one
    space, words found in ``arabic_stopwords`` are skipped, and each
    remaining word is passed through ``stemmer.stem``.

    Args:
        text: The comment as a string.

    Returns:
        A list of stemmed words; empty when nothing survives the cleaning.
    """
    letters_only = re.sub(r'[^\u0621-\u064A\s]', '', text)
    normalised = re.sub(r'\s+', ' ', letters_only).strip()

    stems = []
    for word in normalised.split():
        if word in arabic_stopwords:
            continue  # stop words are dropped before stemming
        stems.append(stemmer.stem(word))
    return stems

### Load the labelled training reviews and the test reviews

# Training rows need both the comment text and its label.
train_df = pd.read_csv("/content/Algerian Review.csv")
train_df = train_df.dropna(subset=['Commentaire', 'Statut'])

# Replace the numeric sentiment codes with their textual names.
train_df['Statut'] = train_df['Statut'].replace(
    {0: 'Neutre', 1: 'Positif', -1: 'Négatif'}
)

# Test rows only need the comment text.
test_df = pd.read_csv("/content/test.csv")
test_df = test_df.dropna(subset=['Commentaire'])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Preprocessing

In [None]:
###Label Encoding
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['Statut'])

###Preprocess all comments
train_df['tokens'] = train_df['Commentaire'].astype(str).apply(preprocess)
test_df['tokens'] = test_df['Commentaire'].astype(str).apply(preprocess)

###Build vocabulary
vocab = {'<PAD>': 0, '<UNK>': 1}
for tokens in train_df['tokens']:
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab)

###Encode tokens
def encode(tokens):
    """Translate a token list into vocabulary ids.

    Tokens missing from ``vocab`` are mapped to the id stored under
    ``'<UNK>'``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of integer ids, one per input token, in the same order.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, vocab['<UNK>']))
    return ids

# Store the id sequence of every comment next to its tokens (train and test).
for _frame in (train_df, test_df):
    _frame['encoded'] = _frame['tokens'].apply(encode)

###Dataset class
class ArabicDataset(Dataset):
    """Dataset of integer-encoded comments with optional class labels.

    Args:
        encodings: Sequence of id lists; each one is stored as a 1-D
            ``torch.long`` tensor.
        labels: Optional sequence of integer labels, stored as a single
            ``torch.long`` tensor. When omitted, items are returned
            without a label.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = []
        for ids in encodings:
            self.encodings.append(torch.tensor(ids, dtype=torch.long))

        if labels is None:
            self.labels = None
        else:
            self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of stored comments."""
        return len(self.encodings)

    def __getitem__(self, idx):
        """Return ``(ids, label)`` when labels exist, otherwise just ``ids``."""
        sample = self.encodings[idx]
        if self.labels is None:
            return sample
        return sample, self.labels[idx]

def collate_fn(batch):
    """Pad a list of dataset items into batch tensors.

    Args:
        batch: List of items, either ``(ids, label)`` tuples or bare ``ids``
            tensors. The first item decides which of the two is assumed.

    Returns:
        ``(padded_ids, labels)`` for labelled batches, or ``padded_ids``
        alone for unlabelled ones. Sequences are right-padded to the
        longest one in the batch, batch dimension first.
    """
    if not isinstance(batch[0], tuple):
        return pad_sequence(batch, batch_first=True)

    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True)
    return padded, torch.stack(targets)
### Datasets and data loaders

# Training items carry labels; test items are bare id sequences.
train_dataset = ArabicDataset(
    encodings=train_df['encoded'].tolist(),
    labels=train_df['label'].tolist(),
)
test_dataset = ArabicDataset(encodings=test_df['encoded'].tolist())

# Training batches are reshuffled on every pass.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
# Test batches keep the dataset order.
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)


### Build the model

In [None]:


### CNN Model
import torch
import torch.nn as nn

class CNNTextClassifier(nn.Module):
    """Convolutional classifier over integer-encoded token sequences.

    Token ids are embedded, fed to three parallel 1-D convolutions (kernel
    sizes 3, 4 and 5, 128 filters each), max-pooled over the sequence axis,
    concatenated, batch-normalised and classified by two linear layers.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability used before each linear layer.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, dropout_rate=0.1):
        super().__init__()
        # Id 0 is the padding index.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, 128, kernel_size=4, padding=2)
        self.conv3 = nn.Conv1d(embed_dim, 128, kernel_size=5, padding=2)

        self.batch_norm = nn.BatchNorm1d(128 * 3)

        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(128 * 3, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape [B, T]. ``T`` may be 0, which happens
                when every sequence of the batch is empty.

        Returns:
            Float tensor of shape [B, num_classes] holding unnormalised logits.
        """
        # A zero-length sequence axis is shorter than every (padded) kernel
        # and would make Conv1d raise; feed one padding token instead.
        if x.size(1) == 0:
            x = x.new_full((x.size(0), 1), self.embedding.padding_idx)

        x = self.embedding(x)   # -> [B, T, E]
        x = x.permute(0, 2, 1)  # -> [B, E, T]

        # Output lengths are T, T + 1 and T for kernel sizes 3, 4 and 5;
        # the global max pooling removes that axis either way -> [B, 128].
        pooled = [
            torch.relu(conv(x)).max(dim=2).values
            for conv in (self.conv1, self.conv2, self.conv3)
        ]

        x = torch.cat(pooled, dim=1)  # -> [B, 128 * 3]
        x = self.batch_norm(x)
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)


### Model setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The output layer is sized from the fitted label encoder, so it always
# matches the label ids produced for training (three classes for the
# Neutre / Positif / Négatif data loaded by this notebook).
model = CNNTextClassifier(
    vocab_size=len(vocab),
    embed_dim=100,
    num_classes=len(label_encoder.classes_),
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
### Training loop: five passes over the training batches
for epoch in range(5):
    model.train()  # training mode for dropout and batch norm
    for texts, labels in train_loader:
        # Move the batch to the same device as the model.
        texts = texts.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(texts), labels)
        loss.backward()
        optimizer.step()
    print(f"✅ Epoch {epoch+1} done")



✅ Epoch 1 done
✅ Epoch 2 done
✅ Epoch 3 done
✅ Epoch 4 done
✅ Epoch 5 done


### Evaluation

In [None]:
### Evaluation
# NOTE(review): the loop below iterates train_loader, so the accuracy and
# report describe the data the model was trained on, not held-out data.
model.eval()  # inference mode for dropout and batch norm
all_preds = []
all_labels = []

with torch.no_grad():
    for texts, labels in train_loader:
        output = model(texts.to(device))
        # The predicted class is the index of the largest logit.
        all_preds.extend(output.argmax(dim=1).cpu().numpy())
        all_labels.extend(labels.numpy())

print("📊 Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

### Predict on test set
model.eval()
predictions = []
with torch.no_grad():
    # test_loader is not shuffled, so predictions follow the row order of
    # test_df and can be assigned back as a column below.
    for batch in test_loader:
        outputs = model(batch.to(device))
        predictions.extend(outputs.argmax(dim=1).cpu().numpy())

#### Save prediction
# Turn the class ids back into their textual labels before exporting.
test_df['Statut'] = label_encoder.inverse_transform(predictions)
test_df[['ID', 'Statut']].to_csv("predicted_test.csv", index=False)


📊 Accuracy: 0.8037593984962406
              precision    recall  f1-score   support

      Neutre       0.74      0.99      0.85      2934
     Négatif       0.99      0.67      0.80      1819
     Positif       0.93      0.24      0.38       567

    accuracy                           0.80      5320
   macro avg       0.89      0.64      0.68      5320
weighted avg       0.85      0.80      0.78      5320



In [None]:
# Save the trained model weights and offer them for download

# File that receives the serialized state dict
model_filename = 'cnn_text_classifier11.pth'

# Only the state dict (parameters and buffers) is written, so the
# CNNTextClassifier class is needed again to load it.
torch.save(obj=model.state_dict(), f=model_filename)

print(f"Model saved to {model_filename}")

# Trigger a browser download when running in Google Colab
try:
    from google.colab import files
    files.download(model_filename)
except ImportError:
    pass  # Not running in Colab

Model saved to cnn_text_classifier11.pth


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save the tokenizer (the token -> id vocabulary) and offer it for download

import pickle

# File that receives the pickled vocabulary
tokenizer_filename = 'vocab.pkl'

# Save the vocab dictionary
# NOTE: pickle files must only be loaded back from a trusted source.
with open(tokenizer_filename, 'wb') as f:
    pickle.dump(vocab, f)

print(f"Tokenizer (vocab) saved to {tokenizer_filename}")

# Trigger a browser download when running in Google Colab. The import lives
# inside the try so that a non-Colab run is skipped via ImportError instead
# of failing on an undefined `files` name.
try:
    from google.colab import files
    files.download(tokenizer_filename)
except ImportError:
    pass  # Not running in Colab

Tokenizer (vocab) saved to vocab.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save the fitted label encoder and offer it for download

import pickle  # imported here so the cell does not depend on an earlier one

# NOTE(review): this name is not used anywhere in this cell; the assignment
# is kept only so that the variable keeps the value the original cell set.
tokenizer_filename = 'tokenizer_vocab.pkl'

# Save the label encoder
# NOTE: pickle files must only be loaded back from a trusted source.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Label encoder saved to label_encoder.pkl")

# Trigger a browser download when running in Google Colab. The import lives
# inside the try so that a non-Colab run is skipped via ImportError instead
# of failing on an undefined `files` name.
try:
    from google.colab import files
    files.download('label_encoder.pkl')
except ImportError:
    pass  # Not running in Colab

Label encoder saved to label_encoder.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>