<a href="https://colab.research.google.com/github/parvvaresh/Classification-Persian-News/blob/main/DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import and install libs

In [1]:
from google.colab import output
!pip install hazm
!pip install arabic-reshaper
!pip install python-bidi
!pip install persian_wordcloud
!pip install langdetect
!pip install mapply


import warnings
import hazm
from hazm import *
import re
import string
import glob
from hazm import stopwords_list
import pandas as pd
import time
import os
import mapply



import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

from google.colab import drive
drive.mount('/content/drive')


output.clear()



# Initialization

1. **Parallel Processing Setup:**  
   - `mapply.init(...)` initializes a parallel processing framework with multiple workers (`n_workers=-1` uses all available processors), processing data in chunks (`chunk_size=100` and `max_chunks_per_worker=8`), with a visible progress bar.

2. **Text Cleaning Components:**  
   - **Punctuations:**  
     - Combines English (`string.punctuation`) and Persian punctuation symbols (`persian_punctuations`) into a single string (`punctuations_list`) whose characters are removed later.
   - **Diacritics:**  
     - Compiles a regex (`arabic_diacritics`) to remove common Arabic diacritics (e.g., Tashdid, Fatha, etc.).
   - **Lemmatization and Normalization:**  
     - Initializes `hazm.Lemmatizer()` for lemmatizing Persian words.  
     - Initializes a `Normalizer()` to standardize text.


3. **Stopwords Loading:**  
   - Uses `glob` to find all text files with Persian stopwords in a specific folder.
   - Reads each file and compiles a master list of stopwords.
   - Removes newline characters from each stopword.
   - Extends the list with additional stopwords from `stopwords_list()` from hazm.



This setup prepares your environment to clean, normalize, and process Persian text data efficiently in a parallelized manner.

In [2]:
# Enable `.mapply` on pandas objects: n_workers=-1 requests every available
# processor, work is handed out in chunks of 100 (at most 8 chunks per
# worker), and a progress bar is shown for each call.
mapply.init(n_workers=-1, chunk_size=100, max_chunks_per_worker=8, progressbar=True)

# Persian/Arabic punctuation marks and symbols that `string.punctuation` does not cover.
persian_punctuations = '''`÷×؛#<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
# One string holding every character to delete; `_remove_punctuations` feeds it
# to `str.maketrans` as the "delete" argument.
punctuations_list = string.punctuation + persian_punctuations
# Verbose-mode alternation of the Arabic diacritic marks plus the tatweel
# (kashida); whitespace and `#` comments inside the pattern are ignored by
# `re.VERBOSE`. Used by `_remove_diacritics`.
arabic_diacritics = re.compile("""
                          ّ    | # Tashdid
                          َ    | # Fatha
                          ً    | # Tanwin Fath
                          ُ    | # Damma
                          ٌ    | # Tanwin Damm
                          ِ    | # Kasra
                          ٍ    | # Tanwin Kasr
                          ْ    | # Sukun
                          ـ     # Tatwil/Kashida
                      """, re.VERBOSE)
# Shared hazm helpers used by the preprocessing methods below.
lemmatizer = hazm.Lemmatizer()
# NOTE(review): `Normalizer` is presumably provided by `from hazm import *` — confirm.
normalizer = Normalizer()


# Every custom stop-word file (one word per line) stored in the Drive folder.
file_list = glob.glob('/content/drive/MyDrive/NLP/persian_stopwords' + '/*.txt')

stop_words = []

for file_path in file_list:
    # Decode explicitly as UTF-8: without `encoding=` Python falls back to the
    # platform default, which can mangle or reject Persian text outside Colab.
    with open(file_path, encoding='utf-8') as f:
        # Strip the trailing newline of each line while reading.
        stop_words.extend(line.rstrip('\n') for line in f)

# Add hazm's built-in stop words, then drop duplicates (keeping first-seen
# order) so the membership tests in `_remove_stopwords` scan a shorter list.
stop_words.extend(stopwords_list())
stop_words = list(dict.fromkeys(stop_words))
output.clear()

# read dataset

In [3]:
# Load the Fars News CSV, discard the saved index column and the date, and
# keep only the rows in which every remaining field is present.
df = (
    pd.read_csv("/content/drive/MyDrive/get news/farsnews/farsnews_fainal.csv")
    .drop(columns=['Unnamed: 0', 'date'])
    .dropna()
)
df.head(5)

Unnamed: 0,title,text,label
0,قارایی: در تلاشم تا سیاه نمایی درباره ایران با...,خبرگزاری فارس - گروه هنر و رسانه - علی عبدالهی...,culture-media
1,توجه ویژه به اقوام و چهره های مردمی در فصل جدی...,به گزارش خبرگزاری فارس، پویان هدایتی، تهیه کن...,culture-media
2,جان فدا| ویژه برنامه هاى تلویزیون در سومین سال...,به گزارش خبرنگار رادیو و تلویزیون خبرگزاری فار...,culture-media
3,محمد علی صائب رئیس خبرگزاری صدا وسیما شد,به گزارش خبرگزاری فارس، علیرضا خدابخشی، معاون ...,culture-media
4,«همراه با خاطره ها» تمدید شد,به گزارش خبرگزاری فارس به نقل از روابط عمومی و...,culture-media


In [10]:
class preprocessing:
  """Text-cleaning steps applied to Persian titles, one step per method.

  The instance keeps no state. Each method reads the notebook-level helpers it
  needs (`arabic_diacritics`, `punctuations_list`, `normalizer`,
  `word_tokenize`, `stop_words`, `lemmatizer`) at call time, so those names
  must be defined before a method is invoked.
  """

  def __init__(self):
    pass

  def _remove_diacritics(self, text):
    """Return `text` with every match of `arabic_diacritics` deleted."""
    return arabic_diacritics.sub('', text)


  def _remove_crash_data(self, text):
    """Return `text` unchanged when it is a `str`, otherwise `None`."""
    return text if isinstance(text, str) else None

  def _remove_punctuations(self, text):
    """Delete every character listed in `punctuations_list` from `text`."""
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

  def _remove_repeating_char(self, text):
    """Collapse each run of one repeated character into a single occurrence.

    This applies to any character (newlines excepted, since `.` does not
    match them), so genuinely doubled letters are collapsed as well.
    """
    return re.sub(r'(.)\1+', r'\1', text)


  def _normalize_persian(self, text):
    """Map Arabic letter variants to Persian ones and keep only Persian letters.

    After the letter substitutions, every character outside the listed
    alphabet is replaced by a space, runs of spaces are squeezed into one,
    and the result is passed through `normalizer.normalize`.
    """
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ي", "ی", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ئ", "ی", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("ك" ,"ک" , text)
    text = re.sub("[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", " ", text)
    # Raw string: in a plain literal `\S` is an invalid escape sequence, which
    # Python reports with a warning. The compiled pattern is unchanged.
    text = re.sub(r"[^\S\n\t]+", ' ', text)
    text = normalizer.normalize(text)
    return text


  def _tokenize(self, text):
    """Split `text` into tokens with `word_tokenize`."""
    return word_tokenize(text)

  def _remove_stopwords(self, words):
    """Keep tokens longer than two characters that are not in `stop_words`."""
    return [word for word in words if word not in stop_words and len(word) > 2]

  def _lemmatizer(self, words):
    """Lemmatize each token, then filter the lemmas with `_remove_stopwords`."""
    lemmas = [lemmatizer.lemmatize(token) for token in words]
    return self._remove_stopwords(lemmas)

In [11]:

pp = preprocessing()

# Cleaning steps in the order they are applied; each one runs over the whole
# column in parallel through mapply.
title_steps = (
    pp._remove_diacritics,
    pp._remove_punctuations,
    pp._remove_repeating_char,
    pp._normalize_persian,
    pp._tokenize,
    pp._remove_stopwords,
    pp._lemmatizer,
)

processed_titles = df['title']
for step in title_steps:
    processed_titles = processed_titles.mapply(step)
df['title - preproces'] = processed_titles

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

# Data



## apply word2vec

In [12]:
# Token lists produced by the preprocessing pipeline, one list per title.
sentences = df['title - preproces'].tolist()

# 100-dimensional embeddings learned from the titles themselves; min_count=1
# keeps every token in the vocabulary.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Length of the longest title in tokens; every title is padded up to it.
max_length = max(map(len, sentences))

def transform_text_to_tensor(tokens, model, max_length):
    """Embed `tokens` into a fixed-size, zero-padded float32 tensor.

    Args:
        tokens: iterable of token strings.
        model: object exposing `wv` (supports `token in wv` and `wv[token]`)
            and `vector_size`, e.g. a trained gensim `Word2Vec` model.
        max_length: number of rows in the returned tensor.

    Returns:
        A `torch.float32` tensor of shape `(max_length, model.vector_size)`.
        Tokens missing from `model.wv` are skipped, unused rows stay zero, and
        vectors beyond `max_length` are dropped so the shape is always fixed.
    """
    # Allocate the padded matrix once. Building a tensor from a Python list of
    # ndarrays is the slow path that PyTorch warns about.
    padded = np.zeros((max_length, model.vector_size), dtype=np.float32)
    known = [model.wv[token] for token in tokens if token in model.wv][:max_length]
    if known:
        padded[:len(known)] = np.asarray(known, dtype=np.float32)
    return torch.from_numpy(padded)

# Embed every title, then stack the per-title matrices along a new first axis.
tensor_list = []
for title_tokens in sentences:
    tensor_list.append(transform_text_to_tensor(title_tokens, w2v_model, max_length))

final_tensor = torch.stack(tensor_list)
print(final_tensor.shape)

  return torch.tensor(vectors, dtype=torch.float)


torch.Size([64942, 24, 100])


## Split the data into Train and Test


In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    final_tensor,
    df["label"].values,
    test_size=0.2,
    random_state=42,
)

## Convert to torch.Tensor


In [None]:
label_encoder = LabelEncoder()
# Fit on the labels of both splits: an encoder fitted on the training labels
# alone raises ValueError in `transform` if a rare class ended up only in the
# test split. Labels are targets, not features, so this leaks nothing.
label_encoder.fit(np.concatenate([y_train, y_test]))

# Integer class ids as int64 tensors, the target dtype expected by
# nn.CrossEntropyLoss for class-index targets.
y_train = torch.tensor(label_encoder.transform(y_train), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(y_test), dtype=torch.long)

## Dataset and DataLoader


In [13]:
class TextDataset(Dataset):
    """Pairs embedded samples with their labels for use with a DataLoader."""

    def __init__(self, X, y):
        # Insert a singleton channel axis at position 1, e.g.
        # (N, seq_len, embedding_dim) -> (N, 1, seq_len, embedding_dim).
        self.X = torch.unsqueeze(X, dim=1)
        self.y = y

    def __len__(self):
        # Number of samples = size of the leading axis.
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target

# Wrap both splits; only the training batches are shuffled.
train_dataset = TextDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)

test_dataset = TextDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# Define the CNN model



## Model Architecture
The `TextCNN` model consists of:
- Convolutional layers with varying kernel sizes (3, 4, 5) to capture different n-gram features.
- ReLU activation function.
- Max pooling layers to retain the most important features from each filter.
- A fully connected layer for classification.
- Dropout for regularization.



In [14]:
class TextCNN(nn.Module):
    """Convolutional text classifier with three parallel n-gram branches.

    Each branch convolves the input with a kernel that covers 3, 4 or 5
    consecutive rows and the full embedding width, applies ReLU and keeps the
    maximum over the sequence axis. The pooled features of all branches are
    concatenated, passed through dropout and projected to class logits.

    Expects input with one channel and `embedding_dim` columns, i.e.
    `(batch, 1, seq_len, embedding_dim)`, and returns `(batch, num_classes)`.
    """

    def __init__(self, embedding_dim, num_filters, num_classes, dropout=0.5):
        super().__init__()

        # One convolution per n-gram size.
        self.conv1 = nn.Conv2d(1, num_filters, kernel_size=(3, embedding_dim))
        self.conv2 = nn.Conv2d(1, num_filters, kernel_size=(4, embedding_dim))
        self.conv3 = nn.Conv2d(1, num_filters, kernel_size=(5, embedding_dim))

        self.dropout = nn.Dropout(dropout)
        # The three branches contribute num_filters features each.
        self.fc = nn.Linear(3 * num_filters, num_classes)

    def forward(self, x):
        pooled = []
        for conv in (self.conv1, self.conv2, self.conv3):
            # (batch, filters, positions, 1) -> (batch, filters, positions)
            feature_map = F.relu(conv(x)).squeeze(3)
            # Max over all positions -> (batch, filters)
            strongest = F.max_pool1d(feature_map, kernel_size=feature_map.size(2))
            pooled.append(strongest.squeeze(2))

        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features)


## Train the model



### Hyperparameters
Several hyperparameters are used in training the TextCNN model:
- `embedding_dim`: The size of the word embeddings (e.g., 100, 300). This determines the dimensionality of word representations.
- `num_filters`: The number of filters in each convolutional layer. A higher number allows the model to capture more features.
- `num_classes`: The number of output classes for classification.
- `dropout`: The dropout rate used for regularization to prevent overfitting.
- `learning_rate`: The step size for the Adam optimizer. Typically set to 0.001.
- `num_epochs`: The number of training iterations over the dataset.
- `batch_size`: The number of samples processed before updating the model weights.


In [22]:
# Model, loss function, and optimizer
# Seed PyTorch's global RNG so weight initialisation, dropout masks and batch
# shuffling are repeatable between runs (GPU kernels may still introduce small
# non-deterministic differences).
torch.manual_seed(42)

embedding_dim = 100
num_filters = 100
# Take the class count from the fitted encoder, exactly as the RNN section
# does, so the output layer always matches the encoded targets.
num_classes = len(label_encoder.classes_)
model = TextCNN(embedding_dim, num_filters, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()  # enable dropout
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/20, Loss: 0.8121
Epoch 2/20, Loss: 0.7134
Epoch 3/20, Loss: 0.6860
Epoch 4/20, Loss: 0.6684
Epoch 5/20, Loss: 0.6539
Epoch 6/20, Loss: 0.6451
Epoch 7/20, Loss: 0.6320
Epoch 8/20, Loss: 0.6301
Epoch 9/20, Loss: 0.6196
Epoch 10/20, Loss: 0.6131
Epoch 11/20, Loss: 0.6118
Epoch 12/20, Loss: 0.6041
Epoch 13/20, Loss: 0.5991
Epoch 14/20, Loss: 0.5959
Epoch 15/20, Loss: 0.5923
Epoch 16/20, Loss: 0.5854
Epoch 17/20, Loss: 0.5861
Epoch 18/20, Loss: 0.5807
Epoch 19/20, Loss: 0.5787
Epoch 20/20, Loss: 0.5738


## Evaluate the model

In [23]:
# Switch to inference mode (disables dropout) and collect predictions for the
# whole test split without tracking gradients.
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for features, targets in test_loader:
        logits = model(features.to(device))
        # Index of the highest logit = predicted class id.
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7942


# RNNModel


### **Architecture of `RNNModel`**
This model is designed to process **sequential input data** (such as time-series data, natural language, or any ordered input) and output a classification prediction.

#### **1. Input Layer**
- The model takes input sequences of shape **(batch_size, sequence_length, input_size)**.
- `input_size`: The number of features in each time step.
- `batch_first=True` ensures that batch size is the first dimension.

#### **2. LSTM Layer**
- `self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)`
- This layer consists of **LSTM units** that process sequential data step-by-step, maintaining long-term dependencies.
- `hidden_size`: Defines the number of neurons in the LSTM hidden layers.
- `num_layers`: Specifies how many stacked LSTM layers are used.
- Outputs:
  - `output`: Contains the output of all time steps in the sequence.
  - `(h_n, c_n)`: The final hidden and cell states of the LSTM.

#### **3. Fully Connected (FC) Layer**
- `self.fc = nn.Linear(hidden_size, num_classes)`
- The last hidden state of the LSTM (corresponding to the final time step) is passed through a fully connected (linear) layer.
- `num_classes`: Determines the output size, typically the number of categories in classification problems.



In [17]:
class RNNModel(nn.Module):
    """LSTM classifier: the output at the last time step feeds a linear layer.

    Input is batch-first, `(batch, seq_len, input_size)`; the result holds one
    row of `num_classes` logits per sample.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # The final hidden/cell states returned by the LSTM are not needed.
        hidden_states, _ = self.lstm(x)
        # Select the last time step of every sequence.
        final_step = hidden_states[:, -1]
        return self.fc(final_step)

##  Train the model


### **Hyperparameters in the RNN Model**  

Hyperparameters are key settings that define how the model learns. In the given `RNNModel`, several hyperparameters affect the performance, efficiency, and accuracy of the model. Let's break them down:

---

### **1. Model Architecture Hyperparameters**
These define the structure of the neural network.

- **`input_size = 100`**  
  - Determines the number of features in each input time step.  
  - In this case, each input vector has 100 features.  

- **`hidden_size = 128`**  
  - Represents the number of neurons in the hidden layer of the LSTM.  
  - A larger hidden size allows the model to learn more complex patterns but increases computational cost.  

- **`num_layers = 1`**  
  - Defines the number of stacked LSTM layers.  
  - More layers allow the network to capture deeper sequential dependencies, but too many layers may lead to vanishing gradients or overfitting.  

- **`num_classes = len(label_encoder.classes_)`**  
  - Defines the output size, corresponding to the number of classes in a classification problem.  
  - The model outputs raw, unnormalized scores (logits) for each class; **CrossEntropyLoss** applies log-softmax to them internally when computing the loss.

---

### **2. Training Hyperparameters**
These define how the model learns from data.

- **`criterion = nn.CrossEntropyLoss()`**  
  - The loss function used to measure the difference between predicted outputs and true labels.  
  - Suitable for multi-class classification tasks.  

- **`optimizer = torch.optim.Adam(model.parameters(), lr=0.001)`**  
  - **Adam (Adaptive Moment Estimation)** is chosen as the optimizer, balancing speed and efficiency in training.  
  - The **learning rate (`lr=0.001`)** controls how much the model updates weights in response to loss gradients.  
  - A **higher learning rate** can make training faster but may cause the model to converge to a suboptimal solution. A **lower learning rate** improves stability but slows training.

- **`num_epochs = 20`**  
  - Defines the number of times the entire dataset is passed through the model.  
  - More epochs allow better learning but can lead to overfitting if too high.  

- **Batch Processing (`train_loader`)**  
  - The training loop iterates over batches of data rather than the entire dataset.  
  - Helps in faster computation and better generalization.  
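  - `batch_X.squeeze(1).to(device)` removes the singleton channel dimension that `TextDataset` adds for the CNN, restoring the `(batch_size, sequence_length, input_size)` shape the LSTM expects, and moves the batch to the training device.  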


In [20]:
# Seed PyTorch's global RNG so weight initialisation and batch shuffling are
# repeatable between runs (GPU kernels may still introduce small
# non-deterministic differences).
torch.manual_seed(42)
# Resolve the device in this cell so the RNN section does not rely on a
# variable left behind by the CNN training cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_size = 100
hidden_size = 128
num_layers = 1
num_classes = len(label_encoder.classes_)
model = RNNModel(input_size, hidden_size, num_layers, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        # Drop the channel axis added by TextDataset; the LSTM takes
        # (batch, seq_len, input_size).
        batch_X = batch_X.squeeze(1).to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}')

Epoch [1/20], Avg Loss: 0.9675
Epoch [2/20], Avg Loss: 0.6853
Epoch [3/20], Avg Loss: 0.6191
Epoch [4/20], Avg Loss: 0.5856
Epoch [5/20], Avg Loss: 0.5570
Epoch [6/20], Avg Loss: 0.5318
Epoch [7/20], Avg Loss: 0.5114
Epoch [8/20], Avg Loss: 0.4946
Epoch [9/20], Avg Loss: 0.4776
Epoch [10/20], Avg Loss: 0.4634
Epoch [11/20], Avg Loss: 0.4501
Epoch [12/20], Avg Loss: 0.4360
Epoch [13/20], Avg Loss: 0.4220
Epoch [14/20], Avg Loss: 0.4129
Epoch [15/20], Avg Loss: 0.4009
Epoch [16/20], Avg Loss: 0.3878
Epoch [17/20], Avg Loss: 0.3780
Epoch [18/20], Avg Loss: 0.3670
Epoch [19/20], Avg Loss: 0.3559
Epoch [20/20], Avg Loss: 0.3447


## Evaluate the model

In [21]:
def evaluate(model, test_loader, device):
    """Return the accuracy of `model` over `test_loader` as a percentage.

    Each batch has its axis 1 squeezed (when that axis has size 1) before it
    is fed to the model; the predicted class is the index of the largest
    output score.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            inputs = batch_X.squeeze(1).to(device)
            targets = batch_y.to(device)

            predicted = model(inputs).argmax(dim=1)

            seen += targets.size(0)
            hits += (predicted == targets).sum().item()

    return 100 * hits / seen

# Score the current `model` on the held-out split; `evaluate` reports a percentage.
accuracy = evaluate(model=model, test_loader=test_loader, device=device)
print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 83.53%
