<a href="https://colab.research.google.com/github/parvvaresh/Classification-Persian-News/blob/main/DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import and install libs

In [1]:
from google.colab import output
!pip install hazm
!pip install arabic-reshaper
!pip install python-bidi
!pip install persian_wordcloud
!pip install langdetect
!pip install mapply


import warnings
import hazm
from hazm import *
import re
import string
import glob
from hazm import stopwords_list
import pandas as pd
import time
import os
import mapply



import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

from google.colab import drive
drive.mount('/content/drive')


output.clear()



# Initialization

1. **Parallel Processing Setup:**  
   - `mapply.init(...)` initializes a parallel processing framework with multiple workers (`n_workers=-1` uses all available processors), processing data in chunks (`chunk_size=100` and `max_chunks_per_worker=8`), with a visible progress bar.

2. **Text Cleaning Components:**  
   - **Punctuations:**  
     - Combines English (`string.punctuation`) and Persian punctuation symbols (`persian_punctuations`) into a single list for later removal.
   - **Diacritics:**  
     - Compiles a regex (`arabic_diacritics`) to remove common Arabic diacritics (e.g., Tashdid, Fatha, etc.).
   - **Lemmatization and Normalization:**  
     - Initializes `hazm.Lemmatizer()` for lemmatizing Persian words.  
     - Initializes a `Normalizer()` to standardize text.


3. **Stopwords Loading:**  
   - Uses `glob` to find all text files with Persian stopwords in a specific folder.
   - Reads each file and compiles a master list of stopwords.
   - Removes newline characters from each stopword.
   - Extends the list with additional stopwords from `stopwords_list()` from hazm.



This setup prepares your environment to clean, normalize, and process Persian text data efficiently in a parallelized manner.

In [2]:
mapply.init(
    n_workers=-1,
    chunk_size=100,
    max_chunks_per_worker=8,
    progressbar=True,
)

persian_punctuations = '''`÷×؛#<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
punctuations_list = string.punctuation + persian_punctuations
arabic_diacritics = re.compile("""
                          ّ    | # Tashdid
                          َ    | # Fatha
                          ً    | # Tanwin Fath
                          ُ    | # Damma
                          ٌ    | # Tanwin Damm
                          ِ    | # Kasra
                          ٍ    | # Tanwin Kasr
                          ْ    | # Sukun
                          ـ     # Tatwil/Kashida
                      """, re.VERBOSE)
lemmatizer = hazm.Lemmatizer()
normalizer = Normalizer()


file_list = glob.glob('/content/drive/MyDrive/NLP/persian_stopwords' + '/*.txt')

stop_words = []

for file_path in file_list:
    with open(file_path) as f:
        stop_words.extend(f.readlines())

for i in range(len(stop_words)):
    stop_words[i]=stop_words[i].replace('\n','')

stop_words.extend(stopwords_list())
output.clear()

In [3]:
df = pd.read_csv("/content/drive/MyDrive/get news/farsnews/farsnews_fainal.csv")

df = df.drop(['Unnamed: 0', 'date'], axis=1)
df = df.dropna()
df.head(5)

Unnamed: 0,title,text,label
0,قارایی: در تلاشم تا سیاه نمایی درباره ایران با...,خبرگزاری فارس - گروه هنر و رسانه - علی عبدالهی...,culture-media
1,توجه ویژه به اقوام و چهره های مردمی در فصل جدی...,به گزارش خبرگزاری فارس، پویان هدایتی، تهیه کن...,culture-media
2,جان فدا| ویژه برنامه هاى تلویزیون در سومین سال...,به گزارش خبرنگار رادیو و تلویزیون خبرگزاری فار...,culture-media
3,محمد علی صائب رئیس خبرگزاری صدا وسیما شد,به گزارش خبرگزاری فارس، علیرضا خدابخشی، معاون ...,culture-media
4,«همراه با خاطره ها» تمدید شد,به گزارش خبرگزاری فارس به نقل از روابط عمومی و...,culture-media


In [4]:
label_counts = df["label"].value_counts().reset_index()
label_counts.columns = ["label", "count"]
label_counts

Unnamed: 0,label,count
0,sports,21396
1,social,12801
2,culture-media,11324
3,economy,10163
4,politics,9258


In [5]:
temp = df.copy()

df = pd.DataFrame()

gb = temp.groupby(['label'])


#this is for me please replace to (9258)
min_size = 9258
for label in set(temp["label"]):
  cash = gb.get_group(label).head(min_size)
  df = df._append(cash, ignore_index=True)


df.head(5)


  cash = gb.get_group(label).head(min_size)
  cash = gb.get_group(label).head(min_size)


Unnamed: 0,title,text,label
0,قارایی: در تلاشم تا سیاه نمایی درباره ایران با...,خبرگزاری فارس - گروه هنر و رسانه - علی عبدالهی...,culture-media
1,توجه ویژه به اقوام و چهره های مردمی در فصل جدی...,به گزارش خبرگزاری فارس، پویان هدایتی، تهیه کن...,culture-media
2,جان فدا| ویژه برنامه هاى تلویزیون در سومین سال...,به گزارش خبرنگار رادیو و تلویزیون خبرگزاری فار...,culture-media
3,محمد علی صائب رئیس خبرگزاری صدا وسیما شد,به گزارش خبرگزاری فارس، علیرضا خدابخشی، معاون ...,culture-media
4,«همراه با خاطره ها» تمدید شد,به گزارش خبرگزاری فارس به نقل از روابط عمومی و...,culture-media


In [6]:
label_counts = df["label"].value_counts().reset_index()
label_counts.columns = ["label", "count"]
label_counts

Unnamed: 0,label,count
0,culture-media,9258
1,social,9258
2,politics,9258
3,economy,9258
4,sports,9258


In [7]:
class preprocessing:
  def __init__(self):
    pass

  def _remove_diacritics(self, text):
    text = re.sub(arabic_diacritics, '', text)
    return text


  def _remove_crash_data(self, text):
    if isinstance(text, str):
      return text
    else:
      return None

  def _remove_punctuations(self, text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

  def _remove_repeating_char(self, text):
    return re.sub(r'(.)\1+', r'\1', text)


  def _normalize_persian(self, text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ي", "ی", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ئ", "ی", text)# Initialization

1. **Parallel Processing Setup:**
   - `mapply.init(...)` initializes a parallel processing framework with multiple workers (`n_workers=-1` uses all available processors), processing data in chunks (`chunk_size=100` and `max_chunks_per_worker=8`), with a visible progress bar.

2. **Text Cleaning Components:**
   - **Punctuations:**
     - Combines English (`string.punctuation`) and Persian punctuation symbols (`persian_punctuations`) into a single list for later removal.
   - **Diacritics:**
     - Compiles a regex (`arabic_diacritics`) to remove common Arabic diacritics (e.g., Tashdid, Fatha, etc.).
   - **Lemmatization and Normalization:**
     - Initializes `hazm.Lemmatizer()` for lemmatizing Persian words.
     - Initializes a `Normalizer()` to standardize text.


3. **Stopwords Loading:**
   - Uses `glob` to find all text files with Persian stopwords in a specific folder.
   - Reads each file and compiles a master list of stopwords.
   - Removes newline characters from each stopword.
   - Extends the list with additional stopwords from `stopwords_list()` from hazm.



This setup prepares your environment to clean, normalize, and process Persian text data efficiently in a parallelized manner.
    text = re.sub("ة", "ه", text)
    text = re.sub("ك" ,"ک" , text)
    text = re.sub("[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", " ", text)
    text = re.sub("[^\S\n\t]+", ' ', text)
    text = normalizer.normalize(text)
    return text


  def _tokenize(self, text):
    return word_tokenize(text)

  def _remove_stopwords(self, words):
    return [word  for word in words if word not in stop_words and len(word) > 2]

  def _lemmatizer(self, words):
    result = list()
    for token in words:
      result.append(lemmatizer.lemmatize(token))
    return self._remove_stopwords(result)

In [8]:

pp = preprocessing()
df['title - preproces'] = df['title'].mapply(pp._remove_diacritics)
df['title - preproces'] = df['title - preproces'].mapply(pp._remove_punctuations)
df['title - preproces'] = df['title - preproces'].mapply(pp._remove_repeating_char)
df['title - preproces'] = df['title - preproces'].mapply(pp._normalize_persian)
df['title - preproces'] = df['title - preproces'].mapply(pp._tokenize)
df['title - preproces'] = df['title - preproces'].mapply(pp._remove_stopwords)
df['title - preproces'] = df['title - preproces'].mapply(pp._lemmatizer)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [23]:
# Data
sentences = df['title - preproces'].tolist()
w2v_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

# Find the maximum sentence length
max_length = max(len(tokens) for tokens in sentences)

def transform_text_to_tensor(tokens, model, max_length):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    while len(vectors) < max_length:
        vectors.append(np.zeros(model.vector_size))
    return torch.tensor(vectors, dtype=torch.float)

tensor_list = [transform_text_to_tensor(tokens, w2v_model, max_length) for tokens in sentences]
final_tensor = torch.stack(tensor_list)
print(final_tensor.shape)  # Shape: (number of sentences, max_length, 100)

torch.Size([46290, 24, 100])


In [24]:
# Split the data into Train and Test
X_train, X_test, y_train, y_test = train_test_split(final_tensor, df["label"].values, test_size=0.2, random_state=42)

# Convert to torch.Tensor
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X.unsqueeze(1)  # Adding the channel dimension: (batch_size, 1, seq_len, embedding_dim)
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

In [25]:
# Define the CNN model
class TextCNN(nn.Module):
    def __init__(self, embedding_dim, num_filters, num_classes, dropout=0.5):
        super(TextCNN, self).__init__()

        # 3 filters with different sizes
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(3, embedding_dim))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(4, embedding_dim))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(5, embedding_dim))

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * 3, num_classes)

    def forward(self, x):
        # x is already [batch_size, 1, seq_len, embedding_dim]
        x1 = F.relu(self.conv1(x)).squeeze(3)
        x2 = F.relu(self.conv2(x)).squeeze(3)
        x3 = F.relu(self.conv3(x)).squeeze(3)

        # Apply max pooling
        x1 = F.max_pool1d(x1, kernel_size=x1.size(2)).squeeze(2)
        x2 = F.max_pool1d(x2, kernel_size=x2.size(2)).squeeze(2)
        x3 = F.max_pool1d(x3, kernel_size=x3.size(2)).squeeze(2)

        # Concatenate the features
        x = torch.cat([x1, x2, x3], dim=1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits


In [26]:
# Model, loss function, and optimizer
embedding_dim = 100
num_filters = 100
num_classes = len(set(df["label"]))
model = TextCNN(embedding_dim, num_filters, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/5, Loss: 1.0040
Epoch 2/5, Loss: 0.8865
Epoch 3/5, Loss: 0.8564
Epoch 4/5, Loss: 0.8310
Epoch 5/5, Loss: 0.8158


In [27]:
# Evaluate the model
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7152


In [28]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNNModel, self).__init__()
        # LSTM layer: input_size is the embedding dimension (100), batch_first=True for (batch_size, seq_len, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # Fully connected layer to map the last hidden state to the number of classes
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Input x shape: (batch_size, seq_len, input_size)
        # Output shape: (batch_size, seq_len, hidden_size), hidden and cell states
        output, (h_n, c_n) = self.lstm(x)
        # Take the output at the last time step: (batch_size, hidden_size)
        last_output = output[:, -1, :]
        # Pass through the linear layer to get class logits: (batch_size, num_classes)
        out = self.fc(last_output)
        return out

In [29]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model parameters
input_size = 100  # Word2Vec embedding dimension
hidden_size = 128  # Arbitrary choice, can be tuned
num_layers = 1    # Single-layer LSTM
num_classes = len(label_encoder.classes_)  # Number of unique labels
model = RNNModel(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        # Move data to device and remove the extra channel dimension
        batch_X = batch_X.squeeze(1).to(device)  # Shape: (batch_size, max_length, 100)
        batch_y = batch_y.to(device)             # Shape: (batch_size,)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_X)  # Logits: (batch_size, num_classes)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Compute average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}')

Epoch [1/5], Avg Loss: 1.1793
Epoch [2/5], Avg Loss: 0.8708
Epoch [3/5], Avg Loss: 0.7906
Epoch [4/5], Avg Loss: 0.7530
Epoch [5/5], Avg Loss: 0.7209


In [30]:
def evaluate(model, test_loader, device):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch_X, batch_y in test_loader:
            # Move data to device and remove the extra channel dimension
            batch_X = batch_X.squeeze(1).to(device)  # Shape: (batch_size, max_length, 100)
            batch_y = batch_y.to(device)             # Shape: (batch_size,)

            # Forward pass
            outputs = model(batch_X)  # Logits: (batch_size, num_classes)
            _, predicted = torch.max(outputs.data, 1)  # Predicted class indices

            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

        accuracy = 100 * correct / total
        return accuracy

# Evaluate after training (example usage)
accuracy = evaluate(model, test_loader, device)
print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 73.77%
