### Install Pakage

In [None]:
!pip install hazm
!pip install tensorflow-gpu==2.0

Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[?25l[K     |█                               | 10 kB 23.2 MB/s eta 0:00:01[K     |██                              | 20 kB 24.3 MB/s eta 0:00:01[K     |███                             | 30 kB 17.6 MB/s eta 0:00:01[K     |████▏                           | 40 kB 15.6 MB/s eta 0:00:01[K     |█████▏                          | 51 kB 7.2 MB/s eta 0:00:01[K     |██████▏                         | 61 kB 8.4 MB/s eta 0:00:01[K     |███████▎                        | 71 kB 8.1 MB/s eta 0:00:01[K     |████████▎                       | 81 kB 9.0 MB/s eta 0:00:01[K     |█████████▎                      | 92 kB 9.1 MB/s eta 0:00:01[K     |██████████▍                     | 102 kB 7.2 MB/s eta 0:00:01[K     |███████████▍                    | 112 kB 7.2 MB/s eta 0:00:01[K     |████████████▍                   | 122 kB 7.2 MB/s eta 0:00:01[K     |█████████████▌                  | 133 kB 7.2 MB/s eta 0:00:01[K     

### import Pakage 

In [None]:
import numpy as np
import pandas as pd

from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, GlobalMaxPool1D
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
from hazm import *

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/persian_news.zip -d /content/drive/MyDrive

Archive:  /content/drive/MyDrive/persian_news.zip
   creating: /content/drive/MyDrive/persian_news/
  inflating: /content/drive/MyDrive/persian_news/dev.csv  
  inflating: /content/drive/MyDrive/persian_news/train.csv  
  inflating: /content/drive/MyDrive/persian_news/test.csv  


In [None]:
!unzip /content/drive/MyDrive/persian_news.zip -d /content/dataset/

Archive:  /content/drive/MyDrive/persian_news.zip
   creating: /content/dataset/persian_news/
  inflating: /content/dataset/persian_news/dev.csv  
  inflating: /content/dataset/persian_news/train.csv  
  inflating: /content/dataset/persian_news/test.csv  


In [None]:
train_data = pd.read_csv('/content/drive/MyDrive/persian_news/train.csv', encoding='utf-8', sep='\t')
train_data

Unnamed: 0.1,Unnamed: 0,content,label,label_id
0,0,به گزارش خبرنگار حوزه بهداشت و درمان گروه علمی...,پزشکی,7
1,1,به گزارش خبرنگار فوتبال و فوتسال گروه ورزشی با...,ورزشی,6
2,2,بهروز اکرمی، در گفتگو با خبرنگار اجتماعی باشگا...,اجتماعی,0
3,3,به گزارش خبرنگار حوزه شهری گروه اجتماعی باشگاه...,اجتماعی,0
4,4,به گزارش باشگاه خبرنگاران و به نقل از روابط عم...,فرهنگی هنری,5
...,...,...,...,...
13309,13309,به گزارش خبرنگار دولت باشگاه خبرنگاران رضا فرج...,سیاسی,3
13310,13310,به گزارش خبرنگار اقتصادی باشگاه خبرنگاران، باز...,اقتصادی,1
13311,13311,ایسوس همیشه سورپرایزهایی را برای کامپیوتکس کنا...,علمی فناوری,4
13312,13312,به گزارش حوزه مجلس گروه سیاسی باشگاه خبرنگاران...,سیاسی,3


### Read dataset

In [None]:
sentence_train = train_data['content']
label_train = train_data['label_id']

print('Number of training sentence: ', sentence_train.shape)
print('Number of training label: ', label_train.shape)


Number of training sentence:  (13314,)
Number of training label:  (13314,)


In [None]:
from collections import Counter
cnt = Counter(label_train)
cnt = dict(cnt)
print(cnt)

{7: 1688, 6: 1119, 0: 1757, 5: 2072, 4: 1973, 3: 1838, 2: 1600, 1: 1267}


In [None]:
# Convert dataframes to numpy arrays
sentence_train = np.asarray(sentence_train)
label_train = np.asarray(label_train)

In [None]:
# Prepare labels for categorical prediction
categorical_label_train = to_categorical(label_train, 8)
categorical_label_train

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

### Preprocessing 

In [None]:
import re
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stemmer = Stemmer()
# turn a doc into clean tokens
def clean_data(doc):
    doc = normalizer.normalize(doc) # Normalize document using Hazm Normalizer
    tokenized = word_tokenize(doc)  # Tokenize text
    # ['یه', 'ساله', 'خریدم', 'خیلی', 'ضعیف', 'است', 'بازی', 'هم', 'که', 'چی', 'بگم', 'هیچ', 'کدوم', 'رو', 'باز', 'نمیکنه']
    tokens = []
    for token in tokenized:
      token = re.sub("[،:.,;()/+]", " ", token) 
      token = re.sub(r"\!+","!", token)
      token = re.sub(r"\؟+","؟", token)
      token = re.sub(r"\u200c", " ", token)
      tokens.append(token)

    tokens = [w for w in tokens if not len(w) <= 1] # single character removal 
    tokens = [w for w in tokens if not w.isdigit()] # digit remove
    tokens = [lemmatizer.lemmatize(w) for w in tokens] # Lemmatize sentence words using Hazm Lemmatizer
    tokens = [stemmer.stem(w) for w in tokens] 
    tokens = ' '.join(tokens)
    return tokens

### Apply preprocessing to dataset




In [None]:
# Apply preprocessing step to training data
train_docs = np.empty_like(sentence_train)
for index, document in enumerate(sentence_train):
  train_docs[index] = clean_data(document)

### Set tokenizer and encode sentences


```
با کمي هزينه بيشتر يک گوشي سوني در همين رده بگيريد بهتر خواهد بود.
[7, 64, 664, 104, 16, 11, 240, 5, 191, 282, 68, 54, 131, 36, 37]
```



In [None]:
num_words = 2000

# Create the tokenizer
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_docs)
encoded_docs = tokenizer.texts_to_sequences(train_docs)
print(encoded_docs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Find maximum length of training sentences
max_length = max([len(s.split()) for s in train_docs])
max_length

18907

### Padding 

In [None]:
# Pad embeded training sequences
x_train_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
x_train_padded[1]

array([ 2, 45, 28, ...,  0,  0,  0], dtype=int32)

In [None]:
#vocabulary size
vocab_size = len(tokenizer.word_index)

In [None]:
valid_set = pd.read_csv('/content/drive/MyDrive/persian_news/dev.csv',encoding="utf-8",sep='\t')
valid_set

Unnamed: 0.1,Unnamed: 0,content,label,label_id
0,0,به گزارش حوزه بهداشت و درمان گروه علمی پزشکی ب...,پزشکی,7
1,1,به گزارش خبرنگار حوزه شهری گروه اجتماعی باشگاه...,اجتماعی,0
2,2,به گزارش گروه اجتماعی باشگاه خبرنگاران جوان، س...,اجتماعی,0
3,3,به گزارش گروه بین الملل باشگاه خبرنگاران به نق...,بین الملل,2
4,4,به گزارش خبرنگار راه و شهرسازی گروه اقتصادی با...,اقتصادی,1
...,...,...,...,...
1475,1475,به گزارش خبرنگارکلینیک باشگاه خبرنگاران؛ کم خو...,پزشکی,7
1476,1476,رضا قدیمی رئیس کمیته منابع انسانی ستاد اربعین ...,اجتماعی,0
1477,1477,به گزارش خبرنگار گروه علمی و دانشگاهی خبرگزاری...,علمی فناوری,4
1478,1478,به گزارش خبرنگار حوزه کلینیک گروه علمی پزشکی ب...,پزشکی,7


In [None]:
sentence_valid = np.asarray(valid_set['content'])
label_valid = np.asarray(valid_set['label_id'])

In [None]:
categorical_label_valid = to_categorical(label_valid, 8)
categorical_label_valid

array([[0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Apply preprocessing step to valid data
valid_docs = np.empty_like(sentence_valid)
for index, document in enumerate(sentence_valid):
  valid_docs[index] = clean_data(document)

In [None]:
# Embed valid sequences
encoded_docs1 = tokenizer.texts_to_sequences(valid_docs)
x_valid_padded = pad_sequences(encoded_docs1, maxlen=max_length, padding='post')

In [None]:
model_blstm = Sequential()

model_blstm.add(Embedding(vocab_size, 100, input_length=max_length))
model_blstm.add(Bidirectional(LSTM(100, return_sequences=True, name='lstm_layer')))

model_blstm.add(GlobalMaxPool1D())
model_blstm.add(Dropout(0.25))
model_blstm.add(Dense(300, activation="relu"))
model_blstm.add(Dropout(0.2))
model_blstm.add(Dense(8, activation='softmax'))

In [None]:
model_blstm.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=["categorical_accuracy"])

model_blstm.summary()
batch_size_blstm = 64
epochs_blstm = 20

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 18907, 100)        6405100   
_________________________________________________________________
bidirectional (Bidirectional (None, 18907, 200)        160800    
_________________________________________________________________
global_max_pooling1d (Global (None, 200)               0         
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               60300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 2

In [None]:
hist_blstm = model_blstm.fit(x_train_padded, categorical_label_train,validation_data=(x_valid_padded,categorical_label_valid),
                             batch_size=batch_size_blstm, epochs=epochs_blstm,
                             shuffle=True)

Train on 13314 samples, validate on 1480 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
test_data = pd.read_csv('/content/drive/MyDrive/persian_news/test.csv', encoding='utf-8', sep='\t')

x_test = test_data['content']
y_test = test_data['label_id']

In [None]:

print('Number of testing sentence: ', x_test.shape)
print('Number of testing label: ', y_test.shape)

Number of testing sentence:  (1644,)
Number of testing label:  (1644,)


In [None]:
# Convert dataframes to numpy arrays
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

In [None]:
# Applying preprocessing step to test data
test_docs = np.empty_like(x_test)
for index, document in enumerate(x_test):
  test_docs[index] = clean_data(document)

In [None]:
# Embed testing sequences
encoded_docs = tokenizer.texts_to_sequences(test_docs)
x_test_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
categorical_y_test = to_categorical(y_test, 8)

In [None]:
# Evaluate model
loss_blstm, acc_blstm = model_blstm.evaluate(x_test_padded, categorical_y_test, verbose=1)
print('Test Accuracy: %f' % (acc_blstm*100))

Test Accuracy: 96.167880


In [None]:
y_pred_blstm = model_blstm.predict_classes(x_test_padded)

In [None]:
text = []
true_label = []
pred_label = []
social = 0
Economical = 0
International = 0
Political = 0
technology = 0
art = 0
Sports = 0
medical = 0

for i in range(len(y_pred_blstm)):
  text.append(x_test[i])
  true_label.append(y_test[i])

  if y_pred_blstm[i] == 0:
    social +=1
    pred_label.append("اجتماعی")
  elif y_pred_blstm[i] == 1:
    Economical +=1
    pred_label.append("اقتصادی")
  elif y_pred_blstm[i] == 2:
    International +=1  
    pred_label.append("بین الملل")
  if y_pred_blstm[i] == 3:
    Political +=1
    pred_label.append("سیاسی")
  elif y_pred_blstm[i] == 4:
    technology +=1
    pred_label.append("علمی فناوری")
  elif y_pred_blstm[i] == 5:
    art +=1
    pred_label.append("فرهنگی هنری")
  if y_pred_blstm[i] == 6:
    Sports +=1
    pred_label.append("ورزشی")
  elif y_pred_blstm[i] == 7:
    medical +=1
    pred_label.append("پزشکی")





```
array([1, 0, 1, 1, 1, 2, 4, 2, 3, 0, 0, 0, 1, 2, 0, 1, 0, 1, 0, 4, 0, 4,
       1, 1, 1, 4, 0, 4, 1, 2, 1, 1, 4, 0, 1, 0, 1, 1, 0, 1, 1, 0, 2, 0,
       3, 4, 0, 4, 1, 1])
       ```



In [None]:
print("اجتماعی: " , (social/1644)*100)
print("اقتصادی: " , (Economical/1644)*100)
print("بین الملل: " , (International/1644)*100)
print("سیاسی: " , (Political/1644)*100)
print("علمی فناوری: " , (technology/1644)*100)
print("فرهنگی هنری: " , (art/1644)*100)
print("ورزشی: " , (Sports/1644)*100)
print("پزشکی: " , (medical/1644)*100)



اجتماعی:  12.530413625304138
اقتصادی:  9.549878345498783
بین الملل:  11.861313868613138
سیاسی:  13.80778588807786
علمی فناوری:  14.78102189781022
فرهنگی هنری:  15.875912408759124
ورزشی:  8.394160583941606
پزشکی:  13.199513381995134


In [None]:
dataFrame = pd.DataFrame({"text":text, "true label":true_label, "prediction label":pred_label})

In [None]:
dataFrame.to_excel("prediction2BLstm.xlsx", index=False)

#LSTM

In [None]:
# Embed valid sequences
encoded_docs1 = tokenizer.texts_to_sequences(valid_docs)
x_valid_padded = pad_sequences(encoded_docs1, maxlen=max_length, padding='post')

In [None]:
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, 100, input_length=max_length))
model_lstm.add(LSTM(100, return_sequences=True, name='lstm_layer'))
model_lstm.add(GlobalMaxPool1D())
model_lstm.add(Dropout(0.25))
model_lstm.add(Dense(300, activation="relu"))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(8, activation='softmax'))

In [None]:
model_lstm.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=["categorical_accuracy"])
model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 18907, 100)        6405100   
_________________________________________________________________
lstm_layer (LSTM)            (None, 18907, 100)        80400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 300)               30300     
_________________________________________________________________
dropout_3 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 8)                

In [None]:
hist_lstm = model_lstm.fit(
    x_train_padded, categorical_label_train,validation_data=(x_valid_padded,categorical_label_valid),
    batch_size=64, epochs=10,
    shuffle=True, verbose=1)

Train on 13314 samples, validate on 1480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate model
loss_lstm, acc_lstm = model_lstm.evaluate(x_test_padded, categorical_y_test, verbose=1)
print('Test Accuracy: %f' % (acc_lstm*100))

Test Accuracy: 95.620435


In [None]:
y_pred_lstm = model_lstm.predict_classes(x_test_padded)

In [None]:
text = []
true_label = []
pred_label = []
social = 0
Economical = 0
International = 0
Political = 0
technology = 0
art = 0
Sports = 0
medical = 0

for i in range(len(y_pred_lstm)):
  text.append(x_test[i])
  true_label.append(y_test[i])

   
  if y_pred_blstm[i] == 0:
    social +=1
    pred_label.append("اجتماعی")
  elif y_pred_blstm[i] == 1:
    Economical +=1
    pred_label.append("اقتصادی")
  elif y_pred_blstm[i] == 2:
    International +=1  
    pred_label.append("بین الملل")
  if y_pred_blstm[i] == 3:
    Political +=1
    pred_label.append("سیاسی")
  elif y_pred_blstm[i] == 4:
    technology +=1
    pred_label.append("علمی فناوری")
  elif y_pred_blstm[i] == 5:
    art +=1
    pred_label.append("فرهنگی هنری")
  if y_pred_blstm[i] == 6:
    Sports +=1
    pred_label.append("ورزشی")
  elif y_pred_blstm[i] == 7:
    medical +=1
    pred_label.append("پزشکی")


In [None]:
print("اجتماعی: " , (social/1644)*100)
print("اقتصادی: " , (Economical/1644)*100)
print("بین الملل: " , (International/1644)*100)
print("سیاسی: " , (Political/1644)*100)
print("علمی فناوری: " , (technology/1644)*100)
print("فرهنگی هنری: " , (art/1644)*100)
print("ورزشی: " , (Sports/1644)*100)
print("پزشکی: " , (medical/1644)*100)



اجتماعی:  12.530413625304138
اقتصادی:  9.549878345498783
بین الملل:  11.861313868613138
سیاسی:  13.80778588807786
علمی فناوری:  14.78102189781022
فرهنگی هنری:  15.875912408759124
ورزشی:  8.394160583941606
پزشکی:  13.199513381995134


In [None]:
dataFrame = pd.DataFrame({"text":text, "true label":true_label, "prediction label":pred_label})

In [None]:
dataFrame.to_excel("prediction2Lstm.xlsx", index=False)

#GRU

In [None]:
from tensorflow.keras.layers import GRU
model_GRU = Sequential()
model_GRU.add(Embedding(vocab_size, 100, input_length=max_length))
model_GRU.add(GRU(100, return_sequences=True, name='gru_layer'))
model_GRU.add(GlobalMaxPool1D())
model_GRU.add(Dropout(0.25))
model_GRU.add(Dense(300, activation="relu"))
model_GRU.add(Dropout(0.2))
model_GRU.add(Dense(8, activation='softmax'))

In [None]:
model_GRU.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=["categorical_accuracy"])
model_GRU.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 18907, 100)        6405100   
_________________________________________________________________
gru_layer (GRU)              (None, 18907, 100)        60600     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 100)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 300)               30300     
_________________________________________________________________
dropout_7 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 8)                

In [None]:
hist_GRU = model_GRU.fit(
    x_train_padded, categorical_label_train,validation_data=(x_valid_padded,categorical_label_valid),
    batch_size=64, epochs=20,
    shuffle=True, verbose=1)

Train on 13314 samples, validate on 1480 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluate model
loss_GRU, acc_GRU = model_GRU.evaluate(x_test_padded, categorical_y_test, verbose=1)
print('Test Accuracy: %f' % (acc_GRU*100))

Test Accuracy: 96.289539


In [None]:
y_pred_GRU = model_GRU.predict_classes(x_test_padded)

In [None]:
text = []
true_label = []
pred_label = []
social = 0
Economical = 0
International = 0
Political = 0
technology = 0
art = 0
Sports = 0
medical = 0

for i in range(len(y_pred_GRU)):
  text.append(x_test[i])
  true_label.append(y_test[i])

   
  if y_pred_blstm[i] == 0:
    social +=1
    pred_label.append("اجتماعی")
  elif y_pred_blstm[i] == 1:
    Economical +=1
    pred_label.append("اقتصادی")
  elif y_pred_blstm[i] == 2:
    International +=1  
    pred_label.append("بین الملل")
  if y_pred_blstm[i] == 3:
    Political +=1
    pred_label.append("سیاسی")
  elif y_pred_blstm[i] == 4:
    technology +=1
    pred_label.append("علمی فناوری")
  elif y_pred_blstm[i] == 5:
    art +=1
    pred_label.append("فرهنگی هنری")
  if y_pred_blstm[i] == 6:
    Sports +=1
    pred_label.append("ورزشی")
  elif y_pred_blstm[i] == 7:
    medical +=1
    pred_label.append("پزشکی")


In [None]:
print("اجتماعی: " , (social/1644)*100)
print("اقتصادی: " , (Economical/1644)*100)
print("بین الملل: " , (International/1644)*100)
print("سیاسی: " , (Political/1644)*100)
print("علمی فناوری: " , (technology/1644)*100)
print("فرهنگی هنری: " , (art/1644)*100)
print("ورزشی: " , (Sports/1644)*100)
print("پزشکی: " , (medical/1644)*100)



اجتماعی:  12.530413625304138
اقتصادی:  9.549878345498783
بین الملل:  11.861313868613138
سیاسی:  13.80778588807786
علمی فناوری:  14.78102189781022
فرهنگی هنری:  15.875912408759124
ورزشی:  8.394160583941606
پزشکی:  13.199513381995134


In [None]:
dataFrame = pd.DataFrame({"text":text, "true label":true_label, "prediction label":pred_label})

In [None]:
dataFrame.to_excel("prediction2GRU.xlsx", index=False)