<a href="https://colab.research.google.com/github/mamadoz79/NLP-CA/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Bidirectional, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
!pip install hazm
from hazm import word_tokenize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 14.1 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 50.9 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 32.4 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394487 sha256=96da5ce36b1c5e16d2a9ae5967bc6a96dfd2a31f95de68741fc1b0c122e1a3ad
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86

In [3]:
# Mount Google Drive at /content/drive; the tokenizer and dataset cells read
# their inputs from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the Persian stop-word list into a set (one stripped entry per line) so
# membership checks are cheap. The `with` block guarantees the handle is closed
# once the lines are read, and the path is absolute like the other Drive paths
# so it does not depend on the current working directory.
with open("/content/drive/MyDrive/NLP/PersianStopWords.txt", encoding='utf-8') as file:
    stopwords = {line.strip() for line in file}

In [6]:
# Restore the tokenizer object that was pickled to Drive. Despite the .h5
# extension, the file is read with pickle here, not as HDF5.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if its origin is trusted.
with open('/content/drive/MyDrive/NLP/tokenizer.h5', 'rb') as f:
    tokenizer = pickle.load(f)

In [7]:
# Read the corpus, then apply two row filters in sequence:
#   1. keep rows whose category has more than 1000 rows in the file;
#   2. of those, keep rows whose text is at most 10,000 characters long.
# The original row labels are preserved (the index is not reset).
data = (
    pd.read_csv("/content/drive/MyDrive/NLP/hamshahri.csv")
    .loc[lambda frame: frame.groupby('cat')['cat'].transform('count') > 1000]
    .loc[lambda frame: frame['corpus'].str.len() <= 10_000]
)
# Number of distinct categories that survived both filters.
num_unique_categories = data['cat'].nunique()

In [8]:
# One slot per known word plus one extra
# (presumably for the 0 index used as padding — TODO confirm).
vocab_size = 1 + len(tokenizer.word_index)
# Encode every document as a list of integer word ids.
data_to_sequences = tokenizer.texts_to_sequences(data['corpus'].tolist())
# Length of the longest encoded document.
maxlen = max(map(len, data_to_sequences))

In [9]:
# Bring every encoded document to exactly `maxlen` ids. The padding and
# truncating sides are spelled out; 'pre' is the library default for both.
pad_data_to_sequences = pad_sequences(
    data_to_sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

In [11]:
# One-hot encode the category column; one output column per distinct category.
Y = pd.get_dummies(data['cat']).to_numpy()

In [29]:
# Keep the one-hot frame so its column labels can be used to map a
# class index back to its category name.
Y_one_hot = pd.get_dummies(data['cat'])
category_types = Y_one_hot.columns.to_numpy()

In [12]:
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    pad_data_to_sequences,
    Y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Text classifier: embedding -> bidirectional LSTM over the whole sequence ->
# max over time -> dense head with one softmax unit per category.
model = Sequential([
    Embedding(vocab_size, 50, input_length=maxlen),
    # return_sequences=True keeps the per-timestep outputs for the pooling layer.
    Bidirectional(LSTM(200, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(200, activation='relu'),
    Dropout(0.5),
    Dense(num_unique_categories, activation='softmax'),
])

In [14]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2234, 50)          22836750  
                                                                 
 bidirectional (Bidirectiona  (None, 2234, 400)        401600    
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 400)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 400)               0         
                                                                 
 dense (Dense)               (None, 200)               80200     
                                                                 
 dropout_1 (Dropout)         (None, 200)               0

In [16]:
# Train for 7 epochs. The held-out split (X_test, y_test) doubles as the
# validation set that is evaluated after each epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=7,
    validation_data=(X_test, y_test),
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [17]:
model.save('model.h5')
!cp -r "/content/model.h5" "/content/drive/MyDrive/NLP/model.h5"

In [32]:
def preprocess(text, stopwords):
  """Tokenize ``text`` with hazm's ``word_tokenize`` and drop stop words.

  Args:
    text: the raw text to clean.
    stopwords: container of tokens to remove (checked with ``in``).

  Returns:
    The remaining tokens, in order, joined by single spaces.
  """
  kept_tokens = (token for token in word_tokenize(text) if token not in stopwords)
  return ' '.join(kept_tokens)

In [33]:
def prediction(texts, stopwords):
  """Predict the category of a single raw text with the trained model.

  Args:
    texts: one document as a string (despite the plural name, it is handed to
      ``preprocess`` as a single text).
    stopwords: container of tokens that ``preprocess`` removes before encoding.

  Returns:
    NumPy array with the entry of ``category_types`` whose predicted score is
    highest (more than one entry only if several scores tie exactly).
  """
  cleaned_text = preprocess(texts, stopwords)
  text_to_sequences = tokenizer.texts_to_sequences([cleaned_text])
  # Pad to the same length the Embedding layer was built with (`maxlen`)
  # instead of a hard-coded number that silently goes stale when the data changes.
  pad_text_to_sequences = pad_sequences(text_to_sequences, maxlen=maxlen)
  # A single row goes in, so take the single row of class scores that comes out.
  category = model.predict(pad_text_to_sequences)[0]
  # The boolean mask keeps the array-shaped return value callers already receive.
  return category_types[category == category.max()]

In [34]:
# Pick the document whose row label is 2 (label-based: `data` keeps the
# original CSV row labels after filtering).
example_text = data.loc[2, 'corpus']

In [35]:
# Predicted category for the example document.
prediction(example_text, stopwords)

array(['adabh'], dtype=object)

In [36]:
# Ground-truth category of the row labelled 2, for comparison with the prediction.
data.loc[2, 'cat']

'adabh'