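We use an Arabic product-review sentiment dataset (`PROD.csv`), downloaded from hadyelsahar's large Arabic sentiment analysis resources repository on GitHub.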

In [None]:
!wget https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/PROD.csv

--2022-01-07 19:01:50--  https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/PROD.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 527639 (515K) [text/plain]
Saving to: ‘PROD.csv’


2022-01-07 19:01:50 (13.5 MB/s) - ‘PROD.csv’ saved [527639/527639]



## Imports

In [None]:
import csv
import re

import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from sklearn.utils import shuffle
from tensorflow.keras.layers import Bidirectional, Dense, Dropout, Embedding, GRU
from tensorflow.keras.models import Sequential


## Read the Dataset

Preprocess a review by removing punctuation and other special characters, Latin letters, and redundant whitespace.

In [None]:
def process_review(review):
  """Normalise a raw review string before tokenisation.

  Steps, in order:
    1. drop every character that is neither a word character nor whitespace
       (punctuation, symbols, emoji, ...); digits and underscores are kept
       because they match ``\\w``;
    2. drop Latin letters (a-z, A-Z);
    3. collapse every run of whitespace, line breaks included, into a single
       space;
    4. strip leading/trailing whitespace.

  Args:
    review: the raw review text.

  Returns:
    The cleaned text; an empty string when nothing is left.
  """
  out = re.sub(r"[^\w\s]", '', review)
  out = re.sub(r"[a-zA-Z]", '', out)
  # Line breaks are handled by the whitespace collapse below. Deleting them
  # outright (as was done previously) glued the last word of one line to the
  # first word of the next.
  out = re.sub(r"\s+", ' ', out)
  return out.strip()

In [None]:
# Maximum number of positive reviews that are considered, so that the two
# classes end up roughly the same size.
# NOTE(review): 862 is presumably the number of negative reviews in PROD.csv - confirm.
MAX_POSITIVE = 862

reviews = []
labels  = []
count = 0      # never updated in this cell; kept in case other cells reference it
pos_count = 0  # positive rows seen so far (counted before the empty-review check)

# The reviews are Arabic text, so read the file as UTF-8 explicitly instead of
# relying on the platform default; newline='' is what the csv module expects.
with open('PROD.csv', 'r', encoding='utf-8', newline='') as csv_file:
  rows = csv.reader(csv_file, delimiter = ",")

  #ignore the first line (header)
  next(rows, None)

  for row in rows:
    #preprocess the data
    review = process_review(row[0])
    label  = int(row[1])

    #only allow positive and negative reviews,
    #also make them the same length
    if label == 1:
      pos_count += 1
      if pos_count > MAX_POSITIVE:
        continue
    elif label == 0:
      # label 0 rows are dropped entirely
      continue
    else:
      # remaining labels are shifted up by one (presumably -1 -> 0, the negative class)
      label += 1

    #reviews that are empty after cleaning carry no signal
    if review == "":
      continue

    reviews.append(review)
    labels.append(label)

# Concatenation of all kept reviews in file order (built before shuffling).
all_text = "".join(review + ' \n ' for review in reviews)

#shuffle the data
reviews, labels = shuffle(reviews, labels)
print(len(reviews))

1648


Look at the data

In [None]:
# Show the first ten (review, label) pairs; slicing keeps this safe even if
# fewer than ten reviews survived the filtering.
for review, label in zip(reviews[:10], labels[:10]):
  print(review, label)

ممتازة ورائعة 1
السلعة جيدة ولكن ليست السعلة التى اختارتها مو اول مرة تسوها معى 0
رأيت هذه السلعة في أسواق التجهيزات الصوتية بسعر لا يتعدى ال 1150 ريال سعودي 0
احببت هذه السللعه كثيرا ممتازه 1
لم اتوقعها هكذا كنت اتمناها بمواصفات ذكية تناسب مستوى الايفون 0
الجهاز رائع جدا واصلي ومعه ضمان عامان 1
قمت بشراء أكثر من عطر من العربية للعود مثل كلماتي و سحر الكلمات و مستي وود ثبات الرائحة لهذه العطور غير جيدة إذا كنت تبحث عن عطر يدوم وقت أكثر على الجسم أو الملابس فعطور العربية للعود ليست اختيار مناسب على الاطلاق 0
سلعه جيده لكنها باااهظة الثمن وهي صناعه صينيه 0
جيدة جدا وجميلة وانصح الجميع بها وهي لاتسبب خشونه لشعر واوالوانها زاهية تماما ورائعة لمناسبات والاعياد 1
انها عملية وجيدة وسهلة التنظيف 1


In [None]:
#save the dataset, one "review, label" pair per line
# UTF-8 is requested explicitly because the reviews are Arabic text.
with open('product_review.txt', 'w', encoding='utf-8') as f:
  for review, label in zip(reviews, labels):
    # A plain newline terminates each record; the previous format string
    # wrote literal quote characters around the line break.
    f.write(f"{review}, {label}\n")

## Create Sequences
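Create integer sequences from the reviews using the tokenizer's full vocabulary (no `num_words` limit is applied), then zero-pad every sequence at the end to the length of the longest review.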

In [None]:
# Fit the vocabulary on the cleaned reviews (lower-cased, split on spaces).
tknzr = Tokenizer(split=" ", lower=True)
tknzr.fit_on_texts(reviews)

# Turn every review into its list of word indices, then append zeros
# ('post' padding) so all rows share the same length.
X = pad_sequences(tknzr.texts_to_sequences(reviews), value=0, padding='post')

## Create Numpy Arrays

In [None]:
# Labels as an array, aligned row-for-row with the padded sequences.
y = np.array(labels)
# Padded index sequences as an array (one row per review).
X = np.array(X)

# (number of reviews, padded sequence length)
print(X.shape)

(1648, 113)


## Create the model

In [None]:
# Tokenizer word indices start at 1 and index 0 is used for padding, so the
# largest index fed to the embedding is len(word_index). The embedding table
# therefore needs len(word_index) + 1 rows; with one row fewer the last word
# of the vocabulary is out of range.
vocab_size = len(tknzr.word_index) + 1

# Binary sentiment classifier:
#   word indices -> 32-d embeddings -> bidirectional GRU -> dense -> sigmoid
model = Sequential()
model.add(Embedding(vocab_size, 32))
model.add(Bidirectional(GRU(units = 32)))
model.add(Dense(32, activation = 'tanh'))
model.add(Dropout(0.3))
# Single sigmoid unit: output is the predicted probability of label 1.
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Print the layers with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          217056    
                                                                 
 bidirectional (Bidirectiona  (None, 64)               12672     
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 231,841
Trainable params: 231,841
Non-trainable params: 0
__________________________________________________

## Train the model

In [None]:
# Train for 7 epochs in batches of 128, holding out 10% of the data for validation.
model.fit(X, y, validation_split = 0.1, epochs = 7, batch_size= 128, shuffle = True)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7fc21039b810>

## Tests

In [None]:
# Display names indexed by predicted label: 0 -> negative, 1 -> positive.
class_names = ['سلبي' , 'إيجابي']
def classify(sentence):
  """Predict the sentiment of one sentence and print the result.

  The sentence is cleaned with process_review, converted to word indices with
  the fitted tokenizer (the same path used for the training data, so words
  outside the vocabulary are skipped instead of raising KeyError), padded to
  the training sequence length and passed to the model.

  Args:
    sentence: raw text to classify.

  Prints:
    The class name followed by the raw sigmoid output, or a short notice when
    the sentence contains no word known to the tokenizer.
  """
  sentence = process_review(sentence)
  sequence = tknzr.texts_to_sequences([sentence])
  if not sequence[0]:
    # Nothing but padding would reach the model, so a score would be meaningless.
    print("No known words in the input; cannot classify.")
    return
  sequence = pad_sequences(sequence, maxlen = X.shape[1], padding='post', value=0)
  pred = model.predict(sequence)[0][0]
  # Round the probability to 0 or 1 to pick the class name.
  print(class_names[np.round(pred).astype('int')], pred)
  

In [None]:
# "Beautiful"
classify("جميل")

إيجابي 0.7856761


In [None]:
# "The product was good"
classify("السلعة كانت جيدة")

إيجابي 0.76657635


In [None]:
# "Bad"
classify("سيء")

سلبي 0.13229607


In [None]:
# "It's not bad" / "It's okay"
classify("لا بأس بها")

سلبي 0.13391067


In [None]:
# "I was surprised by the quality of the product"
classify("تفاجأت بجودة المنتج")

سلبي 0.36554196


In [None]:
import csv
def create_csv(file, dict):
    """Write a mapping to a CSV file, one ``key,value`` row per entry.

    Args:
        file: path of the CSV file to create (overwritten if it exists).
        dict: the mapping to export. The name shadows the builtin but is kept
            so existing callers are unaffected.
    """
    # newline='' is what the csv module requires to avoid blank rows on
    # Windows; UTF-8 is requested explicitly so non-ASCII keys are written
    # the same way on every platform.
    with open(file, 'w', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerows(dict.items())

In [None]:
# Export the tokenizer's word -> index mapping to word2index.csv.
create_csv("word2index.csv", tknzr.word_index)

In [None]:
# Save the trained model to the file keras.h5.
model.save("keras.h5")