In [1]:
import numpy as np
import pandas as pd
import re
from tensorflow.keras.utils import to_categorical
import string

In [2]:
dataset = pd.read_csv('train_Arabic_tweets_positive_20190413.tsv' , delimiter='\t',header =None , names = ['x' , 'text'])

In [3]:
#Removes HTML syntaxes
def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

#Removes URL data
def remove_url(data):
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#Removes Emojis
def remove_emoji(data):
    emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    data=emoji_clean.sub(r'',data)
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

dataset['text']=dataset['text'].apply(lambda z: remove_html(z))
dataset['text']=dataset['text'].apply(lambda z: remove_url(z))
dataset['text']=dataset['text'].apply(lambda z: remove_emoji(z))

In [4]:
# replace _ by whitespaces 
dataset['text'] = dataset.text.apply(lambda x : x.replace('_' , ' '))

In [5]:
# replace multiple whitespaces by one
dataset['text'] = dataset.text.apply(lambda x : ' '.join(x.split()))

In [6]:
def remove_abb(data):
    abb_clean= re.compile(r"\b[a-zA-Z]\.[a-zA-Z]\b")
    data=abb_clean.sub(r'',data)
    return data
dataset['text'] = dataset['text'].apply(lambda z: remove_abb(z))

In [7]:
dataset = dataset.text.apply(lambda z : z.lower())

In [8]:
#remove any word that starts with http
dataset=dataset.apply(lambda z: ' '.join([word for word in z.split() if not word.startswith('http')]))

In [None]:
dataset

0        نحن الذين يتحول كل ما نود أن نقوله إلى دعاء لل...
1        وفي النهاية لن يبقىٰ معك آحدإلا من رأىٰ الجمال...
2                                            من الخير نفسه
3        #زلزل الملعب نصرنا بيلعب كن عالي الهمه ولا ترض...
4        الشيء الوحيد الذي وصلوا فيه للعالمية هو : المس...
                               ...                        
22756    السحب الليلة على الايفون .. رتويت للمرفقة وطبق...
22757               لابسة احمر ليه يا ست انتي ايه المناسبة
22758    كلاام جمييل تستاهل(من احبه الله جعل محبته ف قل...
22759                       - ألطف صورة ممكن تعبر عن رمضان
22760    قال #الإمام ابن القيم -رحمه الله تعالى- : - " ...
Name: text, Length: 22761, dtype: object

In [9]:
tokens = dataset[0].lower().split()
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
for item in dataset:
    tokenz = item.lower().split()
    for token in tokenz:
        if token not in vocab:
            vocab[token] = index
            index += 1
inverse_vocab = {index: token for token, index in vocab.items()}
dataset['text_tokenized'] = dataset.apply(lambda x: [vocab[x] for x in x.lower().split()])
vocab_size = max(vocab.values())+ 1

In [10]:
vocab_size

48631

In [11]:
X = []
Y = []
for j in range(len(dataset)):
    try :
        for i in range(len(dataset.text_tokenized[j])):
            item = dataset.text_tokenized[j]
            if len(item[i:i+3]) == 3 and item[i+3] != None :
                X.append(item[i:i+3]) 
                Y.append(item[i+3])
    except : pass

In [None]:
len(X)

234140

In [12]:
class KerasBatchGenerator(object):
    def __init__(self , X,Y , batch_size , vocab_size , num_steps = 3):
      self.X = X
      self.Y = Y
      self.batch_size = batch_size
      self.vocab_size = vocab_size
      self.current_idx = 0
      self.num_steps = num_steps

    def generate(self):

        while True:
                if self.current_idx + 128 >= len(self.X):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                x = self.X[self.current_idx : self.current_idx + 128]
                x = np.array(x)
                # convert all of temp_y into a one hot representation
                y = to_categorical(Y[self.current_idx : self.current_idx + 128], num_classes=self.vocab_size)
                self.current_idx += 128
                yield (x, y)


In [13]:
from keras.models import Sequential
from keras.layers import Embedding , LSTM , Dense
from keras.callbacks import EarlyStopping
model = Sequential(
    [
        Embedding(vocab_size , 128 , input_length= 3) ,
        LSTM(128 , return_sequences = True ),
        LSTM(64  , return_sequences = True),
        LSTM(32 , return_sequences = True),
        LSTM(16 ),
        Dense(vocab_size, activation = 'softmax')
    ]
)

In [14]:
gen = KerasBatchGenerator(X , Y , 128 , vocab_size)

In [16]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam' ,metrics = ['accuracy'] )
model.fit(gen.generate()  , steps_per_epoch = int(234140/128) , epochs = 100 ) 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1a38a0cef70>

In [17]:
model.save("language_model_twitter.h5")


In [210]:
# concat indx_ , prediction
indx_ = np.array([[129 , 95]])
prediction = np.argmax(model.predict(indx_))
indx_ = np.concatenate((indx_ , [[prediction]]), axis = 1)

In [211]:
word = [inverse_vocab[i[0]] for i in indx_]
indx_

array([[129,  95, 892]], dtype=int64)

In [212]:
prediction

892

In [213]:
prediction = np.argmax(model.predict(indx_))
indx_ = np.concatenate((indx_ , [[prediction]]), axis = 1)

In [214]:
[inverse_vocab[i] for i in indx_.tolist()[0]]

['اللهم', 'الله', 'صباح', 'شيء']

In [215]:
prediction

612