In [2]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from keras import models
from keras import layers
from keras import callbacks
from keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
import matplotlib.pyplot as plt
from nltk import word_tokenize, sent_tokenize
import pickle
from sklearn.metrics import classification_report


In [3]:
# Colab only: attach Google Drive so the paths under /content/drive/ used below resolve.
from google.colab import drive

drive.mount(mountpoint="/content/drive/")

Mounted at /content/drive/


In [4]:
# Path of the tweets CSV on the mounted Google Drive; passed to pd.read_csv in the next cell.
tweets_data_path = "/content/drive/MyDrive/preprocessed_tweets.csv"

In [5]:
# Load the tweets dataset into a DataFrame.
df = pd.read_csv(tweets_data_path)
# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() only appends a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458197 entries, 0 to 458196
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    458197 non-null  int64 
 1   Unnamed: 0.1  458197 non-null  int64 
 2   id            458197 non-null  int64 
 3   dialect       458197 non-null  int64 
 4   tweets        457992 non-null  object
dtypes: int64(4), object(1)
memory usage: 17.5+ MB
None


In [6]:
# The `tweets` column contains missing values (NaN). A bare astype(str) would turn
# them into the literal string "nan", which the tokenizer would then treat as a
# real word. Replace them with empty strings so the row count, and therefore the
# alignment with the labels, is unchanged.
features = df["tweets"].fillna("").values.astype(str)
# One-hot encode the dialect ids (one column per distinct dialect value).
lables = pd.get_dummies(df["dialect"]).values

In [7]:
# vocab_size caps the vocabulary used by the tokenizer and sizes the embedding layer;
# max_length is the fixed sequence length passed to pad_sequences.
vocab_size = 20000
max_length = 300
# "\]" is not a valid escape sequence in a normal string literal (DeprecationWarning,
# SyntaxWarning from Python 3.12). Writing the backslash as "\\" keeps the exact
# same filter characters while making the literal well-formed.
tokenizer = Tokenizer(
    num_words=vocab_size,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=False,
)
# NOTE(review): the vocabulary is fitted on every tweet, before the train/val/test
# split below, so validation and test text influence the word index.
tokenizer.fit_on_texts(features)

In [8]:
# Report how many distinct tokens the tokenizer's word index holds.
print('Found %s unique tokens.' % (len(tokenizer.word_index),))

Found 422859 unique tokens.


In [9]:
# Hold out 10% of the tweets as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    lables,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [10]:
# Carve a validation set (10%) out of the remaining training data, same fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [11]:
# Shapes of the one-hot label arrays for the train / validation / test splits.
tuple(labels_split.shape for labels_split in (y_train, y_val, y_test))

((371139, 18), (41238, 18), (45820, 18))

In [12]:
# Shapes of the raw-text arrays for the train / validation / test splits.
tuple(text_split.shape for text_split in (X_train, X_val, X_test))

((371139,), (41238,), (45820,))

In [13]:
# Convert the training tweets to integer sequences and pad them to max_length.
X_train_tok = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_length)
print('Shape of data tensor:', X_train_tok.shape)

Shape of data tensor: (371139, 300)


In [14]:
# Convert the validation tweets to integer sequences and pad them to max_length.
X_val_tok = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_length)
print('Shape of data tensor:', X_val_tok.shape)

Shape of data tensor: (41238, 300)


In [15]:
# Convert the test tweets to integer sequences and pad them to max_length.
X_test_tok = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_length)
print('Shape of data tensor:', X_test_tok.shape)

Shape of data tensor: (45820, 300)


**LSTM Model**

In [20]:
# Training settings: epochs per fit() call, embedding output dimension, mini-batch size.
epochs, output_dim, batch_size = 1, 100, 64

In [21]:
# Save the model to Drive after every epoch whose training loss beats the best seen so far.
# `period` is a deprecated argument that newer Keras releases no longer accept;
# save_freq='epoch' is the supported way to run the check once per epoch.
# NOTE(review): validation data is passed to fit(), so monitor='val_loss' may be the
# better criterion for "best" -- confirm intent before changing it.
checkpoint = ModelCheckpoint(
    "/content/drive/MyDrive/Arabic_dialect_models/lstm_model.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
    save_freq='epoch',
)



In [22]:
# Restore a previously saved model from Drive.
# NOTE(review): the next cell rebinds `model` to a freshly built Sequential, so this
# load only has an effect if that cell is skipped.
model = models.load_model(
    filepath='/content/drive/MyDrive/Arabic_dialect_models/lstm_best_model.h5'
)



In [None]:
# Size the softmax head from the one-hot label matrix instead of hard-coding 18,
# so the output layer always matches the number of label columns in y_train.
num_classes = y_train.shape[1]

# Embedding -> spatial dropout -> single LSTM layer -> softmax over the dialect classes.
model = models.Sequential([
    Embedding(vocab_size, output_dim, input_shape=(max_length,)),
    SpatialDropout1D(0.5),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(num_classes, activation='softmax'),
])
# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train on the padded training sequences, validating on the held-out validation split;
# the checkpoint callback writes the model to Drive.
history = model.fit(
    X_train_tok,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val_tok, y_val),
    callbacks=[checkpoint],
)

Epoch 1/2
Epoch 1: loss improved from inf to 1.98692, saving model to lstm_best_model.h5
Epoch 2/2
Epoch 2: loss improved from 1.98692 to 1.69025, saving model to lstm_best_model.h5


In [None]:
# Continue training the current `model` for another `epochs` epoch(s).
# `history` is rebound, so the previous fit's history object is no longer referenced.
history = model.fit(
    X_train_tok,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val_tok, y_val),
    callbacks=[checkpoint],
)

Epoch 1: loss improved from 1.59976 to 1.54278, saving model to lstm_best_model.h5


In [None]:
# One more training pass with the same data, settings, and checkpoint callback.
# `history` is rebound again to the result of this call.
history = model.fit(
    X_train_tok,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val_tok, y_val),
    callbacks=[checkpoint],
)

Epoch 1: loss improved from 1.54278 to 1.50234, saving model to lstm_best_model.h5


In [16]:
# Inference setup: sequence length, pickled tokenizer, and the saved model.
max_length = 300
# SECURITY: pickle.load can execute arbitrary code -- only load tokenizer files
# that you created yourself.
# NOTE(review): this rebinds `tokenizer`; X_test_tok from the earlier cells was
# built with the tokenizer fitted in this notebook -- confirm both are the same.
with open('/content/drive/MyDrive/Arabic_dialect_models/lstm_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
model = models.load_model('/content/drive/MyDrive/Arabic_dialect_models/lstm_model.h5')



In [17]:
# Score the loaded model on the held-out test split; the cell displays the returned
# metric values (presumably [loss, accuracy], per the compile settings used above).
model.evaluate(x=X_test_tok, y=y_test, batch_size=64)



[1.6087932586669922, 0.49487122893333435]

In [18]:
print("[INFO] Calculating the classification report")
# Model outputs for every test tweet, one row per tweet.
y_pred = model.predict(X_test_tok, batch_size=512, verbose=1)
# Reduce the one-hot targets and the predicted rows to class indices.
y_test_report = np.argmax(y_test, axis=1)
y_pred_bool = np.argmax(y_pred, axis=1)
# Per-class precision / recall / F1 on the test split.
print("Classification Report : \n\n" + classification_report(y_test_report, y_pred_bool))

[INFO] Calculating the classification report
Classification Report : 

              precision    recall  f1-score   support

           0       0.42      0.36      0.39      2753
           1       0.29      0.28      0.29      2647
           2       0.54      0.49      0.51      1678
           3       0.64      0.84      0.73      5691
           4       0.54      0.50      0.52      1510
           5       0.42      0.27      0.33      2858
           6       0.42      0.52      0.46      4157
           7       0.56      0.68      0.61      2860
           8       0.58      0.63      0.60      3664
           9       0.69      0.55      0.61      1110
          10       0.34      0.31      0.32      1904
          11       0.45      0.51      0.48      4263
          12       0.47      0.41      0.44      3114
          13       0.35      0.41      0.38      2653
          14       0.69      0.54      0.60      1432
          15       0.49      0.26      0.34      1611
          

In [None]:
# Texts to classify; the list may hold more than one string.
new_complaint = ['شلونك شو تسوي ']
# Encode and pad exactly like the training data.
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=max_length)
pred = model.predict(padded)
# Class index -> country code.
# NOTE(review): assumes the one-hot column order matches these indices -- confirm
# against the dialect encoding used when the CSV was produced.
CLASS_DICT = {0: "AE", 1: "BH", 2: "DZ", 3: "EG", 4: "IQ", 5: "JO", 6: "KW", 7: "LB", 8: "LY", 9: "MA",
              10: "OM", 11: "PL", 12: "QA", 13: "SA", 14: "SD", 15: "SY", 16: "TN", 17: "YE"}
# `pred` has one row per input text. A bare np.argmax(pred) indexes the flattened
# array, which is only a valid class id when there is exactly one row, so take the
# argmax per row. With a single text the printed output is unchanged.
predicted_dialects = [CLASS_DICT[int(class_id)] for class_id in np.argmax(pred, axis=1)]
print(pred, *predicted_dialects)

[[0.24816488 0.04875014 0.01015061 0.001222   0.12700514 0.11624358
  0.09715858 0.02003108 0.00512088 0.00527489 0.10820533 0.07317767
  0.01700952 0.01626699 0.00129211 0.09270472 0.00224872 0.00997321]] AE
