In [37]:
import pandas as pd
from sklearn.pipeline import Pipeline
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [2]:
country_mapper = {
    0 : "Libya",
    1 : "Morocco",
    2 : "Egypt",
    3 : "Lebanon",
    4 : "Sudan"
}

### **Data Reading**

In [3]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [4]:
train_data = train_data.dropna()

In [5]:
test_data = test_data.dropna()

### **Data Splitting**

In [6]:
x_train, y_train, x_test, y_test = train_data['text'], train_data['dialect'], test_data['text'], test_data['dialect']

### **Data Preparation**

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

x_train_sequence = tokenizer.texts_to_sequences(x_train)
x_test_sequence = tokenizer.texts_to_sequences(x_test)

In [8]:
max_sequence_len = 100
x_train_padded = pad_sequences(x_train_sequence, maxlen=max_sequence_len, padding='post')
x_test_padded = pad_sequences(x_test_sequence, maxlen=max_sequence_len, padding='post')

In [9]:
y_train_cat = np.eye(5)[y_train]
y_test_cat = np.eye(5)[y_test]

In [10]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

214200

### **Model Architecture**

In [20]:
model = Sequential([
    Embedding(vocab_size, 64),
    SimpleRNN(64),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])

In [21]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          13708800  
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 5)                 325       
                                                                 
Total params: 13721541 (52.34 MB)
Trainable params: 13721541 (52.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### **Model Training**

In [23]:
history = model.fit(x_train_padded, y_train_cat, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### **Model Testing**

In [85]:
test = ["ازيك عامل ايه"]

In [86]:
tokenizer.fit_on_texts(test)
x_train_sequence = tokenizer.texts_to_sequences(test)
x_train_sequence

[[7258, 7, 2162]]

In [87]:
max_sequence_len = 100
x_train_padded = pad_sequences(x_train_sequence, maxlen=max_sequence_len, padding='post')
x_train_padded

array([[7258,    7, 2162,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)

In [88]:
pred = model.predict(x_train_padded)



In [89]:
country_mapper[np.argmax(model.predict(x_train_padded), axis=1)[0]]



'Egypt'

In [21]:
gru_bi = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [22]:
gru_bi.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [23]:
history = gru_bi.fit(x_train_padded, y_train_cat, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
gru_bi.evaluate(x_test_padded, y_test_cat)



[0.8836421370506287, 0.8169752955436707]

**Testing**

In [28]:
test = ["ازيك عامل ايه"]

In [29]:
tokenizer.fit_on_texts(test)
sequence = tokenizer.texts_to_sequences(test)
sequence

[[7567, 511, 27]]

In [30]:
padded = pad_sequences(sequence, maxlen=max_sequence_len, padding='post')
padded

array([[7567,  511,   27,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)

In [32]:
pred = gru_bi.predict(padded)



In [33]:
pred

array([[0.03950484, 0.02069795, 0.6253492 , 0.30769765, 0.00675031]],
      dtype=float32)

In [35]:
country_mapper[np.argmax(gru_bi.predict(padded), axis=1)[0]]



'Egypt'

In [None]:
country_mapper[np.argmax(gru_bi.predict(padded), axis=1)[0]]

### **Pipeline**

In [46]:
def pipeline(test, tokenizer, max_sequence_len, gru_bi, country_mapper):

    sequence = tokenizer.texts_to_sequences(test)
    padded = pad_sequences(sequence, maxlen=max_sequence_len, padding='post')
    return country_mapper[np.argmax(gru_bi.predict(padded), axis=1)[0]]

In [48]:
pipeline(['كيف حالك يا زول'], tokenizer, max_sequence_len, gru_bi, country_mapper)



'Sudan'

In [50]:
pipeline(['ازيك يا مها، عامله ايه'], tokenizer, max_sequence_len, gru_bi, country_mapper)



'Egypt'

In [52]:
pipeline(["كنحس بالعيا فاش منبدا نقرا"], tokenizer, max_sequence_len, gru_bi, country_mapper)



'Morocco'

In [55]:
pipeline(["للي تخاصمه ما تقطعش أحبال اوصاله"], tokenizer, max_sequence_len, gru_bi, country_mapper)



'Egypt'

**logisticRegression know this liban proverb correctly**

In [54]:
pipeline(["بدّك تبهدل رجّال، فلِّت عليه مرا"], tokenizer, max_sequence_len, gru_bi, country_mapper)



'Lebanon'