In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [76]:
country_mapper = {
    0 : "Libya",
    1 : "Morocco",
    2 : "Egypt",
    3 : "Lebanon",
    4 : "Sudan"
}

### **Data Reading**

In [11]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [12]:
train_data = train_data.dropna()

In [13]:
test_data = test_data.dropna()

### **Data Splitting**

In [14]:
x_train, y_train, x_test, y_test = train_data['text'], train_data['dialect'], test_data['text'], test_data['dialect']

### **Data Preparation**

In [60]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

x_train_sequence = tokenizer.texts_to_sequences(x_train)
x_test_sequence = tokenizer.texts_to_sequences(x_test)

In [17]:
max_sequence_len = 100
x_train_padded = pad_sequences(x_train_sequence, maxlen=max_sequence_len, padding='post')
x_test_padded = pad_sequences(x_test_sequence, maxlen=max_sequence_len, padding='post')

In [18]:
y_train_cat = np.eye(5)[y_train]
y_test_cat = np.eye(5)[y_test]

In [19]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

214200

### **Model Architecture**

In [20]:
model = Sequential([
    Embedding(vocab_size, 64),
    SimpleRNN(64),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])

In [21]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          13708800  
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 5)                 325       
                                                                 
Total params: 13721541 (52.34 MB)
Trainable params: 13721541 (52.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### **Model Training**

In [23]:
history = model.fit(x_train_padded, y_train_cat, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### **Model Testing**

In [85]:
test = ["ازيك يا معلم"]

In [86]:
tokenizer.fit_on_texts(test)
x_train_sequence = tokenizer.texts_to_sequences(test)
x_train_sequence

[[7258, 7, 2162]]

In [87]:
max_sequence_len = 100
x_train_padded = pad_sequences(x_train_sequence, maxlen=max_sequence_len, padding='post')
x_train_padded

array([[7258,    7, 2162,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)

In [88]:
pred = model.predict(x_train_padded)



In [89]:
country_mapper[np.argmax(model.predict(x_train_padded), axis=1)[0]]



'Egypt'