In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder as OHE
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('FINAL_data2.csv')

In [3]:
df.head()

Unnamed: 0,text,dialect,word_count
0,المجتمع مثلكم بتقول عيونك زايغه تطالع النسوان ...,Khaleeji,14
1,لكل فعل رده فعل المصرين بداو بالسب والشتم وسائ...,Khaleeji,14
2,تحسين بضمير تري بنتك تقدر شي عكس حتنقهر عشانك ...,Khaleeji,14
3,مجانين لولوه الاكثر مصيبه الفئه الي تدافع تعز ...,Khaleeji,14
4,بمقوله غيره ارتاي الحق دربه فهو لحق اسبق يصرفه...,Khaleeji,14


In [4]:
dialect = pd.get_dummies(df['dialect'])

In [5]:
dialect = dialect.values

In [6]:
features = df.text.values.astype(str)

In [7]:
vocab_size = 1500000
tokenizer = Tokenizer(num_words=vocab_size, lower=False)
tokenizer.fit_on_texts(features)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    features, dialect, random_state=42, test_size=0.1, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.1, shuffle=True)

In [9]:
X_train_tok = tokenizer.texts_to_sequences(X_train)
X_train_tok = pad_sequences(X_train_tok, maxlen=20)

X_val_tok = tokenizer.texts_to_sequences(X_val)
X_val_tok = pad_sequences(X_val_tok, maxlen=20)

X_test_tok = tokenizer.texts_to_sequences(X_test)
X_test_tok = pad_sequences(X_test_tok, maxlen=20)

In [13]:
checkpoint = ModelCheckpoint("NN/nn_best_model.h5", monitor='loss',
                            verbose=1, save_best_only=True, mode='auto', period=1)



In [14]:
model = models.Sequential()
model.add(Embedding(vocab_size, dialect.shape[1], input_shape=(20,)))
model.add(SpatialDropout1D(0.5))
model.add(layers.Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(7, activation='softmax'))
model.compile(loss='categorical_crossentropy',
            optimizer='adam', metrics=['accuracy'])

In [15]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_tok, y_train,
    epochs=10,
    batch_size=15,
    validation_data=(X_val_tok, y_val),
    callbacks=[checkpoint, early_stop]
)

Epoch 1/10
Epoch 1: loss improved from inf to 1.45473, saving model to NN\nn_best_model.h5
Epoch 2/10
Epoch 2: loss improved from 1.45473 to 1.06029, saving model to NN\nn_best_model.h5
Epoch 3/10
Epoch 3: loss improved from 1.06029 to 0.89580, saving model to NN\nn_best_model.h5
Epoch 4/10
Epoch 4: loss improved from 0.89580 to 0.78792, saving model to NN\nn_best_model.h5
Epoch 5/10
Epoch 5: loss improved from 0.78792 to 0.70178, saving model to NN\nn_best_model.h5
Epoch 6/10
Epoch 6: loss improved from 0.70178 to 0.63917, saving model to NN\nn_best_model.h5


In [16]:
model2 = models.Sequential()
model2.add(Embedding(vocab_size, dialect.shape[1], input_shape=(20,)))
model2.add(SpatialDropout1D(0.5))
model2.add(layers.Flatten())
model2.add(Dense(128, activation='relu'))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(7, activation='softmax'))
model2.compile(loss='categorical_crossentropy',
            optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history2 = model2.fit(
    X_train_tok, y_train,
    epochs=10,
    batch_size=15,
    validation_data=(X_val_tok, y_val),
    callbacks=[checkpoint, early_stop]
)

Epoch 1/10
Epoch 1: loss did not improve from 0.63917
Epoch 2/10
Epoch 2: loss did not improve from 0.63917
Epoch 3/10
Epoch 3: loss did not improve from 0.63917
Epoch 4/10
Epoch 4: loss did not improve from 0.63917
Epoch 5/10
Epoch 5: loss did not improve from 0.63917
Epoch 6/10
Epoch 6: loss did not improve from 0.63917


# Model analysis

In [17]:
model = models.load_model("NN/nn_best_model.h5")

In [18]:
test_loss, test_acc = model.evaluate(X_test_tok, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

394/394 - 1s - loss: 0.9923 - accuracy: 0.6692 - 601ms/epoch - 2ms/step

Test accuracy: 0.6691812872886658
