# Import packages

In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

In [2]:
NB_WORDS = 50000  # Parameter indicating the number of words we'll put in the dictionary
VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 30  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent

In [3]:
df = pd.read_excel('/content/drive/MyDrive/MasterThesis/DataSet-forUsing/7-offDSA2020-dev-train-8000.xlsx')
df = df.reindex(np.random.permutation(df.index))  
df = df[['tweets', 'off']]
df.head()

Unnamed: 0,tweets,off
7982,RT @USER: يا عمري بس يا #الهلال هذا اللي أقدر ...,NOT_OFF
4608,@USER @USER حببتي يا مولي يا جميلة تسلم ايدك ب...,NOT_OFF
4015,@USER صار عندهم عبد الفتاح كمان يا بركاتك يا ع...,NOT_OFF
7116,اللهم لا تجعل مصيبتنا في ديننا، ولا تجعل الدُّ...,NOT_OFF
7389,RT @USER: اللهمَّ يا ذا الجلال والإكرام، يا عز...,NOT_OFF


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#Data cleaning

In [5]:
def remove_Eng_Char(input_text):
        return re.sub('[a-zA-Z]', '', str(input_text))
        # return re.sub(r'[a-zA-Z]|\d+|[٠١٢٣٤٥٦٧٨٩]', '', str(input_text))
        
def remove_Numbers(input_text):
    return re.sub(r'\d+|[٠١٢٣٤٥٦٧٨٩]', ' ', str(input_text)) 

def remove_Punct(input_text):
    return re.sub(r'[.،<>@,\\-_”“٪ًَ]', ' ', str(input_text)) 

def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1\1', str(text))     # keep 2 repeat
    
def normalize_arabic(text):
    # text = remove_diacritics(text)
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text

# x=remove_repeating_char('مشكووووور ،!؟')     
print(df.head())

df.tweets = df.tweets.apply(remove_repeating_char).apply(remove_Eng_Char).apply(normalize_arabic).apply(remove_Numbers).apply(remove_Punct)
print(df.head())

                                                 tweets      off
7982  RT @USER: يا عمري بس يا #الهلال هذا اللي أقدر ...  NOT_OFF
4608  @USER @USER حببتي يا مولي يا جميلة تسلم ايدك ب...  NOT_OFF
4015  @USER صار عندهم عبد الفتاح كمان يا بركاتك يا ع...  NOT_OFF
7116  اللهم لا تجعل مصيبتنا في ديننا، ولا تجعل الدُّ...  NOT_OFF
7389  RT @USER: اللهمَّ يا ذا الجلال والإكرام، يا عز...  NOT_OFF
                                                 tweets      off
7982    : يا عمري بس يا #الهلال هذا اللي اقدر اقوله ...  NOT_OFF
4608      حببتي يا مولي يا جميله تسلم ايدك بس خساره ...  NOT_OFF
4015    صار عندهم عبد الفتاح كمان يا بركاتك يا عبد ا...  NOT_OFF
7116  اللهم لا تجعل مصيبتنا في ديننا  ولا تجعل الدُّ...  NOT_OFF
7389    : اللهم ّ يا ذا الجلال والاكرام  يا عزيز يا ...  NOT_OFF


# Train-Test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.tweets, df.off, test_size=0.1, random_state=37)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]


# Train data samples: 7200
# Test data samples: 800


# Converting words to numbers

In [7]:
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
tk.fit_on_texts(X_train)

print('Fitted tokenizer on {} documents'.format(tk.document_count))
print('{} words in dictionary'.format(tk.num_words))
print('Top 5 most common words are:', collections.Counter(tk.word_counts).most_common(5))

Fitted tokenizer on 7200 documents
50000 words in dictionary
Top 5 most common words are: [('يا', 20319), ('و', 1861), ('من', 1577), ('الله', 1278), ('في', 1259)]


In [8]:
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

print('"{}" is converted into {}'.format(X_train[0], X_train_seq[0]))

"في حاجات مينفعش نلفت نظركوا ليها زي الاصول كده يا اتربيتوا عليها يا لا 😇" is converted into [598, 1554, 6365, 2324, 161, 11055, 11056, 2, 2325, 6366, 5, 668, 713, 8, 247, 1555, 3490, 2, 133, 11057, 209, 11058, 2326, 4530, 22, 1034, 1117, 6367, 1, 209, 714, 90, 2, 133, 58, 9, 12, 11059, 11060, 22, 669, 2786, 3491, 1, 11061, 6368, 1, 11062, 62, 11063]


In [9]:
def one_hot_seq(seqs, nb_features = NB_WORDS):
    ohs = np.zeros((len(seqs), nb_features))
    for i, s in enumerate(seqs):
        ohs[i, s] = 1.
    return ohs

X_train_oh = one_hot_seq(X_train_seq)
X_test_oh = one_hot_seq(X_test_seq)

print('"{}" is converted into {}'.format(X_train_seq[0], X_train_oh[0]))
print('For this example we have {} features with a value of 1.'.format(X_train_oh[0].sum()))

"[598, 1554, 6365, 2324, 161, 11055, 11056, 2, 2325, 6366, 5, 668, 713, 8, 247, 1555, 3490, 2, 133, 11057, 209, 11058, 2326, 4530, 22, 1034, 1117, 6367, 1, 209, 714, 90, 2, 133, 58, 9, 12, 11059, 11060, 22, 669, 2786, 3491, 1, 11061, 6368, 1, 11062, 62, 11063]" is converted into [0. 1. 1. ... 0. 0. 0.]
For this example we have 43.0 features with a value of 1.


In [10]:
#Converting the target classes to numbers
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

print(le.classes_)
# X_train_rest = np.asarray(X_train_rest).astype('float32').reshape((-1,1))
# y_train_rest = np.asarray(y_train_rest).astype('float32').reshape((-1,1))

print('"{}" is converted into {}'.format(y_train[0], y_train_le[0]))
print('"{}" is converted into {}'.format(y_train_le[0], y_train_oh[0]))

['NOT_OFF' 'OFF']
"NOT_OFF" is converted into 1
"1" is converted into [0. 1.]


#Splitting of a validation set


In [11]:
X_train_rest, X_valid, y_train_rest, y_valid = train_test_split(X_train_oh, y_train_oh, test_size=0.1, random_state=37)

assert X_valid.shape[0] == y_valid.shape[0]
assert X_train_rest.shape[0] == y_train_rest.shape[0]

print('Shape of validation set:',X_valid.shape)

Shape of validation set: (720, 50000)


#Deep learning

In [12]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding


In [13]:
base_model = models.Sequential()
base_model.add(layers.Dense(64, activation='relu', input_shape=(NB_WORDS,)))
base_model.add(layers.Dense(64, activation='relu'))
base_model.add(Dropout(0.5))
base_model.add(Dense(2,activation='sigmoid'))
base_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                3200064   
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 3,204,354
Trainable params: 3,204,354
Non-trainable params: 0
_________________________________________________________________


In [14]:
# # Our vectorized labels
# X_train_rest = np.asarray(X_train_rest).astype('float32').reshape((-1,1))
# y_train_rest = np.asarray(y_train_rest).astype('float32').reshape((-1,1))

def deep_model(model):
    model.compile(optimizer='adam'
                  , loss='binary_crossentropy'
                  , metrics=['accuracy'])
    
    history = model.fit(X_train_rest
                       , y_train_rest
                       , epochs=NB_START_EPOCHS
                       , batch_size=BATCH_SIZE
                       , validation_data=(X_valid, y_valid)
                       , verbose=0
                        )
    
    return history

In [15]:
base_history = deep_model(base_model)

In [16]:
def eval_metric(history, metric_name):
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]

    e = range(1, NB_START_EPOCHS + 1)

    plt.plot(e, metric, 'bo', label='Train ' + metric_name)
    plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
    plt.legend()
    plt.show()

In [17]:
drop_model = models.Sequential()
drop_model.add(layers.Dense(64, activation='relu', input_shape=(NB_WORDS,)))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(64, activation='relu'))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(2))
drop_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                3200064   
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 130       
Total params: 3,204,354
Trainable params: 3,204,354
Non-trainable params: 0
_________________________________________________________________


In [18]:
drop_history = deep_model(drop_model)


In [19]:
compare_loss_with_baseline(drop_history, 'Dropout Model')


NameError: ignored

In [None]:
def test_model(model, epoch_stop):
    model.fit(X_train_oh
              , y_train_oh
              , epochs=epoch_stop
              , batch_size=BATCH_SIZE
              , verbose=0)
    results = model.evaluate(X_test_oh, y_test_oh)
    
    return results

In [None]:
base_results = test_model(base_model, 4)
print('/n')
print('Test accuracy of baseline model: {0:.2f}%'.format(base_results[1]*100))

In [None]:
reduced_results = test_model(reduced_model, 10)
print('/n')
print('Test accuracy of reduced model: {0:.2f}%'.format(reduced_results[1]*100))

In [None]:
reg_results = test_model(reg_model, 5)
print('/n')
print('Test accuracy of regularized model: {0:.2f}%'.format(reg_results[1]*100))

In [None]:
drop_results = test_model(drop_model, 6)
print('/n')
print('Test accuracy of dropout model: {0:.2f}%'.format(drop_results[1]*100))

In [None]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop


In [None]:
model = models.Sequential()
model.add(LSTM(256, input_shape=(6, 10)))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
hist = model.fit(X_train_rest, y_train_rest, epochs=1000, batch_size=30, validation_split=0.1)
                #  hist = model.fit(X_train_rest, y_train_rest, nb_epoch=10000, batch_size=30, validation_split=0.1)

#    history = model.fit(X_train_rest
#                        , y_train_rest
#                        , epochs=NB_START_EPOCHS
#                        , batch_size=BATCH_SIZE
#                        , validation_data=(X_valid, y_valid)
#                        , verbose=0)