In [1]:
import nltk
import os
import shutil
import logging
import re
import string
import numpy as np
import argparse
import pandas as pd
from datetime import datetime
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, concatenate
from tensorflow.keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM
from keras.layers import GRU
from keras.models import Sequential
from keras.layers import Dense, Embedding, Masking
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.layers import Dropout
from keras.utils.vis_utils import plot_model
from keras import metrics
import keras
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def clean_sent(text):
    """ Function for Cleaning Text """
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('_', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [3]:
# Function for importing files and cleaning
def import_and_clean_data(data_dir):
    """ Import Data, Extract Sentences, Clean and Label """
    df_label = pd.read_csv(os.path.join(data_dir, 'labels_Candidate.csv'), encoding='utf-8', header= None)    
    df_train = pd.read_csv(os.path.join(data_dir, 'train.txt'), encoding = "ISO-8859-1", sep=',', header= None)
    ### calculate the maximum sentence length


    # Renaming column names
    df_train.rename(columns =  {0: "id" ,2: "comments"}, inplace = True) 
    df_label.rename(columns =  {0: "id" ,1: "lbl"}, inplace = True)

    max_lenght= max(df_train[1]) 

    # joining the words to generate sentences for each comment
    df = df_train.groupby(["id"])["comments"].agg(lambda x: str(' '.join(x)))
    df = pd.DataFrame({'id':df.index, 'lbl':df.values})


    # merge dataframes to keep existing comments with labels
    merged = pd.merge(left=df,right=df_label, how='left', left_on='id', right_on='id')

    merged = pd.DataFrame({'comment':merged["lbl_x"], 'lbl':merged["lbl_y"]})

    # finding number of labels to check if data is biased or not
    print(merged.groupby("lbl").count())
    #
    merged.sort_values(by=['lbl'])
    #

    sentences = merged["comment"]
    sents_clean = [clean_sent(sent) for sent in sentences if len(sent.strip())>0]

   
    x_raw = np.array(sents_clean)
    y = np.array(merged["lbl"])
    return x_raw, y, max_lenght

In [4]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [5]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

In [18]:

# define the model
def define_model(length, vocab_size):

    model = Sequential()
    model.add(Embedding(vocab_size+1, 100, input_length = length, mask_zero=True))
    #model.add(SpatialDropout1D(0.2))
    model.add(GRU(100, recurrent_dropout =0.2))# return_sequences=True,
    #model.add(Dropout(0.5))
    #model.add(GRU(256,  recurrent_dropout =0.2))# 
    #model.add(Dropout(0.5))
    model.add(Dense(8, activation = 'relu')) 
    model.add(Dense(1, activation='softmax'))
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer= 'adam',
                  metrics=['accuracy',f1_m,precision_m, recall_m])

    return model

In [7]:

def plot_roc_curv(fpr, tpr, auc_val, method='RF'):
    """ plot ROC curve given tpr and fpr data"""
    plt.figure(figsize=(15, 10))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=method)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC curve --- AUC = {auc_val:2.2}')
    plt.legend(loc='best')
    return

In [8]:
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [20]:

def main():
    # shuffle x and y and split into train and test datasets, take first 4000 rows for both training and testing
    x_raw, y, length = import_and_clean_data(data_dir)

    x_train, x_test, y_train, y_test = train_test_split(
        x_raw[:3999], y[:3999], test_size=0.2,
        random_state=1,
        shuffle=True
    )

    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.1,
        random_state=1,
        shuffle=True
    )

    # create tokenizer
    tokenizer = create_tokenizer(x_train)
    # calculate vocabulary size
    vocab_size = len(tokenizer.word_index) + 1
    # encode data
    padded_train = encode_text(tokenizer, x_train, length)
    padded_val = encode_text(tokenizer, x_val, length)

    # define model
    model = define_model(length, vocab_size)

    # Create a TensorBoard instance with the path to the logs directory
#     tensorboard = TensorBoard(
#         log_dir=os.path.join(
#             log_dir, f'{datetime.now()}')
#     )

     # fit model
    history = model.fit(padded_train, y_train,
                        epochs=10, batch_size=16,
                        validation_data=(padded_val, y_val)
                       ) #,callbacks=[tensorboard]

        
    # save the model
    model.save(os.path.join(models_dir, 'lstm.h5'))
    #model = keras.models.load_model(r'\TD\rf\saved_models\\lstm.h5')
    

    # encode data
    padded_test = encode_text(tokenizer, x_test, length)
   
    # evaluate model
    y_pred = model.predict(padded_test)

    loss, accuracy, f1_score, precision, recall = model.evaluate(padded_test, y_test, verbose=0)
    print("loss: {} , accuracy: {}, F1-measure: {}, Precision: {}, Recall: {}".format(loss, accuracy, f1_score, precision, recall))
    
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_dl = np.float32(auc(fpr, tpr))
    plot_roc_curv(fpr, tpr, auc_dl, method='LSTM')
    plt.savefig(os.path.join(save_dir, 'roc_curve_lstm.pdf'))
    plot_model(model, to_file='lstm_model.png', show_shapes=True, show_layer_names=True)

    
    # now preddicting the unlabled data!
    padded_test_unlabelled = encode_text(tokenizer, x_raw[3999:], length)
    y_pred_unlabelled = model.predict(padded_test_unlabelled)
    # writing prediction to file
    print(y_pred_unlabelled)
    df = pd.DataFrame(y_pred_unlabelled)
    df.to_csv(os.path.join(predictions_dir, 'lstm.csv'),index=False)
    

In [21]:

if __name__ == '__main__':

    data_dir = r'TD\\'
    os.makedirs(
        os.path.join(
            data_dir,
            'proccessed_data'
        ),
        exist_ok=True
    )
    save_dir = os.path.join(data_dir, 'proccessed_data')
    os.makedirs(
        os.path.join(
            data_dir,
            'logs'
        ),
        exist_ok=True
    )
    log_dir = os.path.join(data_dir, 'logs')
    os.makedirs(
        os.path.join(
            data_dir,
            'saved_models'
        ),
        exist_ok=True)
    models_dir = os.path.join(data_dir, 'saved_models')
    
    os.makedirs(
        os.path.join(
            data_dir,
            'predicted_results'
        ),
        exist_ok=True)
    predictions_dir = os.path.join(data_dir, 'predicted_results')
    # setup logging
    logging.basicConfig(
        filename=os.path.join(
            log_dir, 'rf.log'
        ),
        level=logging.INFO
    )
    logger = logging.getLogger('cnn_model')
    try:
        shutil.rmtree(
            os.path.join(
                log_dir, os.listdir(log_dir)[0]
            )
        )
    except:
        logger.warning('File doesnt exist')

    # run main
    main()


     comment
lbl         
0.0     3464
1.0      535
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 190, 100)          616800    
_________________________________________________________________
gru_3 (GRU)                  (None, 100)               60300     
_________________________________________________________________
dense_9 (Dense)              (None, 8)                 808       
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 9         
Total params: 677,917
Trainable params: 677,917
Non-trainable params: 0
_________________________________________________________________
Train on 2879 samples, validate on 320 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
loss: 13.770234107971191 , accuracy: 0.13625, F1-measure: 