In [1]:
import nltk
import os
import shutil
import logging
import re
import string
import numpy as np
import argparse
from datetime import datetime
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, concatenate
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Dense, Conv1D, MaxPool1D,
                                     UpSampling1D, Flatten, Reshape, Dropout)
from tensorflow.keras.callbacks import TensorBoard
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
def clean_sent(text):
    """ Function for Cleaning Text """
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('_', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [3]:
#### Function for importing files and cleaning
def import_and_clean_data(data_dir):
    """ Import Data, Extract Sentences, Clean and Label """
    df_label = pd.read_csv(os.path.join(data_dir, 'labels_Candidate.csv'), encoding='utf-8', header= None)    
    df_train = pd.read_csv(os.path.join(data_dir, 'train.txt'), encoding = "ISO-8859-1", sep=',', header= None)
    ### calculate the maximum sentence length


    # Renaming column names
    df_train.rename(columns =  {0: "id" ,2: "comments"}, inplace = True) 
    df_label.rename(columns =  {0: "id" ,1: "lbl"}, inplace = True)

    max_lenght= max(df_train[1]) 

    # joining the words to generate sentences for each comment
    df = df_train.groupby(["id"])["comments"].agg(lambda x: str(' '.join(x)))
    df = pd.DataFrame({'id':df.index, 'lbl':df.values})


    # merge dataframes to keep existing comments with labels
    merged = pd.merge(left=df,right=df_label, how='left', left_on='id', right_on='id')

    merged = pd.DataFrame({'comment':merged["lbl_x"], 'lbl':merged["lbl_y"]})

    # finding number of labels to check if data is biased or not
    print(merged.groupby("lbl").count())
    #
    merged.sort_values(by=['lbl'])
    #

    sentences = merged["comment"]
    sents_clean = [clean_sent(sent) for sent in sentences if len(sent.strip())>0]

   
    x_raw = np.array(sents_clean)
    y = np.array(merged["lbl"])
    return x_raw, y, max_lenght

In [4]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [5]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

In [6]:

# define the model
def define_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPool1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPool1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPool1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='softmax')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return model

In [7]:

def plot_roc_curv(fpr, tpr, auc_val, method='RF'):
    """ plot ROC curve given tpr and fpr data"""
    plt.figure(figsize=(15, 10))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=method)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC curve --- AUC = {auc_val:2.2}')
    plt.legend(loc='best')
    return

In [8]:

def main():
    # shuffle x and y and split into train and test datasets, take first 4000 rows for both training and testing
    x_raw, y, length = import_and_clean_data(data_dir)

    x_train, x_test, y_train, y_test = train_test_split(
        x_raw[:4000], y[:4000], test_size=0.2,
        random_state=1,
        shuffle=True
    )

    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.1,
        random_state=1,
        shuffle=True
    )

    # create tokenizer
    tokenizer = create_tokenizer(x_train)
    # calculate vocabulary size
    vocab_size = len(tokenizer.word_index) + 1
    # encode data
    padded_train = encode_text(tokenizer, x_train, length)
    padded_val = encode_text(tokenizer, x_val, length)

    # define model
    model = define_model(length, vocab_size)

    # Create a TensorBoard instance with the path to the logs directory
#     tensorboard = TensorBoard(
#         log_dir=os.path.join(
#             log_dir, f'{datetime.now()}')
#     )
    # fit model
    model.fit(
        [padded_train, padded_train, padded_train],
        np.array(y_train),
        epochs=1,
        batch_size=16,
        validation_data=([padded_val, padded_val, padded_val], y_val),
        verbose=0 ) #callbacks=[tensorboard],
        
    # save the model
    model.save(os.path.join(models_dir, 'multi_scale_cnn.h5'))

    # encode data
    padded_test = encode_text(tokenizer, x_test, length)

    # evaluate model
    y_pred = model.predict([padded_test, padded_test, padded_test])
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_dl = np.float32(auc(fpr, tpr))
    plot_roc_curv(fpr, tpr, auc_dl, method='Multiscale CNN')
    plt.savefig(os.path.join(save_dir, 'roc_curve_multiscale_cnn.jpg'))
    
    #### Now preddicting the unlabled data!
    padded_test_unlabelled = encode_text(tokenizer, x_raw[4000:], length)
    y_pred_unlabelled = model.predict([padded_test_unlabelled, padded_test_unlabelled, padded_test_unlabelled])
    # writing prediction to file
    #print(y_pred_unlabelled)
    df = pd.DataFrame(y_pred_unlabelled)
    df.to_csv(os.path.join(predictions_dir, 'multi_scale_cnn.csv'),index=False)
    

In [11]:

if __name__ == '__main__':

    data_dir = r'TD\\'
    os.makedirs(
        os.path.join(
            data_dir,
            'proccessed_data'
        ),
        exist_ok=True
    )
    save_dir = os.path.join(data_dir, 'proccessed_data')
    os.makedirs(
        os.path.join(
            data_dir,
            'logs'
        ),
        exist_ok=True
    )
    log_dir = os.path.join(data_dir, 'logs')
    os.makedirs(
        os.path.join(
            data_dir,
            'saved_models'
        ),
        exist_ok=True)
    models_dir = os.path.join(data_dir, 'saved_models')
    
    os.makedirs(
        os.path.join(
            data_dir,
            'predicted_results'
        ),
        exist_ok=True)
    predictions_dir = os.path.join(data_dir, 'predicted_results')
    # setup logging
    logging.basicConfig(
        filename=os.path.join(
            log_dir, 'rf.log'
        ),
        level=logging.INFO
    )
    logger = logging.getLogger('cnn_model')
    try:
        shutil.rmtree(
            os.path.join(
                log_dir, os.listdir(log_dir)[0]
            )
        )
    except:
        logger.warning('File doesnt exist')

    # run main
    main()


     comment
lbl         
0.0     3464
1.0      535


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [1.]]
