In [1]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
     

class TextToTensor():

    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def string_to_tensor(self, string_list: list) -> list:
        """
        A method to convert a string list to a tensor for a deep learning model
        """    
        string_list = self.tokenizer.texts_to_sequences(string_list)
        string_list = pad_sequences(string_list, maxlen=self.max_len)
        
        return string_list

def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=[]) -> str:
    """
    A method to clean text 
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', str(string))

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string        

In [2]:
from keras.models import Input, Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

class RnnModel():
    """
    A recurrent neural network for semantic analysis
    """

    def __init__(self, embedding_matrix, embedding_dim, max_len, X_additional=None):
        
        inp1 = Input(shape=(max_len,))
        x = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix])(inp1)
        x = LSTM(256, return_sequences=True)(x)
        x = LSTM(128)(x)
        x = Dropout(0.1)(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(1, activation="sigmoid")(x)    
        model = Model(inputs=inp1, outputs=x)

        model.compile(loss = 'binary_crossentropy', optimizer = 'adam')
        self.model = model

In [3]:
import numpy as np


class Embeddings():
    """
    A class to read the word embedding file and to create the word embedding matrix
    """

    def __init__(self, path, vector_dimension):
        self.path = path 
        self.vector_dimension = vector_dimension
    
    @staticmethod
    def get_coefs(word, *arr): 
        return word, np.asarray(arr, dtype='float32')

    def get_embedding_index(self):
        embeddings_index = dict(self.get_coefs(*o.split(" ")) for o in open(self.path, errors='ignore'))
        return embeddings_index

    def create_embedding_matrix(self, tokenizer=None, max_features=None):
        """
        A method to create the embedding matrix
        """
        model_embed = self.get_embedding_index()

        if max_features is None:
            max_features = len(model_embed)

        word_index = model_embed
        if tokenizer is not None: 
            word_index = tokenizer.word_index

        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        for index, word in enumerate(word_index.keys()):
            if index > max_features:
                break
            else:
                try:
                    embedding_matrix[index] = model_embed[word]
                except:
                    continue
        return embedding_matrix

In [4]:
import numpy as np

# The main model class
#from RNN_model import RnnModel

# Importing the word preprocesing class
#from text_preprocessing import TextToTensor, clean_text

# Importing the word embedding class
#from embeddings import Embeddings

# Loading the word tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

# For accuracy calculations
from sklearn.metrics import accuracy_score, f1_score


class Pipeline:
    """
    A class for the machine learning pipeline
    """
    def __init__(
        self, 
        X_train: list, 
        Y_train: list, 
        embed_path: str, 
        embed_dim: int,
        stop_words=[],
        X_test=[], 
        Y_test=[],
        max_len=None,
        epochs=3,
        batch_size=256
        ):

        # Preprocecing the text
        X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
        Y_train = np.asarray(Y_train)
        
        # Tokenizing the text
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)

        # Saving the tokenizer
        self.tokenizer = tokenizer

        # Creating the embedding matrix
        embedding = Embeddings(embed_path, embed_dim)
        embedding_matrix = embedding.create_embedding_matrix(tokenizer, len(tokenizer.word_counts))

        # Creating the padded input for the deep learning model
        if max_len is None:
            max_len = np.max([len(text.split()) for text in X_train])
        TextToTensor_instance = TextToTensor(
            tokenizer=tokenizer, 
            max_len=max_len
            )
        X_train = TextToTensor_instance.string_to_tensor(X_train)

        # Creating the model
        rnn = RnnModel(
            embedding_matrix=embedding_matrix, 
            embedding_dim=embed_dim, 
            max_len=max_len
        )
        rnn.model.fit(
            X_train,
            Y_train, 
            batch_size=batch_size, 
            epochs=epochs
        )

        self.model = rnn.model

        # If X_test is provided we make predictions with the created model
        if len(X_test)>0:
            X_test = [clean_text(text) for text in X_test]
            X_test = TextToTensor_instance.string_to_tensor(X_test)
            yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]
            
            self.yhat = yhat

            # If true labels are provided we calculate the accuracy of the model
            if len(Y_test)>0:
                self.acc = accuracy_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
                self.f1 = f1_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])

In [None]:
import pandas as pd 

# Package for array math
import numpy as np 

# Package for system path traversal
import os

# Package for working with dates
from datetime import date

# K fold analysis package
from sklearn.model_selection import KFold

# Import the main analysis pipeline
#from pipeline import Pipeline

# Tensor creation class
#from text_preprocessing import TextToTensor

# Reading the configuration file
import yaml
with open("conf.yml", 'r') as file:
    conf = yaml.safe_load(file).get('pipeline')

# Reading the stop words
stop_words = []
try:
    stop_words = pd.read_csv('stop_words.txt', sep='\n', header=None)[0].tolist()
except Exception as e:
    # This exception indicates that the file is missing or is in a bad format
    print('Bad stop_words.txt file: {e}')

# Reading the data
train = pd.read_csv('clean_data.csv')
test = pd.read_csv('Sexual Abusive Comments by Roma3 & INNO.csv')

# Shuffling the data for the k fold analysis
train = train.sample(frac=1)

# Creating the input for the pipeline
X_train = train['text'].tolist()
Y_train = train['is_offensive'].tolist()

X_test = test['Comment'].tolist()

if conf.get('k_fold'):
    kfold = KFold(n_splits=5)
    acc = []
    f1 = []
    for train_index, test_index in kfold.split(X_train):
        # Fitting the model and forecasting with a subset of data
        k_results = Pipeline(
            X_train=np.array(X_train)[train_index],
            Y_train=np.array(Y_train)[train_index], 
            embed_path='glove.840B.300d.txt',
            embed_dim=300,
            X_test=np.array(X_train)[test_index],
            Y_test=np.array(Y_train)[test_index],
            max_len=conf.get('max_len'),
            epochs=conf.get('epochs'),
            batch_size=conf.get('batch_size')
        )
        # Saving the accuracy
        acc += [k_results.acc]
        f1 += [k_results.f1]
        print(f'The accuracy score is: {acc[-1]}') 
        print(f'The f1 score is: {f1[-1]}') 
    print(f'Total mean accuracy is: {np.mean(acc)}')
    print(f'Total mean f1 score is: {np.mean(f1)}')



Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
The accuracy score is: 0.9309484418648802
The f1 score is: 0.821232972897065
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
The accuracy score is: 0.9308399555205988
The f1 score is: 0.8259148006553796


In [7]:
# Running the pipeline with all the data
results = Pipeline(
    X_train=X_train,
    Y_train=Y_train, 
    embed_path='glove.840B.300d.txt',
    embed_dim=300,
    stop_words=stop_words,
    X_test=X_test,
    max_len=conf.get('max_len'),
    epochs=conf.get('epochs'),
    batch_size=conf.get('batch_size')
)

# Some sanity checks


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [9]:
good = ["fine"]
bad = ["Fuck you bloody bitch"]

TextToTensor_instance = TextToTensor(
tokenizer=results.tokenizer,
max_len=conf.get('max_len')
)

# Converting to tensors
good_nn = TextToTensor_instance.string_to_tensor(good)
bad_nn = TextToTensor_instance.string_to_tensor(bad)

# Forecasting
p_good = results.model.predict(good_nn)[0][0]
p_bad = results.model.predict(bad_nn)[0][0]

print(f'Sentence: {good_nn} Score: {p_good}')
print(f'Sentence: {bad_nn} Score: {p_bad}')

# Saving the predictions
test['prob_is_genuine'] = results.yhat
test['target'] = [1 if x > 0.5 else 0 for x in results.yhat]
 
# Saving the predictions to a csv file
if conf.get('save_results'):
    if not os.path.isdir('output'):
        os.mkdir('output')    
    test[['target']].to_csv(f'output/submission_{date.today()}.csv', index=False)

Sentence: [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0 515]] Score: 0.0010506808757781982
Sentence: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0   45 8274 2388   54]] Score: 0.9999997615814209


In [40]:
import pickle
import dill
import joblib
try:
    joblib.dump(results,'results')
except:
    pass

In [None]:
filename = 'model_v1.pk'
with open('./'+filename, 'wb') as file:
    joblib.dump(results, file) 
file.close()
        