### 2. FastText

In this section we wil use an other model developed by Facebook, which is built around the concept of embeddings. The main difference compared to the standard word2vec tecnique is that we will avoid the problem of OOV and word srtructure. 

#### a. Importing the required libraries

In [55]:
import pandas as pd
import numpy as np

from fasttext import train_supervised

#Sklearn Library
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import re
import nltk.corpus 
from nltk.corpus import stopwords   # import library for stepword
from nltk.stem.porter import PorterStemmer  # import library for Stemming
from nltk.stem import WordNetLemmatizer   # import library for lemmattazing

#### b. Load the dataset

In [86]:
# Load and Read the two dataset, Train and Test
df_train = pd.read_csv("/Users/stefano/UDACITY/Data Engeeniring/Capstone/Data/train.csv")   # used to train our model

df_train.shape

(159571, 8)

In [87]:
df_train.head()   # Display the first n rows of our training dataset

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


#### c. Pre processing data

In this section we will clean and prepare the testing data for FastText. 

1. we punctuation, digit, etc. 

2. we normalize text, i.e. stemming and lemmatisation

3. we convert the binary labels into __class0__ and __class 1__ as FastText use this vocabulary for prediction

In [88]:
# Creating a function for text pre-processing, which is a necessary steps for any NLP algorithm.

# inizialize the main function variables
def pre_processing_txt(text, stemm=True, lemm=True, text_stopwords=None):
    
    text = ''.join((word for word in text if not word.isdigit()))    # eliminate all digits from our target text
    
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())   # Regex text + all text lower capital
    
    ## Tokenize our target text (convert from string to list)
    token_text = text.split()
    
    ## remove Stopwords (all english vocabulary that is not providing meaning added value)
    if text_stopwords is not None:
        token_text = [word for word in token_text if word not in 
                    text_stopwords]
                
    ## Stemming process (remove -ing, -ly, ...)
    if stemm == True:
        porter_stemmer = nltk.stem.porter.PorterStemmer()  
        token_text = [porter_stemmer.stem(word) for word in token_text]
                
    ## Lemmatisation (convert the word into root word)
    if lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        token_text = [lem.lemmatize(word) for word in token_text]
            
    ## back to text string from list
    text = " ".join(token_text)
    return text

In [89]:
import sys
sys.setrecursionlimit(10000)

In [90]:
# Test pre_processing_txt function

df_train["comment_text"] = df_train["comment_text"].apply(pre_processing_txt)
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explan whi the edit made under my usernam hard...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww he match thi background colour im seeming...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im realli not tri to edit war it just ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cant make ani real suggest on improv i ...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero ani chanc you rememb what ...,0,0,0,0,0,0


In [182]:

print(df_train.loc[[497]])

                   id                   comment_text        toxic  \
497  014c96f873db11ff  nazi filth is impolit jan utc  __class__1    

    severe_toxic      obscene       threat       insult identity_hate  
497  __class__0   __class__0   __class__0   __class__1    __class__0   


In [91]:
''' THIS IS actually my code but it is super slow

label_prefix = "__class__"

for index, row in df_train.iterrows():
    df_train["toxic"] = label_prefix + str(row["toxic"])
    df_train["severe_toxic"] = label_prefix + str(row["severe_toxic"])
    df_train["obscene"] = label_prefix + str(row["obscene"])
    df_train["threat"] = label_prefix + str(row["threat"])
    df_train["insult"] = label_prefix + str(row["insult"])
    df_train["identity_hate"] = label_prefix + str(row["identity_hate"])
df_train.head()

'''

' THIS IS actually my code but it is super slow\n\nlabel_prefix = "__class__"\n\nfor index, row in df_train.iterrows():\n    df_train["toxic"] = label_prefix + str(row["toxic"])\n    df_train["severe_toxic"] = label_prefix + str(row["severe_toxic"])\n    df_train["obscene"] = label_prefix + str(row["obscene"])\n    df_train["threat"] = label_prefix + str(row["threat"])\n    df_train["insult"] = label_prefix + str(row["insult"])\n    df_train["identity_hate"] = label_prefix + str(row["identity_hate"])\ndf_train.head()\n\n'

In [92]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']   # target columns
label_prefix = "__class__"   # prefix to be added

# for loop to add prefix to each target column
for col in df_train[classes]:
    df_train[col] = label_prefix + df_train[col].astype(str) + ' '


#### d. Splitting dataset into training and validation

In [117]:
#Prepare the dataset to be splitted into Train and Test data
classes = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

X = df_train.comment_text  #predicotors
y = df_train[classes] # independent variable

# Split of the dataset. Test size 33% and random seed = 42
X_train, X_test = train_test_split(df_train, test_size=0.33, shuffle = True, random_state=42) 

#### e. Training the model

- as FastText is accepting only .cvs file as input, we cannot use a Multi-Output Classifier;
- we will use the train_supervised function (as the comments are alreadzy prelabelled;
- as in FastText is not possible to calculate the probability of each comment, we will need to loop trhough the whole dataset to obtain this metrics.

In [134]:
prediction = []   # container for all probability per each comments

for i in classes: 
    model = '/Users/stefano/UDACITY/Data Engeeniring/Capstone/working/model.csv'   # Saving the model
    
    #Saving to a .csv file the output
    X_train[[i, "comment_text"]].to_csv(model, index=False, header=None, columns=[i, "comment_text"]) 
    
    #Use FastText train_supervised
    model = train_supervised(input=model, label="__class__", lr=1.0, epoch=2, loss='ova', wordNgrams=2, dim=100, thread=2, verbose=100)
    
    
    # container for all probability per each comments for validation set
    prediction_val = []
    
    #loop over validation set
    for g in X_test["comment_text"].values:
        
        #Get the prediction per each class
        pred_val = model.predict(g, k = 2)[1][1]
        
        #Append the prediction obtained
        prediction_val.append(pred_val)
        
    #Append all prediction to the first list
    prediction.append(prediction_val)

In [135]:
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

X_test["toxic"] = X_test.toxic.str.replace('__class__' , '')
X_test["severe_toxic"] = X_test.severe_toxic.str.replace('__class__' , '')
X_test["obscene"] = X_test.obscene.str.replace('__class__' , '')
X_test["threat"] = X_test.threat.str.replace('__class__' , '')
X_test["insult"] = X_test.insult.str.replace('__class__' , '')
X_test["identity_hate"] = X_test.identity_hate.str.replace('__class__' , '')
X_test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
119105,7ca72b5b9c688e9e,geez are you forget weve alreadi discus whi ma...,0,0,0,0,0,0
131631,c03f72fd8f8bf54f,carioca rfa thank for your support on my reque...,0,0,0,0,0,0
125326,9e5b8e8fc1ff2e84,birthday no worri it what i do enjoy ur daytalk,0,0,0,0,0,0
111256,5332799e706665a6,pseudosci categori im assum that thi articl is...,0,0,0,0,0,0
83590,dfa7d8f0b4366680,and if such phrase exist it would be provid by...,0,0,0,0,0,0


In [136]:
y_test = X_test[classes].astype("int").to_numpy()

all_preds_array = np.transpose(np.array(prediction))
all_preds_array

array([[1.56114891e-01, 7.13142985e-03, 4.74358723e-02, 1.55876123e-03,
        9.80893224e-02, 2.16253344e-02],
       [1.00000034e-05, 4.68312966e-04, 3.89984576e-04, 1.00000034e-05,
        5.98408747e-04, 5.13335632e-04],
       [3.62300538e-02, 2.33316235e-03, 2.09742412e-02, 1.00000034e-05,
        1.85565669e-02, 1.00000034e-05],
       ...,
       [1.14356913e-03, 8.39589280e-04, 1.25484320e-03, 1.00000034e-05,
        3.18268221e-03, 8.65900831e-04],
       [5.29304962e-04, 4.97857109e-04, 1.60784519e-03, 1.00000034e-05,
        8.65900831e-04, 1.00000034e-05],
       [1.04223099e-03, 3.28306481e-03, 1.65848271e-03, 3.18268221e-03,
        1.20637799e-02, 2.99103255e-03]])

In [145]:
def auc_roc(y_test, y_pred):
    auc_roc = []
    for i in range(y_test.shape[1]):
        aucs.append(roc_auc_score(y_test[:,i],y_pred[:,i]))
    return aauc_roc

In [138]:
mean_auc = mean(auc_roc(y_test,all_preds_array))
mean_auc

0.7737228647064045

The ROC-AUC has scored 77% which is actually a bit better compared to the 72% of the Forest model we have used in the Beg of Words model. 