<a href="https://colab.research.google.com/github/markerenberg/AI-Poker-Player/blob/master/FastText_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Check if GPU is enabled.
# The call is the cell's last expression, so the device name it returns is shown
# as the cell output (the recorded output of this run is an empty string).
import tensorflow as tf
tf.test.gpu_device_name()

''

In [2]:
##
## =======================================================
## Mark Erenberg 
## Toxic Comment Classification Challenge
## =======================================================
##

# Objective: Create a model which predicts a probability of each type of toxicity for each comment.

# import dependencies and files

import os
import time
import pandas as pd
import numpy as np
from scipy import sparse
import itertools
import matplotlib.pyplot as plt
import re

import lightgbm as lgb

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize

import gensim
import gensim.models.keyedvectors as word2vec
from gensim.models.fasttext import FastText

import spacy
import en_core_web_sm
#spacy_nlp = en_core_web_sm.load()
#spacy_nlp = spacy.load('en_core_web_sm')

from collections import Counter, defaultdict

from sklearn import utils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.optimizers import Adam
from keras.callbacks import Callback

################### Data Loading ###################
# Both CSV files are read relative to the current working directory. Missing values
# in every column are replaced by a single space (presumably so that the text
# cleaning below only ever receives strings).
#os.chdir('C:\\Users\\marke\\Downloads\\Toxic Classification')
train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')

# Combined corpus: the de-duplicated (id, comment_text) rows of train followed by
# the test rows, re-indexed from 0. Labels are not carried into `df`.
train_text = train[['id','comment_text']].drop_duplicates()
df = pd.concat([train_text,test],ignore_index=True)

################### Data Cleaning ####################
# NLTK resources needed by the stop-word list, the WordNet lemmatizer, the POS
# tagger and the tokenizers.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Shared helpers used by the cleaning functions defined below.
wpt = nltk.WordPunctTokenizer()
# Stored as a set: clean_document() tests every token of every comment for
# membership, and a set lookup is O(1) whereas scanning the list returned by NLTK
# is linear in the number of stop words.
stop_words = set(nltk.corpus.stopwords.words('english'))
cv = CountVectorizer(min_df=0., max_df=1.)
stemmer = SnowballStemmer("english")
wnl = WordNetLemmatizer()


# Simple way to get the number of occurrences of a regex
def count_regexp_occ(regexp="", text=None):
    """Count the non-overlapping matches of ``regexp`` in ``text``.

    Parameters
    ----------
    regexp : str or compiled pattern
        Regular expression to search for.
    text : str or None
        Text to scan. ``None`` (the default) is treated as "no text".

    Returns
    -------
    int
        Number of matches found by ``re.findall``; 0 when ``text`` is ``None``.
    """
    # re.findall raises TypeError on None, so the declared default would crash.
    if text is None:
        return 0
    return len(re.findall(regexp, text))

# Determine if file words exist:
#print(len(df[df['comment_text'].str.contains('jpg')]))
#print(len(df[df['comment_text'].str.contains('jpeg')]))
#print(len(df[df['comment_text'].str.contains('http')]))
#print(len(df[df['comment_text'].str.contains('pdf')]))
#print(len(df[df['comment_text'].str.contains('html')]))

# Remove non-alphabetic characters and split tokens by spaces/newlines
def clean_document(doc):
    """Normalise one raw comment into a space-separated string of kept tokens.

    Steps, in order: lower-case the text, expand English contractions, delete
    every character that is neither an ASCII letter nor whitespace, tokenize
    with the module-level ``wpt`` tokenizer, and drop every token found in the
    module-level ``stop_words`` collection.

    Parameters
    ----------
    doc : str
        Raw comment text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).
    """
    # Ordered (regex, replacement) rules. The specific forms (won't, can't,
    # ain't) are listed before the generic "<word>n't" rule so that they are
    # rewritten first.
    contraction_rules = (
        (r'(W|w)on\'t', r'will not'),
        (r'(C|c)an\'t', r'can not'),
        (r'(I|i)\'m', r'i am'),
        (r'(A|a)in\'t', r'is not'),
        (r'(\w+)\'ll', r'\g<1> will'),
        (r'(\w+)n\'t', r'\g<1> not'),
        (r'(\w+)\'ve', r'\g<1> have'),
        (r'(\w+)\'s', r'\g<1> is'),
        (r'(\w+)\'re', r'\g<1> are'),
        (r'(\w+)\'d', r'\g<1> would'),
    )

    text = doc.lower()
    for regex, replacement in contraction_rules:
        text = re.compile(regex).sub(replacement, text)

    # Anything that is not a letter or whitespace is removed outright (not
    # replaced by a space), then the ends are trimmed.
    text = re.sub(r'[^a-zA-Z\s]+', '', text).strip()

    kept = []
    for token in wpt.tokenize(text):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

# POS tagging for lemmatizer
def get_wordnet_pos(tag):
    """
    Translate a POS tag into the WordNet POS constant expected by the
    lemmatizer, based on the tag's first letter:
    J -> adjective, V -> verb, N -> noun, R -> adverb.
    Any other tag falls back to noun, the lemmatizer's default POS.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if tag.startswith(prefix):
            return wn_pos
    # No known prefix matched.
    return wordnet.NOUN


# Stem tokens (UNUSED):
def stem_comment(comment):
    """Return a list with the module-level ``stemmer`` applied to each item of ``comment``."""
    return list(map(stemmer.stem, comment))
        
# Lemmatize comments:
# NOTE: POS_TAG was changed to V for all of them because it's the only one that works
def lemmatize_comment(comment, use_pos_tags=False):
    """Lemmatize the whitespace-separated words of ``comment``.

    Parameters
    ----------
    comment : str
        Text to lemmatize; it is split on whitespace.
    use_pos_tags : bool, default False
        When False, every word is lemmatized as a verb (``'v'``), which is what
        this function has always returned. When True, each word is tagged with
        ``nltk.pos_tag`` and the tag is mapped to a WordNet POS through
        ``get_wordnet_pos`` before lemmatizing.

    Returns
    -------
    list of str
        One lemma per input word, in order.
    """
    words = comment.split()
    if not use_pos_tags:
        # The tagger is not run on this path: its output was previously computed
        # and then ignored, which only added cost without changing the result.
        return [wnl.lemmatize(word, 'v') for word in words]
    return [wnl.lemmatize(word, get_wordnet_pos(tag))
            for (word, tag) in nltk.pos_tag(words)]
                

# Cleaned text and its whitespace-split token list, first for the combined
# train+test corpus and then for the labelled training frame.
df['clean_comments'] = df['comment_text'].map(clean_document)
df['clean_comments_list'] = df['clean_comments'].map(str.split)
train['clean_comments'] = train['comment_text'].map(clean_document)
train['clean_comments_list'] = train['clean_comments'].map(str.split)

# Lemmatized variant, currently disabled:
# train['clean_lemmed'] = [lemmatize_comment(x) for x in train['clean_comments']]
# train['clean_lemmed'] = [x.drop_duplicates() for x in train['clean_lemmed']]

# Target columns of the training frame, one per toxicity type.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
## ========== Feature Engineering=====================

# Set values for various parameters
feature_size = 300    # Word vector dimensionality
window_context = 10          # Context window size
min_word_count = 5   # Minimum word count
sample = 1e-3   # Which higher-frequency words are randomly downsampled
sg = 1          # sg {0,1} = 1 if skip-gram, 0 if CBOW,
epochs = 50     # epochs = number of training epochs over corpus (default 5)

# Token lists of the combined train+test corpus.
tokenized_corpus = df['clean_comments_list']

# Fit FastText model on pre-processed corpus
start_time = time.time()
ft_model = FastText(tokenized_corpus, size=feature_size, window=window_context,
                    min_count=min_word_count,sample=sample, sg=sg, iter=epochs)
print("--- %s minutes ---" % ((time.time() - start_time)/60))
# Plain dict view of the in-vocabulary words and their vectors.
w2v = {w: vec for w, vec in zip(ft_model.wv.index2word, ft_model.wv.syn0)}

# NOTE: this rebinds the name `word2vec`, which the import cell uses as an alias
# for gensim.models.keyedvectors; from here on it refers to the trained model.
word2vec = ft_model
# Length of one word vector. This must be the embedding dimensionality, not
# len(w2v) (the vocabulary size): tf2vec_transform uses it to build the zero
# vector for comments without any known word, and that vector has to have the
# same length as every other comment vector.
w2v_dim = feature_size

def tf2vec_fit(X):
    """Fit a TF-IDF vectorizer on pre-tokenized documents and return IDF weights.

    Parameters
    ----------
    X : iterable of token sequences
        Each document is passed through unchanged (identity analyzer), so its
        items are used directly as the vocabulary terms.

    Returns
    -------
    collections.defaultdict
        Maps each fitted term to its IDF value. Unknown terms map to the
        largest fitted IDF.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode',
                                 analyzer=lambda x: x)
    vectorizer.fit(X)

    idf_values = vectorizer.idf_
    # A never-seen word is at least as rare as the rarest known word, hence
    # the maximum known IDF serves as the default weight.
    max_idf = max(idf_values)

    word2weight = defaultdict(lambda: max_idf)
    for term, column in vectorizer.vocabulary_.items():
        word2weight[term] = idf_values[column]
    return word2weight

def tf2vec_transform(X, word2weight):
    """Turn each document into the mean of its weighted word vectors.

    Parameters
    ----------
    X : iterable of token sequences
        Documents to embed.
    word2weight : mapping
        Per-word weight, looked up with ``word2weight[w]``.

    Returns
    -------
    list of numpy.ndarray
        One vector per document. Words for which ``w in word2vec`` is false are
        skipped; a document with no usable word yields ``np.zeros(w2v_dim)``.
        Both ``word2vec`` and ``w2v_dim`` are module-level names.
    """
    doc_vectors = []
    for words in X:
        weighted = [word2vec[w] * word2weight[w] for w in words if w in word2vec]
        if not weighted:
            # Nothing to average: fall back to a single zero vector.
            weighted = [np.zeros(w2v_dim)]
        doc_vectors.append(np.mean(weighted, axis=0))
    return doc_vectors



In [0]:
# Token lists of the labelled training comments.
X = train['clean_comments_list']
# IDF weights are fitted on X itself and then used to weight the word vectors
# that are averaged for each comment; the result is a list with one vector per comment.
tf2vec_features = tf2vec_transform(X,tf2vec_fit(X))
# Same features as a DataFrame (one row per comment).
tf2vec_df = pd.DataFrame(tf2vec_features)
# NOTE(review): numpy is imported as `np` in this notebook, so the line below
# would need `np.savetxt` if it is ever re-enabled.
#numpy.savetxt("tf2vec_features.csv", tf2vec_features, delimiter=",")



In [0]:
# Stack the per-comment vectors row-wise into a 2-D (n_comments, vector_dim) array.
# np.concatenate(..., axis=0) on a list of 1-D arrays joins them end-to-end into one
# long 1-D vector instead, which loses the one-row-per-comment layout that the
# cross-validation code relies on when it indexes rows by fold.
tf2vec_matrix = np.vstack(tf2vec_features)
tf2vec_matrix = sparse.csr_matrix(tf2vec_matrix)

In [0]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: logistic regression on features scaled by a log-count ratio.

    ``fit`` computes, per feature column, the log of the ratio between the
    smoothed column sums of the rows labelled 1 and the rows labelled 0, multiplies
    the inputs element-wise by that ratio, and fits a ``LogisticRegression`` on the
    result. ``predict`` / ``predict_proba`` apply the same scaling before
    delegating to the fitted model.

    Parameters
    ----------
    C : float, default 1.0
        Forwarded to ``LogisticRegression``.
    dual : bool, default False
        Forwarded to ``LogisticRegression``. When True, the liblinear solver is
        selected, since that is the solver that implements the dual formulation.
    n_jobs : int, default 1
        Forwarded to ``LogisticRegression``.
    multi_class : str, default 'auto'
        Stored on the estimator but not forwarded to ``LogisticRegression``.

    Notes
    -----
    Inputs must support ``.multiply`` and boolean row selection (e.g. scipy sparse
    CSR matrices). The ratio is passed through ``np.log``, so a smoothed column
    sum that is zero or negative (possible when features can be negative) produces
    ``nan``/``inf`` entries in ``_r``.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1, multi_class='auto'):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs
        self.multi_class = multi_class

    def predict(self, x):
        """Return predicted class labels for ``x`` (scaled by the fitted ratio first)."""
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return class probabilities for ``x`` (scaled by the fitted ratio first)."""
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        """Fit the log-count ratio and the underlying logistic regression.

        Parameters
        ----------
        x : sparse matrix
            Training features, one row per sample.
        y : array-like of {0, 1}
            Binary labels. A pandas Series, a numpy array or a list are all
            accepted.

        Returns
        -------
        self
        """
        # np.asarray gives the same array as Series.values for pandas input, and
        # additionally accepts plain arrays/lists (which have no `.values`).
        y = np.asarray(y)

        def pr(x, y_i, y):
            # Smoothed per-column sum over the rows whose label equals y_i.
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        # The dual formulation is only implemented by liblinear; leaving the solver
        # at the library default makes LogisticRegression reject dual=True on
        # releases whose default solver is not liblinear.
        solver_kwargs = {'solver': 'liblinear'} if self.dual else {}
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs,
                                       **solver_kwargs).fit(x_nb, y)
        return self

In [0]:
# Out-of-fold evaluation: for every label, fit one NB-SVM per fold and report the
# mean validation ROC AUC.
training_features = tf2vec_matrix
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
seed = 1234
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# One prediction row per training sample (not per training-fold sample), so that
# each fold can write its validation predictions into its own rows.
n_rows = training_features.shape[0]
preds = np.zeros((n_rows, len(label_cols)))

for i, j in enumerate(label_cols):
    training_labels = train[j]
    scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(training_features, training_labels)):
        # The fold indices are positional, hence .iloc on the label Series.
        model = NbSvmClassifier(C=4, dual=True, n_jobs=2, multi_class='ovr').fit(
            training_features[trn_idx], training_labels.iloc[trn_idx])
        # predict_proba already applies the fitted ratio, so the raw features are
        # passed in; column 1 is the probability of the positive class. Only the
        # rows of the current validation fold are filled.
        preds[val_idx, i] = model.predict_proba(training_features[val_idx])[:, 1]
        scores.append(roc_auc_score(training_labels.iloc[val_idx], preds[val_idx, i]))
    # Mean over folds, then round to 2 decimals (np.mean's second positional
    # argument is the axis, so the rounding digits must go to round()).
    print('fit: ' + j + ' | score: ' + str(round(np.mean(scores), 2)))




In [0]:
# Scratch cell for inspecting the fold indices.
# tf2vec_features is the plain Python list returned by tf2vec_transform and has no
# .tocsr() method, so the CSR matrix is built from it explicitly (one row per comment).
training_features = sparse.csr_matrix(np.vstack(tf2vec_features))
seed = 1234
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# NOTE: enumerate(...) is a one-shot iterator; list(l) consumes it.
l = enumerate(folds.split(training_features, training_labels))
#training_features[list(l)[1][1][0]]

In [0]:
# Qualitative check of the trained embeddings: ask the model for the 20 entries it
# ranks as most similar to 'weird'; the result is displayed as the cell output.
ft_model.wv.most_similar(['weird'], topn=20)