In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression
from scipy import linalg
import matplotlib.pyplot as pltb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mujahidabdullahi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df_sample = train_df.iloc[1:100000,:]
val_df = train_df.iloc[100000:-1,:]
train_df_sample.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0


In [3]:
def add_non_toxic(*argv):
    if sum(argv) > 0:
        val = 0
    else:
        val = 1
    return(val)   

In [4]:
def prep_data(df):
    df['non_toxic'] = df.apply(lambda df: add_non_toxic(
    df['toxic'], df['severe_toxic'],df['obscene'], df['threat'], df['insult'], df['insult']), axis = 1)
    df = pd.melt(df, id_vars = ['id', 'comment_text'], value_vars = ['toxic', 'severe_toxic', 'obscene', 
                             'threat', 'insult', 'identity_hate', 'non_toxic'])
    df = df[df['value'] == 1].drop(['value'], axis = 1)
    df.rename({'variable':'label'}, axis = 1, inplace = True)
    
    return(df)

In [5]:
train_df, val_df= [prep_data(df) for df in [train_df, val_df]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
train_df_comments = train_df['comment_text'].tolist()

In [7]:
toxic_comments = train_df['comment_text'][train_df['label'] == 'toxic']

In [8]:
len(toxic_comments)

15294

In [9]:
print("\n".join(toxic_comments[:10]))

COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
Hey... what is it..
@ | talk .
What is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?

Bye! 

Don't look, come or think of comming back! Tosser.
You are gay or antisemmitian? 

Archangel WHite Tiger

Meow! Greetingshhh!

Uh, there are two ways, why you do erased my comment about WW2, that holocaust was brutally slaying of Jews and not gays/Gypsys/Slavs/anyone...

1 - If you are anti-semitian, than shave your head bald and go to the skinhead meetings!

2 - If you doubt words of the Bible, that homosexuality is a deadly sin, make a pentagram tatoo on your forehead go to the satanistic masses with your gay pals!


Beware of the Dark Side!
FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!
I'm Sorry 

I'm sorry I screwed around with someones talk page.  It was very bad to do.  I know how having the t

In [10]:
non_toxic_comments = train_df.comment_text[train_df['label'] == 'non_toxic']

In [11]:
len(non_toxic_comments)

143400

In [12]:
print("\n".join(non_toxic_comments[:10]))

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
"
More
I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want 

In [13]:
vectorizer = CountVectorizer(stop_words='english')

In [14]:
word_doc_matrix = vectorizer.fit_transform(train_df_comments).todense()

In [15]:
vocab = np.array(vectorizer.get_feature_names())

In [16]:
print(len(vocab), word_doc_matrix.shape)

189460 (178498, 189460)


In [17]:
vocab[10000:10010]

array(['aagadu', 'aage', 'aagf', 'aagin', 'aah', 'aahahahahahaha',
       'aahank', 'aahh', 'aahil', 'aahoa'], dtype='<U4955')

In [18]:
##u, s, v = np.linalg.svd(word_doc_matrix, full_matrices = False)

In [19]:
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
def preprocess_text(df):
    detokenized_doc = []
    df['comment_text'] = df['comment_text'].str.replace("[^a-zA-Z]", " ")
    df['comment_text'] = df['comment_text'].apply(lambda x: x.split())
    df['comment_text'] = df['comment_text'].apply(lambda x: [item for item in x if item.lower() not in stopwords])
    df['comment_text'] = df['comment_text'].apply(lambda x: ' '.join(x))
    
    return(df)                                                     

In [21]:
train_df, val_df, test_df = [preprocess_text(df) for df in [train_df, val_df, test_df]]

Using fast ai library

In [22]:
from fastai import *
from fastai.text import *

In [23]:
path = '/Users/mujahidabdullahi/.fastai/data/toxic_comments_classification'

In [24]:
toxic_comments = TextClasDataBunch.from_df(path, train_df=train_df, valid_df=val_df, test_df=test_df, label_cols=2,
                                      text_cols=["comment_text"], no_check=True)

In [25]:
print(toxic_comments.train_dl.x[1000], toxic_comments.train_dl.y[1000])

xxbos xxmaj hey xxmaj rama xxmaj hey xxmaj rama xxmaj xxunk whatever name keep nose business tend affairs want post message users xxmaj david xxmaj xxunk talk pages business stay toxic


In [26]:
print(len(toxic_comments.train_dl.vocab.itos), len(toxic_comments.train_dl.vocab.stoi))

60000 284532


In [27]:
toxic_comments.vocab.itos[2000:2020]

['historians',
 'industry',
 'advance',
 'base',
 'tend',
 'applies',
 'push',
 'philosophy',
 'understood',
 'uncivil',
 'profile',
 'wanna',
 'mainly',
 'activity',
 'expanded',
 'portal',
 'hide',
 'whoever',
 'judge',
 'draft']

In [28]:
def get_term_doc_matrix(label_list, vocab_len):
    j_indices = []
    indptr = []
    values = []
    indptr.append(0)

    for i, doc in enumerate(label_list):
        feature_counter = Counter(doc.data)
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))
        
#     return (values, j_indices, indptr)

    return scipy.sparse.csr_matrix((values, j_indices, indptr),
                                   shape=(len(indptr) - 1, vocab_len),
                                   dtype=int)

In [29]:
word_doc_train = get_term_doc_matrix(toxic_comments.train_dl.x, len(toxic_comments.vocab.itos))

In [30]:
word_doc_valid = get_term_doc_matrix(toxic_comments.valid_dl.x, len(toxic_comments.vocab.itos))

In [31]:
print(word_doc_train.todense().shape, word_doc_valid.todense().shape)

(178498, 60000) (66575, 60000)


In [32]:
identity_hate = toxic_comments.train_dl.y.c2i['identity_hate']
insult = toxic_comments.train_dl.y.c2i['insult']
non_toxic = toxic_comments.train_dl.y.c2i['non_toxic']
obscene = toxic_comments.train_dl.y.c2i['obscene']
severe_toxic = toxic_comments.train_dl.y.c2i['severe_toxic']
threat = toxic_comments.train_dl.y.c2i['threat']
toxic = toxic_comments.train_dl.y.c2i['toxic']

In [33]:
np.squeeze(np.asarray(word_doc_train[toxic_comments.train_dl.y.items == "identity_hate"].sum(0)))


  """Entry point for launching an IPython kernel.


array([0, 0, 1, 0, ..., 0, 0, 0, 0], dtype=int64)

In [34]:
def probability(word_doc_matrix, category):
    n_category = np.squeeze(np.asarray(word_doc_matrix[toxic_comments.train_dl.y.items == category].sum(0)))
    p_category = (n_category + 1)/((toxic_comments.train_dl.y.items == category).sum() + 1)
    return(p_category)  

In [35]:
pr_identity_hate, pr_insult, pr_non_toxic,pr_obscene,\
pr_severe_toxic, pr_threat, pr_toxic = [probability(word_doc_train, category) for \
                                                            category in [identity_hate, insult, non_toxic,obscene,\
                                                                         severe_toxic, threat, toxic]]                                                                                                                                                                                                                                                                                      

In [36]:
r_identity_hate, r_insult, r_non_toxic,r_obscene,\
r_severe_toxic, r_threat, r_toxic = [np.log(x/np.mean([pr_identity_hate, pr_insult, pr_non_toxic,pr_obscene, pr_severe_toxic, \
                                      pr_threat, pr_toxic])) for x in  [pr_identity_hate, pr_insult, pr_non_toxic, \
                                                                   pr_obscene,  pr_severe_toxic, pr_threat, pr_toxic]]
    

In [37]:
b_identity_hate, b_insult, b_non_toxic,b_obscene,\
b_severe_toxic, b_threat, b_toxic = [np.log(x.mean()/np.mean(np.mean([pr_identity_hate, pr_insult, pr_non_toxic,pr_obscene, pr_severe_toxic, \
                                      pr_threat, pr_toxic]))) for x in  [pr_identity_hate, pr_insult, pr_non_toxic, \
                                                                   pr_obscene,  pr_severe_toxic, pr_threat, pr_toxic]]
    

In [38]:
top_insulting_words = np.argpartition(r_toxic, -10)[-10:]
[print(k) for k in [toxic_comments.vocab.itos[i] for i in top_insulting_words]]

go
u
wikipedia
like
fuck
fucking
xxup
xxmaj
xxbos
xxunk


[None, None, None, None, None, None, None, None, None, None]

In [39]:
top_identity_hate_words = np.argpartition(r_identity_hate, -10)[-10:]
[print(k) for k in [toxic_comments.vocab.itos[i] for i in top_identity_hate_words]]

fucking
faggot
nigger
gay
fuck
jew
fat
xxmaj
xxbos
xxup


[None, None, None, None, None, None, None, None, None, None]

In [40]:
top_non_toxic_words = np.argpartition(r_non_toxic, -10)[-10:]
[print(k) for k in [toxic_comments.vocab.itos[i] for i in top_non_toxic_words]]

would
please
talk
article
wikipedia
page
xxup
xxmaj
xxbos
xxunk


[None, None, None, None, None, None, None, None, None, None]

In [41]:
print(word_doc_valid.shape, r_identity_hate.shape) 

(66575, 60000) (60000,)


In [42]:
log_likelyhood = [r_identity_hate, r_insult, r_non_toxic,r_obscene, r_severe_toxic, r_threat, r_toxic]
avg_log_likelyhood = [b_identity_hate, b_insult, b_non_toxic,b_obscene, b_severe_toxic, b_threat, b_toxic]

In [43]:
predictions_all_val = list(map(lambda x, y: word_doc_valid @ x.T + y, log_likelyhood, avg_log_likelyhood))
predictions_all_val

[array([35.710597, 28.647658, 38.363015, 77.651178, ..., 35.791084, 21.364082, 44.33685 , 27.436918]),
 array([30.70846 , 31.265318, 35.184624, 72.119036, ..., 34.699944, 21.908641, 37.827717, 28.250304]),
 array([22.853079, 17.742472, 36.114789, 32.825966, ..., 61.718221, 23.333437, 42.910034, 38.362042]),
 array([31.459666, 28.497778, 33.623783, 71.705547, ..., 38.364751, 22.32773 , 37.504851, 29.099556]),
 array([30.761464, 32.512364, 32.250011, 84.423839, ..., 31.325096, 21.209258, 40.681086, 22.457835]),
 array([36.423088, 28.904838, 36.29625 , 67.707815, ..., 38.393286, 28.060976, 48.40332 , 28.037028]),
 array([34.14729 , 29.516133, 35.735309, 68.836833, ..., 41.11718 , 23.478427, 37.387809, 29.884888])]

In [44]:
pred_index = list(np.argmax(predictions_all_val, 0))

In [45]:
prediction_accuracy = np.mean(list(map(lambda x, y : x == y, pred_index, list(toxic_comments.valid_dl.y.items))))

Binarized naive bayes

In [46]:
word_doc_binarized = word_doc_train.sign()

In [47]:
word_doc_binarized.todense()

matrix([[0, 0, 1, 0, ..., 0, 0, 0, 0],
        [0, 0, 1, 0, ..., 0, 0, 0, 0],
        [0, 0, 1, 0, ..., 0, 0, 0, 0],
        [0, 0, 1, 0, ..., 0, 0, 0, 0],
        ...,
        [0, 0, 1, 0, ..., 0, 0, 0, 0],
        [0, 0, 1, 0, ..., 0, 0, 0, 0],
        [0, 0, 1, 0, ..., 0, 0, 0, 0],
        [0, 0, 1, 0, ..., 0, 0, 0, 0]])

In [48]:
pr_identity_hate_bn, pr_insult_bn, pr_non_toxic_bn,pr_obscene_bn,\
pr_severe_toxic_bn, pr_threat_bn, pr_toxic_bn = [probability(word_doc_binarized, category) for \
                                                            category in [identity_hate, insult, non_toxic,obscene,\
                                                                         severe_toxic, threat, toxic]]  

In [49]:
r_identity_hate_bn, r_insult_bn, r_non_toxic_bn,r_obscene_bn,\
r_severe_toxic_bn, r_threat_bn, r_toxic_bn = [np.log(x/np.mean([pr_identity_hate_bn, pr_insult_bn, pr_non_toxic_bn,pr_obscene_bn, pr_severe_toxic_bn, \
                                      pr_threat, pr_toxic])) for x in  [pr_identity_hate_bn, pr_insult_bn, pr_non_toxic_bn, \
                                                                   pr_obscene_bn,  pr_severe_toxic_bn, pr_threat_bn, pr_toxic_bn]]

In [50]:
b_identity_hate_bn, b_insult_bn, b_non_toxic_bn,b_obscene_bn,\
b_severe_toxic_bn, b_threat_bn, b_toxic_bn = [np.log(x.mean()/np.mean(np.mean([pr_identity_hate_bn, pr_insult_bn, pr_non_toxic_bn,pr_obscene_bn, pr_severe_toxic_bn, \
                                      pr_threat_bn, pr_toxic_bn]))) for x in  [pr_identity_hate_bn, pr_insult_bn, pr_non_toxic_bn, \
                                                                   pr_obscene_bn,  pr_severe_toxic_bn, pr_threat_bn, pr_toxic_bn]]

In [51]:
top_insulting_words_bn = np.argpartition(r_toxic_bn, -10)[-10:]
[print(k) for k in [toxic_comments.vocab.itos[i] for i in top_insulting_words_bn]]

go
fucking
like
get
wikipedia
fuck
xxup
xxmaj
xxbos
xxunk


[None, None, None, None, None, None, None, None, None, None]

In [52]:
top_identity_hate_words_bn = np.argpartition(r_identity_hate_bn, -10)[-10:]
[print(k) for k in [toxic_comments.vocab.itos[i] for i in top_identity_hate_words_bn]]

shit
faggot
go
fucking
like
gay
xxmaj
xxup
xxbos
fuck


[None, None, None, None, None, None, None, None, None, None]

In [53]:
log_likelyhood = [r_identity_hate_bn, r_insult_bn, r_non_toxic_bn,r_obscene_bn, r_severe_toxic_bn, r_threat_bn, r_toxic_bn]
avg_log_likelyhood = [b_identity_hate_bn, b_insult_bn, b_non_toxic_bn,b_obscene_bn, b_severe_toxic_bn, b_threat_bn, b_toxic_bn]


In [54]:
predictions_all_val_bn = list(map(lambda x, y: word_doc_valid.sign() @ x.T + y, log_likelyhood, avg_log_likelyhood))
predictions_all_val_bn

[array([31.475845, 27.1714  , 39.68173 , 29.59837 , ..., 40.091648, 22.763644, 21.724847, 29.155634]),
 array([27.039495, 26.966857, 35.182129, 26.887888, ..., 38.517033, 22.649561, 15.308577, 29.599753]),
 array([26.01916 , 16.895582, 34.903207, 13.687447, ..., 62.754843, 23.89858 , 16.004504, 38.729749]),
 array([27.687488, 26.63898 , 33.712374, 27.310464, ..., 40.914053, 22.728843, 14.712221, 30.295257]),
 array([26.087095, 26.701964, 29.844244, 29.726823, ..., 34.358271, 20.743167, 16.339791, 23.454324]),
 array([33.062303, 28.232593, 38.232978, 28.153114, ..., 42.904236, 26.798872, 27.452647, 29.937506]),
 array([28.574514, 25.836765, 35.675138, 25.76594 , ..., 43.283309, 24.067169, 14.635176, 30.880935])]

In [55]:
pred_index_bn = list(np.argmax(predictions_all_val_bn, 0))
prediction_accuracy_bn = (pred_index_bn == toxic_comments.valid_dl.y.items).mean()
prediction_accuracy_bn

0.5204205782951559

Logistic regression

In [56]:
m = LogisticRegression(random_state=0, solver='lbfgs', C = 0.1, multi_class= 'multinomial')
m.fit(word_doc_train, toxic_comments.train_dl.y.items.astype(int))



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [57]:
pred_log = m.predict(word_doc_valid)

In [58]:
prediction_accuracy_log = (toxic_comments.valid_dl.y.items == pred_log).mean()
prediction_accuracy_log

0.8389936162223056

Logistic regression- binarized word_doc_matrix

In [59]:
m_bn = LogisticRegression(random_state=0, solver='lbfgs', C = 0.1, multi_class= 'multinomial')
m_bn.fit(word_doc_binarized, toxic_comments.train_dl.y.items.astype(int))



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [60]:
pred_bin_log = m_bn.predict(word_doc_valid.sign())

In [61]:
prediction_accuracy_bin_log = (toxic_comments.valid_dl.y.items == pred_bin_log).mean()
prediction_accuracy_bin_log

0.8605182125422456

N gram models using CountVectorizer from scikitlearner

In [62]:
count_vcz = CountVectorizer(ngram_range=(1, 3), preprocessor=noop, tokenizer=noop)

In [63]:
docs_train = toxic_comments.train_dl.x
docs_valid = toxic_comments.valid_dl.x

In [64]:
train_words = [[docs_train.vocab.itos[i] for i in doc.data] for doc in docs_train]
valid_words = [[docs_valid.vocab.itos[i] for i in doc.data] for doc in docs_valid]

In [65]:
train_ngram_doc = count_vcz.fit_transform(train_words)

In [66]:
valid_ngram_doc = count_vcz.transform(valid_words)

Naive bayes with ngram word document matrix

In [67]:
train_ngram_doc.shape, valid_ngram_doc.shape

((178498, 6631681), (66575, 6631681))

In [68]:
pr_identity_hate_ngram, pr_insult_ngram, pr_non_toxic_ngram,pr_obscene_ngram,\
pr_severe_toxic_ngram, pr_threat_ngram, pr_toxic_ngram = [probability(train_ngram_doc, category) for \
                                                            category in [identity_hate, insult, non_toxic,obscene,\
                                                                         severe_toxic, threat, toxic]] 

In [69]:
r_identity_hate_ngram, r_insult_ngram, r_non_toxic_ngram,r_obscene_ngram,\
r_severe_toxic_ngram, r_threat_ngram, r_toxic_ngram = [np.log(x/np.mean([pr_identity_hate_ngram, pr_insult_ngram, pr_non_toxic_ngram,pr_obscene_ngram, pr_severe_toxic_ngram, \
                                      pr_threat_ngram, pr_toxic_ngram])) for x in  [pr_identity_hate_ngram, pr_insult_ngram, pr_non_toxic_ngram, \
                                                                   pr_obscene_ngram,  pr_severe_toxic_ngram, pr_threat_ngram, pr_toxic_ngram]]

In [70]:
b_identity_hate_ngram, b_insult_ngram, b_non_toxic_ngram,b_obscene_ngram,\
b_severe_toxic_ngram, b_threat_ngram, b_toxic_ngram = [np.log(x.mean()/np.mean(np.mean([pr_identity_hate_ngram, pr_insult_ngram, pr_non_toxic_ngram,pr_obscene_ngram, pr_severe_toxic_ngram, \
                                      pr_threat_ngram, pr_toxic_ngram]))) for x in  [pr_identity_hate_ngram, pr_insult_ngram, pr_non_toxic_ngram, \
                                                                   pr_obscene_ngram,  pr_severe_toxic_ngram, pr_threat_ngram, pr_toxic_ngram]]

In [71]:
top_insulting_words_ngram = np.argpartition(r_toxic_ngram, -10)[-10:]
[print(k) for k in [count_vcz.get_feature_names()[i] for i in top_insulting_words_ngram]]

u
fucking
xxbos
wikipedia
like
xxmaj
fuck
xxup
xxunk
xxbos xxmaj


[None, None, None, None, None, None, None, None, None, None]

In [72]:
log_likelyhood_ngram = [r_identity_hate_ngram, r_insult_ngram, r_non_toxic_ngram,r_obscene_ngram, r_severe_toxic_ngram, r_threat_ngram, r_toxic_ngram]
avg_log_likelyhood_ngram = [b_identity_hate_ngram, b_insult_ngram, b_non_toxic_ngram,b_obscene_ngram, b_severe_toxic_ngram, b_threat_ngram, b_toxic_ngram]

In [73]:
predictions_all_val_ngram = list(map(lambda x, y: valid_ngram_doc @ x.T + y, log_likelyhood_ngram, avg_log_likelyhood_ngram))
predictions_all_val_ngram

[array([ 54.730974,  49.91606 ,  77.472568, 141.2476  , ...,  67.386026,  32.712261,  70.418311,  41.561865]),
 array([ 12.251294,  47.035566,  37.975186, 118.252324, ...,  -5.14457 ,  16.435019,  21.647863,  16.338353]),
 array([-56.931062,  10.761598, -21.908458, -32.770233, ..., -50.012771,   3.663297, -11.947423,   7.245555]),
 array([ 10.264322,  43.870041,  21.119431, 114.668362, ...,  -4.314838,  17.691604,  20.789914,  16.225503]),
 array([ 46.569574,  51.324137,  54.049586, 155.816525, ...,  57.612381,  30.926817,  62.297681,  34.313347]),
 array([ 79.449278,  54.557813,  84.097353, 116.05599 , ..., 118.169225,  50.978835, 102.732784,  59.969331]),
 array([ 17.91361 ,  40.698917,  24.976848, 103.943857, ..., -24.061947,  14.806187,   7.206227,  10.019007])]

In [74]:
pred_index_ngram = list(np.argmax(predictions_all_val_ngram, 0))
prediction_accuracy_ngram = (pred_index_ngram == toxic_comments.valid_dl.y.items).mean()
prediction_accuracy_ngram

0.03385655276004506

Ngram with logistic regression

In [75]:
m_ngram = LogisticRegression(random_state=0, solver='lbfgs', C = 0.1, multi_class= 'multinomial')
m_ngram.fit(train_ngram_doc, toxic_comments.train_dl.y.items.astype(int))



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [76]:
pred_index_ngram_log = m_ngram.predict(valid_ngram_doc)
prediction_accuracy_ngram_log = (pred_index_ngram_log == toxic_comments.valid_dl.y.items).mean()
prediction_accuracy_ngram_log

0.8415471273000376

Ngram binarized

In [77]:
m_ngram_bn = LogisticRegression(random_state=0, solver='lbfgs', C = 0.1, multi_class= 'multinomial')
m_ngram_bn.fit(train_ngram_doc.sign(), toxic_comments.train_dl.y.items.astype(int))



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [78]:
pred_index_ngram_log_bn = m_ngram.predict(valid_ngram_doc.sign())
prediction_accuracy_ngram_log_bn = (pred_index_ngram_log_bn == toxic_comments.valid_dl.y.items).mean()
prediction_accuracy_ngram_log_bn

0.8394442358242583

Language modelling