In [1]:
from collections import Counter
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [3]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [14]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [5]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

In [6]:
def clean(comment):
    """
    (string)->(list of words)
    This function receives comments and returns clean word-list
    """
    lem=WordNetLemmatizer()
    eng_stopwords = set(stopwords.words("english"))
    
    #Convert to lower case
    comment=comment.lower()
    #remove \n
    comment=re.sub("\\n","",comment)
    # remove leaky elements like ip,user
    comment=re.sub("\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}","",comment)
    #removing usernames
    comment=re.sub("\[\[.*\]","",comment)
    
    #Split the sentences into words
    words=word_tokenize(comment)
    words = [w for w in words if not w in eng_stopwords]
    
    tags=nltk.pos_tag(words)
    
    lem_words=[]
    for i in range(len(words)):
        if get_wordnet_pos(tags[i][1])!='':
            lem_words.append(lem.lemmatize(tags[i][0], get_wordnet_pos(tags[i][1])))
        else:
            lem_words.append(tags[i][0])
       
    return lem_words

In [26]:
toxic=train[train['toxic']==1]

In [28]:
toxic.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
20,225701312,Why hasn't Alitalia been removed rom the allia...,1,0,0,0,0,0
26,293668009,"""\nThe Graceful Slick....\nIs non other than a...",1,0,0,0,0,0
30,341549388,"""\n\n Stupid? \n\nAs soon as I saw the phrase ...",1,0,0,0,0,0
32,345843351,"""\nBan one side of an argument by a bullshit n...",1,0,1,0,1,0


In [29]:
toxic['comment_text_lem']=toxic['comment_text'].map(clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [38]:
toxic_text=toxic['comment_text_lem'].values
toxic_text=[word for comment in toxic['comment_text_lem'].values for word in comment]
toxic_freq=Counter(toxic_text)
toxic_freq.most_common(20)

In [79]:
def ngrams(text,n):
    '''
    (list of words)->(list of ngrams)
    '''
    ngrams=[]
    for i in range(len(text)-n+1):
        ngrams.append('_'.join(text[i:i+n]))

    return ngrams

In [80]:
bigrams=ngrams(toxic_text,2)
freq_bigrams=Counter(bigrams)
freq_bigrams.most_common(30)

[('!_!', 25954),
 ('nigger_nigger', 2722),
 ('!_fuck', 2087),
 ("''_''", 2033),
 ("``_''", 1751),
 ('fuck_!', 1336),
 ('pig_pig', 1250),
 ('?_?', 1215),
 ('shit_shit', 1143),
 ('bark_bark', 999),
 ('hate_hate', 961),
 ('penis_penis', 939),
 ('ball_ball', 832),
 ('as_.', 770),
 ('go_fuck', 747),
 ('wanker_wanker', 734),
 ('._as', 687),
 ('._``', 626),
 ('die_fag', 625),
 ('!_go', 624),
 ('fag_die', 624),
 ('fuck_yourselfgo', 621),
 ('yourselfgo_fuck', 621),
 ('jew_fat', 613),
 ('fat_jew', 609),
 ('moron_hi', 560),
 ('hi_moron', 560),
 ('suck_suck', 509),
 ('._go', 479),
 ("._n't", 456)]

In [81]:
trigrams=ngrams(toxic_text,3)
freq_trigrams=Counter(trigrams)
freq_trigrams.most_common(30)

[('!_!_!', 22538),
 ('nigger_nigger_nigger', 2713),
 ('pig_pig_pig', 1248),
 ('shit_shit_shit', 1106),
 ('bark_bark_bark', 998),
 ('hate_hate_hate', 938),
 ('?_?_?', 857),
 ('penis_penis_penis', 832),
 ('ball_ball_ball', 831),
 ('wanker_wanker_wanker', 732),
 ('._as_.', 681),
 ('as_._as', 676),
 ('!_!_fuck', 663),
 ('die_fag_die', 624),
 ('fag_die_fag', 624),
 ('fuck_yourselfgo_fuck', 621),
 ('yourselfgo_fuck_yourselfgo', 619),
 ('fat_jew_fat', 609),
 ('jew_fat_jew', 608),
 ('!_go_fuck', 572),
 ('go_fuck_!', 559),
 ('hi_moron_hi', 559),
 ('moron_hi_moron', 558),
 ('fuck_!_go', 555),
 ('!_fuck_!', 513),
 ('fuck_!_fuck', 503),
 ('suck_suck_suck', 498),
 ('wiki_noobs_wiki', 454),
 ('noobs_wiki_noobs', 453),
 ('as_as_as', 448)]