In [75]:
import pandas as pd 
import numpy as np 
import os 
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,log_loss
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import gensim
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
from keras.optimizers import Adam
import math
from sklearn.pipeline import Pipeline

# Route log records (e.g. gensim's training progress) to the notebook output,
# formatted as "<timestamp> : <level> : <message>".
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# The root logger's level is also set explicitly to INFO, in addition to the
# level passed to basicConfig above.
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)

In [35]:
class CleanStringSpace(object):
    """Transformer that normalises raw text into lowercase alphanumerics separated by single spaces.

    Exposes ``fit`` / ``transform`` / ``fit_transform`` so it can be used as a
    pipeline step. The transformer is stateless: ``fit`` learns nothing.
    """

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """No-op fit; returns ``self``."""
        return self

    @staticmethod
    def _clean(s):
        """Return the cleaned form of a single string."""
        # Newlines and both apostrophe variants become word separators.
        s = s.replace(u'\n', u' ')
        s = s.replace(u'’', u' ')
        s = s.replace(u"'", u' ')
        # Strip accents: decompose to NFD, then drop the combining marks ('Mn').
        s = ''.join((c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c) != 'Mn'))
        s = s.replace('.', ' ')
        # Keep only alphanumerics and plain spaces, then lowercase.
        s = ''.join(e for e in s if (e.isalnum() or e == " ")).lower()
        # Collapse runs of spaces; a single leading/trailing space is preserved.
        while '  ' in s:
            s = s.replace('  ', ' ')
        return s

    def transform(self,file_paths,y=None,file=False):
        """Clean every item of ``file_paths``.

        Parameters
        ----------
        file_paths : iterable of str
            Raw strings, or paths of UTF-8 text files when ``file`` is true.
        y : ignored
        file : bool, default False
            When true, each item is opened as a file and its content is cleaned;
            otherwise the item itself is cleaned.

        Returns
        -------
        list of str
            One cleaned string per input item, in input order.
        """
        clean_strings=[]

        for path in file_paths:
            if file:
                # Context manager so the handle is closed even if read() raises.
                with open(path,encoding="utf8") as f:
                    s=f.read()
            else:
                s=path
            clean_strings.append(self._clean(s))

        return clean_strings

    def fit_transform(self,file_paths,y=None,file=False):
        """Equivalent to ``transform``; ``file`` is forwarded (defaults to False)."""
        return self.transform(file_paths,file=file)

In [3]:
class TokenizeDocs(object):
    """Stateless whitespace tokenizer with a fit/transform interface."""

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self,string_list,y=None):
        """Split each string on whitespace and return one token list per string."""
        tokens_list=[]
        for document in string_list:
            tokens_list.append(document.split())
        return tokens_list

    def fit_transform(self,string_list,y=None):
        """Same result as ``transform``."""
        return self.transform(string_list)
    

In [36]:
def Tagged_Document_Gen(file_list=None,con=None,chunk=1000,content='content'):
    """Yield gensim ``TaggedDocument`` objects built from cleaned, tokenized text.

    The source of the text depends on the arguments:

    * ``con is not None``: ``file_list`` holds file paths; the matching ``text``
      values are read from the ``plaintext`` table in batches of ``chunk`` paths.
      Tags are a running counter over the returned rows.
    * ``content == 'filename'``: ``file_list`` is iterated with ``iterrows()``;
      ``row[0]`` of each row is cleaned as a string and tagged with the row index.
    * ``content == 'content'``: ``file_list`` is an iterable of strings; tags are
      a running counter starting at 0.

    Any other ``content`` value (with ``con`` unset) yields nothing.
    """
    cleaner=CleanStringSpace()
    tokenizer=TokenizeDocs()

    def _tagged(text,tag):
        # Clean, tokenize and wrap one document.
        cleaned=cleaner.transform(file_paths=[text],file=False)[0]
        tokens=tokenizer.transform([cleaned])[0]
        return gensim.models.doc2vec.TaggedDocument(tokens,[tag])

    if con is not None:
        idx=0
        file_list=np.array(file_list)
        # NOTE(review): the chunk count is divided by 2, so only roughly the first
        # half of file_list is ever queried - confirm this subsampling is intended.
        chunks=int(math.ceil(float(len(file_list))/float(chunk))/2)
        for i in range(0,chunks):
            paths=file_list[i*chunk:(i+1)*chunk]
            # Double embedded single quotes so a path cannot terminate the SQL
            # literal. Prefer driver-level parameter binding once the
            # connection's paramstyle is known.
            quoted=','.join("'"+str(p).replace("'","''")+"'" for p in paths)
            doc_texts= pd.read_sql("""SELECT text FROM plaintext WHERE bucket='epo-history-documents' AND filepath IN(%s)"""%quoted,con=con)['text'].tolist()

            # NOTE(review): the query has no ORDER BY, so tags follow the order
            # the database returns rows, not necessarily the order of file_list.
            for text in doc_texts:
                yield _tagged(text,idx)
                idx+=1

    elif content=='filename':
        for idx,row in file_list.iterrows():
            yield _tagged(row[0],idx)

    elif content=='content':
        for count,text in enumerate(file_list):
            yield _tagged(text,count)
        
        

In [7]:
class VectorizeDoc(object):
    """Transformer that turns token lists into document vectors via ``doc_model.infer_vector``."""

    def __init__(self,doc_model):
        # Any object exposing infer_vector(tokens); stored as-is.
        self.doc_model=doc_model

    def fit(self,X,y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self,tokens,y=None):
        """Infer one vector per token list and stack them row-wise.

        Parameters
        ----------
        tokens : iterable of sequences of str
            One token sequence per document.
        y : ignored

        Returns
        -------
        numpy.ndarray or None
            2-D array with one row per document, or ``None`` when ``tokens``
            is empty (unchanged from the previous behaviour).
        """
        # Collect the rows first and stack once: stacking inside the loop
        # re-copies the whole matrix on every iteration (quadratic).
        rows=[np.reshape(self.doc_model.infer_vector(np.array(row)),(1,-1)) for row in tokens]
        if not rows:
            return None
        return np.vstack(rows)

    def fit_transform(self,tokens,y=None):
        """Same result as ``transform``."""
        return self.transform(tokens)

In [8]:
# Load the training set (UTF-8 CSV, path relative to the notebook) into a DataFrame.
data=pd.read_csv("train/train.csv",encoding='utf8')

In [11]:
# Preview the first rows: id, comment_text and the six binary label columns.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [21]:
# Doc2Vec model over every training comment; the vocabulary is built from one
# pass of the cleaned/tokenized comments.
# NOTE(review): alpha and min_alpha are both 0.025, i.e. the start and end
# learning rates are identical - confirm a constant rate is intended.
doc_model_corpus=gensim.models.doc2vec.Doc2Vec(
    size=100,
    window=5,
    min_count=5,
    alpha=0.025,
    min_alpha=0.025,
)
doc_model_corpus.build_vocab(
    Tagged_Document_Gen(file_list=data['comment_text'].tolist(),content='content')
)

2018-01-22 15:23:46,793 : INFO : collecting all words and their counts
2018-01-22 15:23:46,795 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-01-22 15:23:48,555 : INFO : PROGRESS: at example #10000, processed 689893 words (392565/s), 40037 word types, 10000 tags
2018-01-22 15:23:50,298 : INFO : PROGRESS: at example #20000, processed 1367442 words (388870/s), 61467 word types, 20000 tags
2018-01-22 15:23:51,990 : INFO : PROGRESS: at example #30000, processed 2032727 words (393438/s), 79072 word types, 30000 tags
2018-01-22 15:23:53,757 : INFO : PROGRESS: at example #40000, processed 2722817 words (390872/s), 95257 word types, 40000 tags
2018-01-22 15:23:55,505 : INFO : PROGRESS: at example #50000, processed 3392655 words (383512/s), 109552 word types, 50000 tags
2018-01-22 15:23:57,354 : INFO : PROGRESS: at example #60000, processed 4095745 words (380487/s), 123158 word types, 60000 tags
2018-01-22 15:23:59,094 : INFO : PROGRESS: at example #70000, 

In [28]:
# Two training passes. A fresh generator is created for each pass because a
# generator can only be consumed once; the model is saved after every pass.
for epoch in range(2):
    doc_model_corpus.train(
        Tagged_Document_Gen(file_list=data['comment_text'].tolist(),content='content'),
        total_examples=doc_model_corpus.corpus_count,
        epochs=1,
    )
    doc_model_corpus.save("doc2vec_model_whole_corpus.model")

2018-01-22 15:29:46,555 : INFO : training model with 3 workers on 44260 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-01-22 15:29:47,587 : INFO : PROGRESS: at 1.83% examples, 144473 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:29:48,620 : INFO : PROGRESS: at 4.18% examples, 168410 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:29:49,633 : INFO : PROGRESS: at 6.49% examples, 174905 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:29:50,663 : INFO : PROGRESS: at 8.79% examples, 178009 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:29:51,686 : INFO : PROGRESS: at 11.03% examples, 177426 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:29:52,706 : INFO : PROGRESS: at 13.41% examples, 178263 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:29:53,727 : INFO : PROGRESS: at 15.86% examples, 179877 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:29:54,778 : INFO : PROGRESS: at 18.27% examples, 180431 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:29:55,787 : INFO

2018-01-22 15:30:59,616 : INFO : PROGRESS: at 60.34% examples, 177400 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:31:00,639 : INFO : PROGRESS: at 62.78% examples, 177828 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:31:01,662 : INFO : PROGRESS: at 65.11% examples, 178189 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:31:02,669 : INFO : PROGRESS: at 67.14% examples, 177897 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:31:03,676 : INFO : PROGRESS: at 69.38% examples, 178072 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:31:04,717 : INFO : PROGRESS: at 71.67% examples, 177864 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:31:05,719 : INFO : PROGRESS: at 74.11% examples, 178272 words/s, in_qsize 5, out_qsize 0
2018-01-22 15:31:06,775 : INFO : PROGRESS: at 76.49% examples, 178407 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:31:07,786 : INFO : PROGRESS: at 78.94% examples, 178777 words/s, in_qsize 6, out_qsize 0
2018-01-22 15:31:08,828 : INFO : PROGRESS: at 81.39% examples, 179123 wor

In [29]:
# Sanity check of the learned word vectors: nearest neighbours of "car".
doc_model_corpus.wv.most_similar("car")

2018-01-22 15:31:28,483 : INFO : precomputing L2-norms of word weight vectors


[('camera', 0.6153793931007385),
 ('battery', 0.5974810123443604),
 ('store', 0.5901130437850952),
 ('cars', 0.5810708999633789),
 ('terminal', 0.5806562900543213),
 ('shop', 0.5793058276176453),
 ('plane', 0.5783781409263611),
 ('studio', 0.5726466178894043),
 ('cd', 0.5711696147918701),
 ('pilot', 0.5681911706924438)]

In [37]:
# Feature pipeline: raw comment -> cleaned string -> tokens -> inferred
# Doc2Vec vector -> standardised features.
pipeline=Pipeline(steps=[
    ('clean_string',CleanStringSpace()),
    ('tokenize',TokenizeDocs()),
    ('vectorize',VectorizeDoc(doc_model_corpus)),
    ('scale',StandardScaler()),
])

In [39]:
# 80/20 split of raw comments against the six label columns. random_state is
# fixed so the split (and every metric computed from it) is reproducible
# across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(
    np.array(data['comment_text']),
    np.array(data[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]),
    test_size=0.2,
    random_state=42,
)


# X_train / X_test are overwritten: raw text in, pipeline feature matrices out.
# The scaler is fitted on the training split only and re-used for the test split.
X_train=pipeline.fit_transform(X_train)
X_test=pipeline.transform(X_test)

In [40]:
# Check the dimensions of the training feature matrix.
print(X_train.shape)

(127656, 100)


In [41]:
# Cache the feature matrices on disk. Context managers make sure each file is
# flushed and closed (the bare open() handles were never closed).
with open("X_train.p","wb") as fh:
    pickle.dump(X_train,fh)
with open("X_test.p","wb") as fh:
    pickle.dump(X_test,fh)

In [78]:
# Feed-forward network on the 100-d document vectors: three ReLU layers
# (64/32/16), each followed by batch normalisation, and six independent sigmoid
# outputs (one per label) trained with binary cross-entropy.
nn=Sequential([
    Dense(64,activation='relu',input_shape=(100,)),
    BatchNormalization(),
    Dense(32,activation='relu'),
    BatchNormalization(),
    Dense(16,activation='relu'),
    BatchNormalization(),
    Dense(6,activation='sigmoid'),
])
optimizer=Adam()
nn.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])

In [79]:
# Train for 10 epochs on the training split (no batch_size or validation data passed).
nn.fit(X_train,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f528907400>

In [80]:
# Per-label precision/recall at a 0.5 threshold.
# NOTE(review): this is computed on the training split (X_train / y_train);
# the held-out X_test / y_test from the split above are not evaluated here.
y_pred=nn.predict(X_train)
print(classification_report(y_train,(y_pred>0.5).astype(int)))

             precision    recall  f1-score   support

          0       0.84      0.35      0.49     12246
          1       0.38      0.09      0.14      1275
          2       0.81      0.31      0.45      6742
          3       0.08      0.02      0.03       381
          4       0.72      0.25      0.37      6269
          5       0.26      0.05      0.09      1120

avg / total       0.75      0.29      0.42     28033



In [None]:
def clean_string_with_space(s):
    """Normalise one string to lowercase alphanumerics separated by single spaces.

    Newlines, apostrophes (straight and curly) and periods become spaces,
    accents are stripped, every other non-alphanumeric character is dropped,
    and runs of spaces are collapsed.
    """
    # Characters that act as word separators before accent stripping.
    for separator in (u'\n', u'’', u"'"):
        s = s.replace(separator, u' ')

    # NFD decomposition exposes accents as combining marks ('Mn'), which are dropped.
    decomposed = unicodedata.normalize('NFD',s)
    s = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    s = s.replace('.', ' ')
    kept = [ch for ch in s if ch == " " or ch.isalnum()]
    s = ''.join(kept).lower()

    while '  ' in s:
        s = s.replace('  ', ' ')
    return s

In [None]:
# Add a cleaned copy of every comment as a new column.
data['comment_text_clean']=data['comment_text'].apply(clean_string_with_space)

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Cache the frame (including the cleaned text column) on disk; the context
# manager closes the file, which the bare open() call never did.
with open("data.p","wb") as fh:
    pickle.dump(data,fh)

In [None]:
# TF-IDF bag of words over raw strings: terms must occur in at least 5
# documents, and the vocabulary is capped at 20000 features.
vectorizer=TfidfVectorizer(input='content',min_df=5,max_features=20000)
# Alternative kept for reference: plain term counts instead of TF-IDF.
#vectorizer=CountVectorizer(input='content',min_df=1,max_features=20000)

In [None]:
# 80/20 split of the cleaned comments against the six label columns; this
# rebinds X_train / X_test / y_train / y_test from the Doc2Vec experiment.
# random_state is fixed so the split is reproducible across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(
    np.array(data['comment_text_clean']),
    np.array(data[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the vectorizer on the training text only, then scale the features.
# X_train is rebound from text to the vectorized matrix.
X_train=vectorizer.fit_transform(X_train)
# with_mean=False: scale without centring - presumably because the vectorizer
# output is sparse and centring would densify it.
scaler=StandardScaler(with_mean=False)
X_train=scaler.fit_transform(X_train)

In [None]:
print(type(X_train))
# toarray() yields a plain ndarray; todense() returns the legacy numpy.matrix
# type. NOTE(review): a dense rows x 20000 matrix is very large - confirm it
# fits in memory or feed the network in batches.
X_train=X_train.toarray()

In [None]:
# Shallow network on the 20000 TF-IDF features with six independent sigmoid
# outputs (one per label). This rebinds `nn` and `optimizer` from the Doc2Vec model.
nn=Sequential()
nn.add(Dense(16,activation='relu',input_shape=(20000,)))
nn.add(Dense(6,activation='sigmoid'))
optimizer=Adam()
# binary_accuracy scores each label independently; categorical_accuracy
# compares argmax positions, which is meaningless for multi-label targets
# (most rows here have no positive label at all).
nn.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['binary_accuracy'])

In [None]:
# Single training epoch on the dense TF-IDF features.
nn.fit(X_train,y_train,epochs=1)

In [None]:
# Training-split report at a 0.5 decision threshold (the held-out split is
# evaluated in the next cell).
y_pred=nn.predict(X_train)
print(classification_report(y_train,(y_pred>0.5).astype(int)))

In [None]:
# Held-out evaluation: apply the already-fitted vectorizer and scaler
# (transform only, no refitting), densify, predict and report at a 0.5 threshold.
X_test=vectorizer.transform(X_test)
X_test=scaler.transform(X_test)
# toarray() gives a plain ndarray instead of the legacy numpy.matrix from todense().
X_test=X_test.toarray()
y_pred=nn.predict(X_test)

print(classification_report(y_test,(y_pred>0.5).astype(int)))

In [None]:
# Load the separate test set (UTF-8 CSV, path relative to the notebook).
data_test=pd.read_csv("test/test.csv",encoding='utf8')

In [None]:
# Same cleaning as applied to the training comments.
data_test['comment_text_clean']=data_test['comment_text'].apply(clean_string_with_space)

In [None]:
# Preview the first rows of the test frame, including the new cleaned column.
data_test.head()

In [None]:
# Vectorize the test comments with the vocabulary fitted on the training split.
# This rebinds X_test (previously the held-out part of the training data).
X_test=vectorizer.transform(np.array(data_test['comment_text_clean']))

In [None]:
# Re-use the scaler fitted on the training features (transform only).
X_test=scaler.transform(X_test)

In [None]:
# The original line was missing its closing parenthesis (SyntaxError) and
# called `svm`, which is not defined anywhere in this notebook. The fitted
# network `nn` is used instead; its sigmoid outputs are probabilities, which
# is also what the log-loss cell below expects in y_pred.
# NOTE(review): confirm `nn` is the intended model, and that test.csv carries
# the six label columns read below.
y_pred=nn.predict(X_test.toarray())

# classification_report needs hard labels, so threshold the probabilities at 0.5.
print(classification_report(np.array(data_test[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]),(y_pred>0.5).astype(int)))

In [None]:
# Log loss per label: column k of y_pred is scored against the k-th label column.
loss_1,loss_2,loss_3,loss_4,loss_5,loss_6=[
    log_loss(np.array(data_test[[label]]),y_pred[:,position])
    for position,label in enumerate(
        ['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
]

In [None]:
# Unweighted mean of the six per-label log losses.
print(np.average([loss_1,loss_2,loss_3,loss_4,loss_5,loss_6]))