In [4]:

def RemovePunc(comment):
    comment = comment.replace('^\d+|\n|\t|"|==|;|:|@', '')
    comment = comment.replace('\d+', '')
    comment = comment.replace(',', '')
    comment = comment.replace('?', '')
    comment= comment.replace('!', '')
    comment = comment.replace('&', '')
    comment = comment.replace('[...]', '')
    comment = comment.replace('[....]', '')
    for c in string.punctuation:
        comment = comment.replace(c, "")
    return (comment)

def RemoveHttp(comment):
    comment = re.sub(r'(((https|http)?://)|(www.))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F])|#)',
                     '', comment, flags=re.MULTILINE)
    return (comment)

def ConvertToLowerCase(comment):
    comment = comment.lower()
    return(comment)

def StemComment(comment,lowerCase,punc,http):
    if lowerCase:
        comment = ConvertToLowerCase(comment)
    if punc:
        comment = RemovePunc(comment)
    if http:
        comment = RemoveHttp(comment)
        
    comment=''.join([i for i in comment if not i.isdigit()])
    comment = nlp(comment)
    lemmatized = []
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            lemmatized.append(lemma)
    return " ".join(lemmatized)
    
def dataStem(Data,lowerCase,punc,http):
    Data_lemmatized = Data.apply(StemComment, lowerCase=True,punc=False,http=True)
    return Data_lemmatized

def saveInFile(x_train_lemmatized,fileName):
    filehandler = open(fileName+".p", "wb")
    pickle.dump(x_train_lemmatized, filehandler)
    filehandler.close()
    
def loadFromFile(fileName):
    file = open(fileName+".p", 'rb')
    object_file = pickle.load(file)
    file.close()
    return object_file

def sampling(data):
    clean = data.loc[np.random.choice(data[data['clean']==True].index.tolist(),size=9237)].reset_index(drop=True)
    severe_toxic = data.loc[np.random.choice(data[data['severe_toxic']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    obscene = data.loc[np.random.choice(data[data['obscene']==True].index.tolist(),size=2000,replace=True)].reset_index(drop=True)
    threat = data.loc[np.random.choice(data[data['threat']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    insult = data.loc[np.random.choice(data[data['insult']==True].index.tolist(),size=2300,replace=True)].reset_index(drop=True)
    identity_hate = data.loc[np.random.choice(data[data['identity_hate']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    toxic = data[data['toxic']==1].reset_index(drop=True)

    sampled_train = pd.concat([clean,threat,toxic,severe_toxic,insult,identity_hate,obscene]).reset_index(drop=True)
    sampled_train = sampled_train.sample(frac=1).reset_index(drop=True)
    sampled_train['clean'] = sampled_train['clean'].astype(int)
    return sampled_train

def Translate(data):
    translator = Translator()
    translations= translator.translate(data)
    return translations.text

def TranslateTheDataSet(data):
    for index, row in data.iterrows():
        rowString=row['comment_text']
        if(len(rowString)>5000):
            rowString=row['comment_text'][:5000]
        try:
            translate = Translator()
            lang = translate.detect(rowString).lang
        except ValueError:
            continue
        if (lang != 'en'):
            try:
                transComment = Translate(rowString)
                data.at[index, 'comment_text'] = transComment
            except ValueError:
                continue
    return data



# Apply Stemming on Data

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import string
import pickle
import re
import spacy
import en_core_web_sm


#nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
nlp = en_core_web_sm.load()

train = pd.read_csv('New_Data\\train.csv')
test = pd.read_csv('New_Data\\test.csv')
#print(train)
train.index = train['id']
x_train = train['comment_text']
y_train = train.iloc[:, 2:]
test.index = test['id']
x_test = test['comment_text']
y_train['clean'] = 1 - y_train.sum(axis=1) >= 1
x_train2 = x_train.dropna()
x_test2 = x_test.dropna()
print('start train lemmatize')
x_train_lemmatized=dataStem(x_train2,lowerCase=True,punc=False,http=True)
print(x_train_lemmatized)
print('start test lemmatize')
x_test_lemmatized=dataStem(x_test2,lowerCase=True,punc=False,http=True)
train2=pd.concat([x_train_lemmatized,y_train], axis=1)

saveInFile(train2,'train_TestwithPunc')
saveInFile(x_test_lemmatized,'test_TestwithPunc')
print('Lemmatizing and storing done!')

start train lemmatize
id
0000997932d777bf    explanation why the edit make under -PRON- use...
000103f0d9cfb60f    d'aww ! -PRON- match this background colour -P...
000113f07ec002fd    hey man , -PRON- be really not try to edit war...
0001b41b1c6bb37e    " more i can not make any real suggestion on i...
0001d958c54c6e35    -PRON- , sir , be -PRON- hero . any chance -PR...
00025465d4725e87    " congratulation from -PRON- as well , use the...
0002bcb3da6cb337    cocksucker before -PRON- pis around on -PRON- ...
00031b1e95af7921    -PRON- vandalism to the matt shirvington artic...
00037261f536c51d    sorry if the word ' nonsense ' be offensive to...
00040093b2687caa    alignment on this subject and which be contrar...
0005300084f90edc    " fair use rationale for image : wonju.jpg tha...
00054a5e18b50dd4    bbq be a man and let discus -PRON- - maybe ove...
0005c987bdfc9d4b    hey ... what be -PRON- .. @ | talk . what be -...
0006f16e4e9f292e    before -PRON- start throw accusation and warn

# NB with 50,000 features

In [82]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import pandas as pd


train = pickle.load(open('train_TestwithPunc.p','rb'))

test = pickle.load(open('test_TestwithPunc.p','rb'))
SentDict = pickle.load(open('SentimentFullDict.p','rb'))
sample = pd.read_csv('sample_submission.csv')
sampled_train = sampling(train)
print(sampled_train.shape)
feature_model = TfidfVectorizer(ngram_range=(1, 4),max_features=50000)
feature_matrix = feature_model.fit_transform(sampled_train['comment_text'])
#Array=np.array(SentDict.values(), dtype=float)
#print(pd.DataFrame(feature_matrix.toarray(), columns=feature_model.get_feature_names()))
#print(len(SentDict))
array = np.array(list(SentDict.values()), dtype=float)

#array=repr(array)
print(array.shape)
#print(feature_matrix)

#46831
#feature_model.get_feature_names

(46831, 8)
(158966,)
  (0, 47239)	0.0999980606121
  (0, 41554)	0.0600244417755
  (0, 11945)	0.072484668864
  (0, 30700)	0.116039726224
  (0, 46764)	0.124948537873
  (0, 3809)	0.114790414299
  (0, 40684)	0.116959544532
  (0, 29063)	0.104315623039
  (0, 16777)	0.0997299053247
  (0, 25683)	0.111336221244
  (0, 43909)	0.125387262626
  (0, 44036)	0.0622178908049
  (0, 39456)	0.19514026378
  (0, 48825)	0.0934319363988
  (0, 21897)	0.274733178873
  (0, 47382)	0.162879280596
  (0, 12318)	0.120714815712
  (0, 35189)	0.159493023452
  (0, 3858)	0.17614251014
  (0, 34836)	0.159493023452
  (0, 40703)	0.147225036155
  (0, 29125)	0.291393711466
  (0, 16888)	0.271838355751
  (0, 25814)	0.283969841919
  (0, 43996)	0.208308872751
  :	:
  (46830, 34063)	0.107060487562
  (46830, 18904)	0.108605392935
  (46830, 24854)	0.108605392935
  (46830, 18893)	0.108605392935
  (46830, 14886)	0.108605392935
  (46830, 19389)	0.108605392935
  (46830, 16826)	0.108605392935
  (46830, 11478)	0.108605392935
  (46830, 45951)

In [84]:
import numpy
print(train.shape[0])
SentA = numpy.zeros(train.shape[0])
i=0
for comm in train['comment_text']:
    sentVal=SentDict.get(comm)
    ##print(comm,sentVal)
    SentA[i]=sentVal
    i+=1
    
print(SentA)

    

159571
[ 0.3182 -0.25   -0.5994 ..., -0.296   0.3612 -0.3723]


In [91]:
# from scipy.sparse import hstack
# feature_matrix=hstack((feature_matrix,SentA[:,None])).A
# print(feature_matrix)

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import *#classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pickle
from sklearn.linear_model import LogisticRegression
import pandas as pd

train = pickle.load(open('train_Test.p','rb'))
test = pickle.load(open('test_Test.p','rb'))
sample = pd.read_csv('sample_submission.csv')
#sampled_train=sampling(train)
SentA = SentA.reshape(-1, 1)
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    for ii in range(5):
        X_train, X_test, y_train, y_test = train_test_split(SentA, train[col], test_size=0.33)
        clf = LogisticRegression() #SVC(probability=True)
        clf.fit(X_train, y_train)
        preds = clf.predict_proba(X_test)[:, 1]
        print(col+'-----------------------')
        print(classification_report(y_test,preds.round()))
        #print(preds)
        #print(sample.shape, preds.shape)
        #sample[col] = preds

toxic-----------------------
             precision    recall  f1-score   support

          0       0.90      1.00      0.95     47613
          1       0.95      0.00      0.01      5046

avg / total       0.91      0.90      0.86     52659

toxic-----------------------
             precision    recall  f1-score   support

          0       0.90      1.00      0.95     47513
          1       1.00      0.00      0.01      5146

avg / total       0.91      0.90      0.86     52659

toxic-----------------------
             precision    recall  f1-score   support

          0       0.90      1.00      0.95     47518
          1       0.00      0.00      0.00      5141

avg / total       0.81      0.90      0.86     52659



  'precision', 'predicted', average, warn_for)


toxic-----------------------
             precision    recall  f1-score   support

          0       0.91      1.00      0.95     47664
          1       0.95      0.01      0.02      4995

avg / total       0.91      0.91      0.86     52659

toxic-----------------------
             precision    recall  f1-score   support

          0       0.90      1.00      0.95     47613
          1       0.00      0.00      0.00      5046

avg / total       0.82      0.90      0.86     52659

severe_toxic-----------------------
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     52134
          1       0.00      0.00      0.00       525

avg / total       0.98      0.99      0.99     52659

severe_toxic-----------------------
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     52105
          1       0.00      0.00      0.00       554

avg / total       0.98      0.99      0.98     52659

severe_tox

In [46]:
print(feature_matrix)
test_x = feature_model.transform(test)
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    clf = MultinomialNB()
    clf.fit(feature_matrix, sampled_train[col])
    preds = clf.predict_proba(test_x)[:, 1]
    print(col)
    print(preds)
    print(sample.shape, preds.shape)
    sample[col] = preds

sample.to_csv("result0.csv", index=False)

  (0, 22366)	0.0585267957048
  (0, 27861)	0.0629050240734
  (0, 41728)	0.047509081812
  (0, 4054)	0.0727650138102
  (0, 8131)	0.0485107724004
  (0, 43900)	0.0825198102868
  (0, 7880)	0.118759581166
  (0, 4566)	0.0623801799844
  (0, 43296)	0.0314898005325
  (0, 15131)	0.0227616167841
  (0, 36416)	0.041783127189
  (0, 39375)	0.0697720391952
  (0, 48121)	0.0275515675144
  (0, 48342)	0.0716111324478
  (0, 37194)	0.0464245998321
  (0, 49202)	0.0813468946057
  (0, 1394)	0.0486491487046
  (0, 30972)	0.0716086107355
  (0, 7599)	0.0325082548046
  (0, 20995)	0.0661476951375
  (0, 3700)	0.0366042267867
  (0, 43430)	0.0359262775712
  (0, 8637)	0.0265954916726
  (0, 8016)	0.0506357942231
  (0, 27161)	0.0184526987108
  :	:
  (46830, 19609)	0.160442090772
  (46830, 45123)	0.171601266115
  (46830, 39800)	0.135989581948
  (46830, 11107)	0.171601266115
  (46830, 6727)	0.169808931838
  (46830, 46633)	0.171601266115
  (46830, 3084)	0.171601266115
  (46830, 45952)	0.168166139768
  (46830, 31612)	0.16593332

KeyboardInterrupt: 

# Title