In [15]:

def RemovePunc(comment):
    comment = comment.replace('^\d+|\n|\t|"|==|;|:|@', '')
    comment = comment.replace('\d+', '')
    comment = comment.replace(',', '')
    comment = comment.replace('?', '')
    comment = comment.replace('!', '')
    comment = comment.replace('&', '')
    comment = comment.replace('[...]', '')
    comment = comment.replace('[....]', '')
    for c in string.punctuation:
        comment = comment.replace(c, "")
    return (comment)

def RemoveHttp(comment):
    comment = re.sub(r'(((https|http)?://)|(www.))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F])|#)',
                     '', comment, flags=re.MULTILINE)
    return (comment)

def ConvertToLowerCase(comment):
    comment = comment.lower()
    return(comment)

def StemComment(comment,lowerCase,punc,http):
    if lowerCase:
        comment = ConvertToLowerCase(comment)
    if punc:
        comment = RemovePunc(comment)
    if http:
        comment = RemoveHttp(comment)
        
    comment=''.join([i for i in comment if not i.isdigit()])
    comment = nlp(comment)
    lemmatized = []
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            lemmatized.append(lemma)
    return " ".join(lemmatized)
    
def dataStem(Data,lowerCase,punc,http):
    Data_lemmatized = Data.apply(StemComment, lowerCase=True,punc=False,http=True)
    return Data_lemmatized

def saveInFile(x_train_lemmatized,fileName):
    filehandler = open(fileName+".p", "wb")
    pickle.dump(x_train_lemmatized, filehandler)
    filehandler.close()
    
def loadFromFile(fileName):
    file = open(fileName+".p", 'rb')
    object_file = pickle.load(file)
    file.close()
    return object_file

def sampling(data):
    clean = data.loc[np.random.choice(data[data['clean']==True].index.tolist(),size=9237)].reset_index(drop=True)
    severe_toxic = data.loc[np.random.choice(data[data['severe_toxic']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    obscene = data.loc[np.random.choice(data[data['obscene']==True].index.tolist(),size=2000,replace=True)].reset_index(drop=True)
    threat = data.loc[np.random.choice(data[data['threat']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    insult = data.loc[np.random.choice(data[data['insult']==True].index.tolist(),size=2300,replace=True)].reset_index(drop=True)
    identity_hate = data.loc[np.random.choice(data[data['identity_hate']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    toxic = data[data['toxic']==1].reset_index(drop=True)

    sampled_train = pd.concat([clean,threat,toxic,severe_toxic,insult,identity_hate,obscene]).reset_index(drop=True)
    sampled_train = sampled_train.sample(frac=1).reset_index(drop=True)
    sampled_train['clean'] = sampled_train['clean'].astype(int)
    return sampled_train

def Translate(data):
    translator = Translator()
    translations= translator.translate(data)
    return translations.text

def TranslateTheDataSet(data):
    for index, row in data.iterrows():
        rowString=row['comment_text']
        if(len(rowString)>5000):
            rowString=row['comment_text'][:5000]
        try:
            translate = Translator()
            lang = translate.detect(rowString).lang
        except ValueError:
            continue
        if (lang != 'en'):
            try:
                transComment = Translate(rowString)
                data.at[index, 'comment_text'] = transComment
            except ValueError:
                continue
    return data



# Apply Stemming on Data

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import string
import pickle
import re
import spacy
import en_core_web_sm


#nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
nlp = en_core_web_sm.load()

train = pd.read_csv('New_Data\\train.csv')
test = pd.read_csv('New_Data\\test.csv')
#print(train)
train.index = train['id']
x_train = train['comment_text']
y_train = train.iloc[:, 2:]
test.index = test['id']
x_test = test['comment_text']
y_train['clean'] = 1 - y_train.sum(axis=1) >= 1
x_train2 = x_train.dropna()
x_test2 = x_test.dropna()
print('start train lemmatize')
x_train_lemmatized=dataStem(x_train2,lowerCase=True,punc=False,http=True)
print(x_train_lemmatized)
print('start test lemmatize')
x_test_lemmatized=dataStem(x_test2,lowerCase=True,punc=False,http=True)
train2=pd.concat([x_train_lemmatized,y_train], axis=1)

saveInFile(train2,'train_TestwithPunc')
saveInFile(x_test_lemmatized,'test_TestwithPunc')
print('Lemmatizing and storing done!')

start train lemmatize
id
0000997932d777bf    explanation why the edit make under -PRON- use...
000103f0d9cfb60f    d'aww ! -PRON- match this background colour -P...
000113f07ec002fd    hey man , -PRON- be really not try to edit war...
0001b41b1c6bb37e    " more i can not make any real suggestion on i...
0001d958c54c6e35    -PRON- , sir , be -PRON- hero . any chance -PR...
00025465d4725e87    " congratulation from -PRON- as well , use the...
0002bcb3da6cb337    cocksucker before -PRON- pis around on -PRON- ...
00031b1e95af7921    -PRON- vandalism to the matt shirvington artic...
00037261f536c51d    sorry if the word ' nonsense ' be offensive to...
00040093b2687caa    alignment on this subject and which be contrar...
0005300084f90edc    " fair use rationale for image : wonju.jpg tha...
00054a5e18b50dd4    bbq be a man and let discus -PRON- - maybe ove...
0005c987bdfc9d4b    hey ... what be -PRON- .. @ | talk . what be -...
0006f16e4e9f292e    before -PRON- start throw accusation and warn

# NB with 50,000 features

In [3]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

train = pickle.load(open('train_Test.p','rb'))
test = pickle.load(open('test_Test.p','rb'))
sample = pd.read_csv('sample_submission.csv')
sampled_train=sampling(train)
feature_model = TfidfVectorizer(ngram_range=(1, 4),max_features=50000)
feature_matrix = feature_model.fit_transform(sampled_train['comment_text'])
print(feature_matrix)
test_x = feature_model.transform(test)
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    clf = MultinomialNB()
    clf.fit(feature_matrix, sampled_train[col])
    preds = clf.predict_proba(test_x)[:, 1]
    print(col)
    print(preds)
    print(sample.shape, preds.shape)
    sample[col] = preds

sample.to_csv("result0.csv", index=False)


  (2, 1208)	0.550168536241
  (2, 0)	0.575755963143
  (2, 1209)	0.604830267625
  (7, 507)	0.600881067348
  (7, 865)	0.527018888054
  (7, 511)	0.600993373122
  (9, 1208)	0.0807629536639
  (9, 0)	0.0845191048016
  (9, 1209)	0.0887871182393
  (9, 507)	0.261420587444
  (9, 865)	0.229285951573
  (9, 511)	0.261469447431
  (9, 1928)	0.173002750312
  (9, 927)	0.128358494823
  (9, 1134)	0.224917152186
  (9, 1944)	0.235741213389
  (9, 972)	0.123350035109
  (9, 526)	0.139420143773
  (9, 930)	0.128466808371
  (9, 577)	0.233344042482
  (9, 1139)	0.286499209397
  (9, 1945)	0.235741213389
  (9, 538)	0.137389506148
  (9, 973)	0.13488691332
  (9, 527)	0.139420143773
  :	:
  (46820, 1)	0.287994627975
  (46820, 1210)	0.287994627975
  (46820, 2)	0.287994627975
  (46820, 1211)	0.287994627975
  (46820, 120)	0.340715236578
  (46820, 8)	0.341352511923
  (46821, 1208)	0.17999180935
  (46821, 507)	0.19420440685
  (46821, 865)	0.170332194032
  (46821, 511)	0.19424070401
  (46821, 972)	0.274903219799
  (46821, 538

# Title