In [4]:
def RemovePunc(comment):
    comment = comment.replace('^\d+|\n|\t|"|==|;|:|@', '')
    comment = comment.replace('\d+', '')
    comment = comment.replace(',', '')
    comment = comment.replace('?', '')
    comment= comment.replace('!', '')
    comment = comment.replace('&', '')
    comment = comment.replace('[...]', '')
    comment = comment.replace('[....]', '')
    for c in string.punctuation:
        comment = comment.replace(c, "")
    return (comment)

def RemoveHttp(comment):
    comment = re.sub(r'(((https|http)?://)|(www.))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F])|#)',
                     '', comment, flags=re.MULTILINE)
    return (comment)

def ConvertToLowerCase(comment):
    comment = comment.lower()
    return(comment)

def StemComment(comment,lowerCase,punc,http):
    if lowerCase:
        comment = ConvertToLowerCase(comment)
    if punc:
        comment = RemovePunc(comment)
    if http:
        comment = RemoveHttp(comment)
        
    comment=''.join([i for i in comment if not i.isdigit()])
    comment = nlp(comment)
    lemmatized = []
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            lemmatized.append(lemma)
    return " ".join(lemmatized)
    
def dataStem(Data,lowerCase,punc,http):
    Data_lemmatized = Data.apply(StemComment, lowerCase=True,punc=True,http=True)
    return Data_lemmatized

def saveInFile(x_train_lemmatized,fileName):
    filehandler = open(fileName+".p", "wb")
    pickle.dump(x_train_lemmatized, filehandler)
    filehandler.close()
    
def loadFromFile(fileName):
    file = open(fileName+".p", 'rb')
    object_file = pickle.load(file)
    file.close()
    return object_file

def sampling(data):
    clean = data.loc[np.random.choice(data[data['clean']==True].index.tolist(),size=9237)].reset_index(drop=True)
    severe_toxic = data.loc[np.random.choice(data[data['severe_toxic']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    obscene = data.loc[np.random.choice(data[data['obscene']==True].index.tolist(),size=2000,replace=True)].reset_index(drop=True)
    threat = data.loc[np.random.choice(data[data['threat']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    insult = data.loc[np.random.choice(data[data['insult']==True].index.tolist(),size=2300,replace=True)].reset_index(drop=True)
    identity_hate = data.loc[np.random.choice(data[data['identity_hate']==True].index.tolist(),size=6000,replace=True)].reset_index(drop=True)
    toxic = data[data['toxic']==1].reset_index(drop=True)

    sampled_train = pd.concat([clean,threat,toxic,severe_toxic,insult,identity_hate,obscene]).reset_index(drop=True)
    sampled_train = sampled_train.sample(frac=1).reset_index(drop=True)
    sampled_train['clean'] = sampled_train['clean'].astype(int)
    return sampled_train



# Apply Stemming on Data

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import string
import pickle
import re
import spacy


nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])

train = pd.read_csv('New_Data\\train.csv')
test = pd.read_csv('New_Data\\test.csv')
train.index = train['id']
x_train = train['comment_text']
y_train = train.iloc[:, 2:]
test.index = test['id']
x_test = test['comment_text']
y_train['clean'] = 1 - y_train.sum(axis=1) >= 1
x_train2 = x_train.dropna()
x_test2 = x_test.dropna()
print('start train lemmatize')
x_train_lemmatized=dataStem(x_train2,lowerCase=True,punc=True,http=True)
print('start test lemmatize')
x_test_lemmatized=dataStem(x_test2,lowerCase=True,punc=True,http=True)
train2=pd.concat([x_train_lemmatized,y_train], axis=1)
saveInFile(train2,'train_Test')
saveInFile(x_test_lemmatized,'test_Test')
print('Lemmatizing and storing done!')



    Only loading the 'en' tokenizer.

start train lemmatize
start test lemmatize
Lemmatizing and storing done!


In [6]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import pickle
import pandas as pd

train = pickle.load(open('train_Test.p','rb'))
test = pickle.load(open('test_Test.p','rb'))
sample = pd.read_csv('sample_submission.csv')
sampled_train=sampling(train)
feature_model = TfidfVectorizer(ngram_range=(1, 4),max_features=50000)
feature_matrix = feature_model.fit_transform(sampled_train['comment_text'])
test_x = feature_model.transform(test)
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    clf = SVC(probability=True)
    clf.fit(feature_matrix, sampled_train[col])
    preds = clf.predict_proba(test_x)[:, 1]
    print(col)
    print(preds)
    print(sample.shape, preds.shape)
    sample[col] = preds

sample.to_csv("result1_50000Feature.csv", index=False)


toxic
[ 0.84342352  0.83609313  0.83609313 ...,  0.83609313  0.72858266
  0.81115475]
(153164, 7) (153164,)
severe_toxic
[ 0.2507657   0.26309995  0.26309995 ...,  0.26309995  0.24753664
  0.25168878]
(153164, 7) (153164,)
obscene
[ 0.54890267  0.56397065  0.56397065 ...,  0.56397065  0.48287155
  0.56255065]
(153164, 7) (153164,)
threat
[ 0.15623304  0.1626309   0.1626309  ...,  0.1626309   0.1626717
  0.17910228]
(153164, 7) (153164,)
insult
[ 0.53391276  0.546838    0.546838   ...,  0.546838    0.51111807
  0.5462755 ]
(153164, 7) (153164,)
identity_hate
[ 0.25144162  0.27896613  0.27896613 ...,  0.27896613  0.20833266
  0.21976363]
(153164, 7) (153164,)


# Title