In [None]:
# Directory holding the dataset CSV files.  Kept in one place so that the
# location only has to be changed once when the notebook is run elsewhere.
DATA_DIR = '/content/drive/My Drive/Colab_data/Data_Mining/Data-Mining-Project/dataset'

# Full paths of the training and test splits.
path_train = f'{DATA_DIR}/train.csv'
path_test = f'{DATA_DIR}/test.csv'

In [None]:
import pandas as pd

In [None]:
# Label columns of the training frame.  One classifier is trained per entry,
# and predictions are stored in this order.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
# Read the training and test CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

In [None]:
# All raw comment texts, training rows first and test rows after them.
vocab = pd.concat([frame['comment_text'] for frame in (train, test)])

In [None]:
# Maps fullwidth ("ａ", "Ａ", "０", "！", ...) characters to ASCII replacements.
#
# A fullwidth character sits exactly 0xFEE0 code points above its ASCII
# counterpart, so the table is generated from the ASCII characters instead of
# being typed out.  Letters (either case) map to the lowercase ASCII letter,
# digits map to the same digit, and punctuation maps to the ASCII symbol
# preceded by a space.  The string below fixes the insertion order of the keys.
_FULLWIDTH_OFFSET = 0xFEE0
_ASCII_SOURCE = (
    "a"
    "!+-."
    "0123456789"
    "=?"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "bcdefghijklmnopqrstuvwxyz"
)

clean_word_dict = {
    chr(ord(ascii_char) + _FULLWIDTH_OFFSET): (
        ascii_char.lower() if ascii_char.isalnum() else " " + ascii_char
    )
    for ascii_char in _ASCII_SOURCE
}

In [None]:
import re
import string
# Patterns are compiled once instead of on every call.
_URL_RE = re.compile(
    r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
)
_IP_RE = re.compile(
    r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
)
# string.punctuation contains the sequence `\]`.  Interpolated unescaped into a
# character class that sequence is read as an escaped `]`, so the backslash
# itself is never matched.  re.escape makes every punctuation character literal.
_SYMBOLS_RE = re.compile(
    "([" + re.escape(string.punctuation) + "“”¨«»®´·º½¾¿¡§£₤‘’])"
)


def clean_dataset(word, char_map=None):
    """Normalise one comment string before vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. delete http/https URLs;
      3. delete dotted-quad IPv4-looking numbers;
      4. replace every character found in ``char_map`` by its replacement,
         padded with one space on each side;
      5. surround every punctuation/symbol character with spaces.

    Parameters
    ----------
    word : str
        The text to clean.
    char_map : dict[str, str] or None
        Mapping of single characters to replacement strings.  Keys are
        treated as literal characters and must have length 1.  When None,
        the module-level ``clean_word_dict`` is used.

    Returns
    -------
    str
        The cleaned text.
    """
    if char_map is None:
        char_map = clean_word_dict

    word = word.lower()
    word = _URL_RE.sub("", word)
    word = _IP_RE.sub("", word)

    # A single translate pass replaces one regex substitution per entry.
    padded = {source: " " + target + " " for source, target in char_map.items()}
    word = word.translate(str.maketrans(padded))

    return _SYMBOLS_RE.sub(r" \1 ", word)

# Cleaned copy of every comment, one list per split, in the original row order.
train_comments = [clean_dataset(raw_text) for raw_text in train['comment_text']]
test_comments = [clean_dataset(raw_text) for raw_text in test['comment_text']]

In [None]:
# TfidfVectorizer is not imported by any earlier cell, so a fresh kernel would
# stop here with a NameError; import it where it is first used.
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams and bigrams, capped at 10000 features.
# `sublinear_tf` is given as a real boolean rather than the integer 1,
# because the parameter is documented as a bool.
transform_function = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=10000)

In [None]:
# Fit on the cleaned comments, i.e. the same text that is passed to
# transform() afterwards.  Fitting on the raw `vocab` series instead would
# learn the vocabulary and IDF weights from text that still contains the URLs
# and IP addresses that clean_dataset removes.
transform_function.fit(train_comments + test_comments)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=1, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [None]:
# Turn the cleaned comments of each split into a TF-IDF feature matrix.
comments_train, comments_test = (
    transform_function.transform(cleaned_split)
    for cleaned_split in (train_comments, test_comments)
)

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed so that the cross-validation scores and the test predictions are
# the same on every run; the tree ensemble is randomised otherwise.
SEED = 42

losses = []       # never appended to in this cell; name kept in case other cells use it
predictions = []  # one array of positive-class probabilities per label, in class_names order
for class_name in class_names:
    train_target = train[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30, random_state=SEED)

    # Mean ROC AUC over a 2-fold cross-validation on the training matrix.
    fold_scores = cross_val_score(
        classifier, comments_train, train_target, cv=2, scoring='roc_auc')
    score = np.mean(fold_scores)
    print(class_name, score)

    # Fit on the whole training matrix, then keep the probability of the
    # positive class (column 1) for every test row.
    classifier.fit(comments_train, train_target)
    predictions.append(classifier.predict_proba(comments_test)[:, 1])

toxic 0.9483975859448478
severe_toxic 0.9356866512204218
obscene 0.972690700510528
threat 0.8646456662150368
insult 0.9551477410459125
identity_hate 0.8819367764977541


In [None]:
# Ground-truth labels of the test split.
labels_frame = pd.read_csv('/content/drive/My Drive/Colab_data/Data_Mining/Data-Mining-Project/dataset/test_labels.csv')

# Select the label columns by name, in class_names order, so that column i
# always lines up with predictions[i] regardless of the column order in the file.
labels = np.array(labels_frame[class_names])

# Keep only rows whose label values sum to a non-negative number.
# NOTE(review): presumably unscored rows are marked with negative values in
# every label column — confirm against the dataset description.
sum_labels = np.sum(labels, axis=1)
idx = sum_labels >= 0

In [None]:
# Stack the per-label prediction arrays into one matrix (one row per label),
# then keep only the test rows selected by the boolean mask `idx`.
prediction_matrix = np.array(predictions)
preds_consider = prediction_matrix[:, idx]
labels_consider = labels[idx]

# Display both shapes; the two arrays are transposed relative to each other.
(preds_consider.shape, labels_consider.shape)

((6, 63978), (63978, 6))

In [None]:
from sklearn.metrics import roc_auc_score
# Labels hold one column per class and predictions one row per class; make
# sure both describe the same number of classes before pairing them up.
assert labels_consider.shape[1] == preds_consider.shape[0], "label/prediction class count mismatch"

# One ROC AUC per class, iterating over the class axis of both arrays instead
# of a hard-coded count.
scores = [
    roc_auc_score(true_column, predicted_row)
    for true_column, predicted_row in zip(labels_consider.T, preds_consider)
]

# Mean ROC AUC over all classes (displayed as the cell result).
np.mean(scores)

0.9323676675669743