In [16]:
import numpy as np
import pandas as pd
import os, nltk, re, string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [17]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
train_data = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip', index_col='id')
test_data = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip', index_col='id')

pd.set_option('display.max_rows', None)
df = pd.concat([train_data, test_data])

In [19]:
punc = string.punctuation
sw = set(stopwords.words('english'))
lm = WordNetLemmatizer()

def preprocess(data):
    data = data.lower()
    data = re.sub(r'http\S+|www\S+|https\S+', '', data, flags=re.MULTILINE) # Remove URL
    data = re.sub(r'\@\w+|\#','', data) # Remove hash_tag & mention
    data = re.sub('[^a-zA-Z]', ' ', data) # Remove num & special
    # tokenization
    data = word_tokenize(data)
    # stopwords & lemmatization
    data = [lm.lemmatize(word, pos = 'v') for word in data if word not in sw and word not in punc]
    data = [lm.lemmatize(word, pos = 'n') for word in data if word not in sw and word not in punc]
    return data

In [20]:
train_data['text'] = train_data['comment_text'].apply(preprocess)
test_data['text'] = test_data['comment_text'].apply(preprocess)

print(train_data.head())

                                                       comment_text  toxic  \
id                                                                           
0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

                  severe_toxic  obscene  threat  insult  identity_hate  \
id                                                                       
0000997932d777bf             0        0       0       0              0   
000103f0d9cfb60f             0        0       0       0              0   
000113f07ec002fd             0        0       0       0              0   
0001b41b1c6bb37e             0        0       0       0              0   
0001d958c

In [22]:
train, valid = train_test_split(train_data, train_size=0.8, random_state=42)

In [23]:
vec = TfidfVectorizer(ngram_range=(1, 2), 
                      min_df=3, 
                      max_df=0.9, 
                      strip_accents='unicode', 
                      use_idf=1,
                      smooth_idf=1, 
                      sublinear_tf=1,
                      stop_words='english')
trn_term_doc = vec.fit_transform(train_data['comment_text'])
val_term_doc = vec.transform(valid['comment_text'])
test_term_doc = vec.transform(test_data['comment_text'])



In [25]:
x = trn_term_doc
val_x = val_term_doc

epsilon = 1e-9

In [26]:
def probability(y_i, y):
    occurences = x[y == y_i].sum(0)
    
    return (occurences + 1) / ((y == y_i).sum() + 1)

def get_model(y):
    y = y.values
    loga = np.log((probability(1, y) + epsilon) / (probability(0, y) + epsilon) )
    x_loga = x.multiply(loga)
    model = LogisticRegression(C=1.0,
                               penalty='l2',
                               solver='liblinear',
                               max_iter=100,
                               random_state=42)
    
    return model.fit(x_loga, y), loga

In [28]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_labels = train_data.drop([ 'comment_text'], axis = 1)
valid_labels = valid.drop([ 'comment_text'], axis = 1)

In [29]:
model = {}
ROC_AUC_Scores = {}
for i, col in enumerate(classes):
    print(col)

    model_trained, loga = get_model(train_labels[col])
    model[col] = (model_trained, loga)
    preds = model_trained.predict(val_x.multiply(loga)).reshape(-1, 1)

    roc_auc = roc_auc_score(valid_labels[col], preds)
    ROC_AUC_Scores[col] = roc_auc
    
for col, roc_auc in ROC_AUC_Scores.items():
    print(f"ROC AUC for class: '{col}': {roc_auc}")

toxic
severe_toxic
obscene
threat
insult
identity_hate
ROC AUC for class: 'toxic': 0.8831245637980947
ROC AUC for class: 'severe_toxic': 0.8531552581950475
ROC AUC for class: 'obscene': 0.910244434576101
ROC AUC for class: 'threat': 0.9593809443374469
ROC AUC for class: 'insult': 0.8731124892430991
ROC AUC for class: 'identity_hate': 0.8689053085577547


In [31]:
preds = np.zeros((len(test_data), len(classes)))

for i, col in enumerate(classes):
    print(col)
    preds[:, i] = model[col][0].predict_proba(test_term_doc.multiply(model[col][1]))[:, 1]

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [32]:
submid = pd.DataFrame({'id': test_data.index})  # Use index as 'id' column
submission = pd.concat([submid, pd.DataFrame(preds, columns=classes)], axis=1)
submission.to_csv('submission.csv', index=False)

submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999967,0.068848,0.999766,0.00492,0.978091,0.202026
1,0000247867823ef7,0.005525,0.00127,0.003698,0.000424,0.006345,0.001257
2,00013b17ad220c46,0.013263,0.000837,0.004183,0.000417,0.00645,0.001092
3,00017563c3f7919a,0.003494,0.000921,0.003,0.000547,0.00333,0.000715
4,00017695ad8997eb,0.024692,0.001138,0.004749,0.000419,0.007829,0.000953
