In [1]:
import numpy as np 
import pandas as pd
import os
import gc
import torch

#Text Cleaning
from bs4 import BeautifulSoup
import re 

from sentence_transformers import SentenceTransformer, util

In [2]:
# Start from a clean memory state before the embedding model is loaded.
gc.collect()
if torch.cuda.is_available():
    # Release cached blocks held by the CUDA caching allocator.
    torch.cuda.empty_cache()
    # reset_peak_memory_stats() resets every peak counter; the deprecated
    # reset_max_memory_allocated() only forwards to it, so it is not called again.
    torch.cuda.reset_peak_memory_stats()
    # Wait for outstanding GPU work to finish before continuing.
    torch.cuda.synchronize()



In [3]:
def get_classification_dataset_as(task="binary", regression_weights=None,
                                  path='input/jigsaw-toxic-comment-classification-challenge/classification-dataset.csv'):
    """
    Load the Jigsaw classification CSV and collapse its label columns into a single target `y`.

    Args:
        task: one of ['binary', 'regression'].
            'binary'     -> y is 1 when the label columns of a row sum to more than 0, else 0.
            'regression' -> y is the (optionally weighted) sum of the label columns.
        regression_weights: dictionary {label => weight} or None. Only used when
            task == 'regression'. Labels missing from the dictionary get weight 1;
            None gives the plain unweighted sum.
        path: location of the CSV file. Defaults to the competition input path
            that was previously hard-coded.

    Returns:
        DataFrame with the six label columns removed and a new column `y` appended.

    Raises:
        AssertionError: if `task` is not one of the supported values.
    """
    assert task in ['binary', 'regression'], f"unsupported task: {task!r}"

    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    df = pd.read_csv(path)

    if task == 'binary':
        target = (df[labels].sum(axis=1) > 0).astype(int)
    elif regression_weights is None:
        target = df[labels].sum(axis=1)
    else:
        # Scale each label column by its weight (default 1), then add them row-wise.
        weighted_columns = [regression_weights.get(label, 1) * df[label] for label in labels]
        target = pd.concat(weighted_columns, axis=1).sum(axis=1)

    # The label columns are dropped for both tasks; `y` ends up as the last column.
    df = df.drop(labels, axis=1)
    df['y'] = target
    return df

In [4]:
# Build the Jigsaw frame with a weighted-sum target: each label column
# contributes to `y` according to the weight given here.
df_jigsaw = get_classification_dataset_as(
    'regression',
    regression_weights={
        'obscene': 0.16,
        'toxic': 0.32,
        'threat': 1.5,
        'insult': 0.64,
        'severe_toxic': 1.5,
        'identity_hate': 1.5,
    },
)

In [5]:
# Preview the first rows of the Jigsaw frame with its weighted target `y`.
df_jigsaw.head()

Unnamed: 0,id,comment_text,y
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0


In [6]:
# Load the Ruddit comments, keeping only the id, text and offensiveness columns.
ruddit_columns = ['comment_id', 'txt', 'offensiveness_score']
df_ruddit = pd.read_csv('input/ruddit-jigsaw-dataset/ruddit_with_text.csv')[ruddit_columns]
# Drop rows whose text is only a '[deleted]' / '[removed]' placeholder.
df_ruddit = df_ruddit.loc[~df_ruddit.txt.isin(['[deleted]', '[removed]'])]

In [7]:
# Preview the Ruddit frame after the placeholder rows were filtered out.
df_ruddit.head()

Unnamed: 0,comment_id,txt,offensiveness_score
0,cza1q49,> The difference in average earnings between m...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that's not what they're mea...,-0.083
5,cza2jj3,>Men and women are not payed less for the same...,-0.042


In [8]:
# Inspect the raw Jigsaw comment text before any cleaning is applied.
df_jigsaw.comment_text

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
223544    :Jerome, I see you never got around to this…! ...
223545    ==Lucky bastard== \n http://wikimediafoundatio...
223546    ==shame on you all!!!== \n\n You want to speak...
223547    MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...
223548    " \n\n == Unicorn lair discovery == \n\n Suppo...
Name: comment_text, Length: 223549, dtype: object

In [9]:
# Patterns are compiled once, when the cell runs, instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes embedded URL links
    2. Removes HTML tags
    3. Replaces emojis with a space
    4. Replaces special characters like &, #, etc. with a space
    5. Collapses extra spaces and strips both ends

    text - Text piece to be cleaned. None and float NaN are treated as empty
           text; any other non-string value is converted with str().

    Returns the cleaned string (possibly empty).
    '''
    if text is None or (isinstance(text, float) and text != text):
        # Missing value (None / NaN): nothing to clean.
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = _URL_PATTERN.sub(r'', text)  # Removes website links

    soup = BeautifulSoup(text, 'lxml')  # Removes HTML tags
    text = soup.get_text()

    # Substitute a space, not '': the character class above spans far more than
    # emoji, and deleting a match outright would glue the neighbouring words
    # together ("great<emoji>thanks" -> "greatthanks").
    text = _EMOJI_PATTERN.sub(' ', text)

    text = _NON_ALNUM_PATTERN.sub(" ", text)  # Remove special characters
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # Remove extra spaces
    return text.strip()  # remove spaces at the beginning and at the end of string

In [10]:
# Force every comment to a Python str so the cleaning step never sees a non-string value.
df_jigsaw["comment_text"] = df_jigsaw["comment_text"].map(str)

In [11]:
# Run the same cleaning routine over the text column of both corpora.
df_jigsaw["comment_text"] = df_jigsaw["comment_text"].map(text_cleaning)
df_ruddit["txt"] = df_ruddit["txt"].map(text_cleaning)

In [12]:
# Preview the Jigsaw comments after text_cleaning has been applied.
df_jigsaw.head()

Unnamed: 0,id,comment_text,y
0,0000997932d777bf,Explanation Why the edits made under my userna...,0.0
1,000103f0d9cfb60f,D aww He matches this background colour I m se...,0.0
2,000113f07ec002fd,Hey man I m really not trying to edit war It s...,0.0
3,0001b41b1c6bb37e,More I can t make any real suggestions on impr...,0.0
4,0001d958c54c6e35,You sir are my hero Any chance you remember wh...,0.0


In [13]:
# Preview the Ruddit comments after text_cleaning has been applied.
df_ruddit.head()

Unnamed: 0,comment_id,txt,offensiveness_score
0,cza1q49,The difference in average earnings between men...,-0.083
1,cza1wdh,The myth is that the gap is entirely based on ...,-0.022
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that s not what they re mea...,-0.083
5,cza2jj3,Men and women are not payed less for the same ...,-0.042


In [14]:
# Offensiveness scores as a plain array, indexed by row position rather than by
# the frame's index labels (the filtered frame's labels have gaps).
ru_toxic_score = df_ruddit["offensiveness_score"].values

In [15]:
# Show the extracted Ruddit offensiveness scores.
ru_toxic_score

array([-0.083, -0.022, -0.146, ..., -0.292,  0.333, -0.625])

In [16]:
# Load the pretrained sentence-embedding model used to encode both corpora.
model = SentenceTransformer("paraphrase-mpnet-base-v2")

In [17]:
# Show the model's current maximum sequence length setting.
model.max_seq_length

512

In [18]:
#model.max_seq_length = 512

In [19]:
# Text to embed. The Ruddit frame was filtered, so its index has gaps; it is
# renumbered 0..n-1 here so positions and labels coincide.
sentences_jigsaw = df_jigsaw["comment_text"]
sentences_ruddit = df_ruddit["txt"].reset_index(drop=True)

In [20]:
# Inspect the cleaned Jigsaw sentences that will be embedded.
sentences_jigsaw

0         Explanation Why the edits made under my userna...
1         D aww He matches this background colour I m se...
2         Hey man I m really not trying to edit war It s...
3         More I can t make any real suggestions on impr...
4         You sir are my hero Any chance you remember wh...
                                ...                        
223544    Jerome I see you never got around to this I m ...
223545    Lucky bastard Heh you are famous now I kida en...
223546    shame on you all You want to speak about gays ...
223547    MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...
223548    Unicorn lair discovery Supposedly a unicorn la...
Name: comment_text, Length: 223549, dtype: object

In [21]:
# Inspect the cleaned, re-indexed Ruddit sentences that will be embedded.
sentences_ruddit

0       The difference in average earnings between men...
1       The myth is that the gap is entirely based on ...
2       The assertion is that women get paid less for ...
3       You said in the OP that s not what they re mea...
4       Men and women are not payed less for the same ...
                              ...                        
5705    They should only censor things that talk badly...
5706    and one of them is a woman OH SHIT we better b...
5707                    how is this flared as US politics
5708    People in Hong Kong must decide if they are go...
5709    I know this is an old post but I saw him last ...
Name: txt, Length: 5710, dtype: object

In [22]:
# Show one cleaned example from each corpus, separated by a blank line.
print(sentences_jigsaw[4], sentences_ruddit[4], sep="\n\n")

You sir are my hero Any chance you remember what page that s on

Men and women are not payed less for the same job I don t think many people say this is the case though I think this is a misconception on the other side not something feminists have said Even if women have a lower average wage in all jobs than men I mean that s even bigger than individual cases This shows that women are in an inferior position more than if a woman were to get literally 30 cents less Because averages show the bigger picture Edit Changed 70 to 30


In [23]:
# Encode every Jigsaw comment into a sentence embedding (returned as one tensor).
# encode() is documented for a str or a list of str, so a plain list is passed
# rather than the pandas Series; the result then does not depend on the Series'
# integer index labels matching row positions.
embeddings_jigsaw = model.encode(sentences_jigsaw.tolist(), show_progress_bar = True, convert_to_tensor=True)

Batches:   0%|          | 0/6986 [00:00<?, ?it/s]

In [24]:
# Place the Jigsaw embeddings on the GPU for the similarity search.
# The normalisation step below is intentionally left disabled.
embeddings_jigsaw = embeddings_jigsaw.to('cuda')
#embeddings = util.normalize_embeddings(embeddings_jigsaw)

In [25]:
# Encode every Ruddit comment into a sentence embedding (returned as one tensor).
# encode() is documented for a str or a list of str, so a plain list is passed
# rather than the pandas Series; the result then does not depend on the Series'
# integer index labels matching row positions.
embeddings_ruddit = model.encode(sentences_ruddit.tolist(), show_progress_bar = True, convert_to_tensor=True)

Batches:   0%|          | 0/179 [00:00<?, ?it/s]

In [26]:
# Place the Ruddit embeddings on the GPU for the similarity search.
# The normalisation step below is intentionally left disabled.
embeddings_ruddit = embeddings_ruddit.to('cuda')
#embeddings_ruddit = util.normalize_embeddings(embeddings_ruddit)

In [27]:
# Number of nearest Ruddit neighbours retrieved (and later averaged) per Jigsaw comment.
top_K = 10

In [28]:
# Jigsaw embeddings are the queries, Ruddit embeddings the corpus: for every
# Jigsaw comment, retrieve its top_K most similar Ruddit comments.
hits = util.semantic_search(embeddings_jigsaw, embeddings_ruddit, top_k=top_K)

In [29]:
# Matches for the first Jigsaw comment; each hit is a dict with a corpus_id
# (position in the Ruddit corpus) and a similarity score.
hits[0][0:10]

[{'corpus_id': 641, 'score': 0.48377907276153564},
 {'corpus_id': 3788, 'score': 0.4662332832813263},
 {'corpus_id': 431, 'score': 0.46103063225746155},
 {'corpus_id': 2162, 'score': 0.45483699440956116},
 {'corpus_id': 674, 'score': 0.45436087250709534},
 {'corpus_id': 2218, 'score': 0.45334410667419434},
 {'corpus_id': 647, 'score': 0.44764620065689087},
 {'corpus_id': 5699, 'score': 0.42927515506744385},
 {'corpus_id': 927, 'score': 0.42191389203071594},
 {'corpus_id': 3797, 'score': 0.4212823510169983}]

In [30]:
def check(idx, top_K):
    """
    Average the Ruddit offensiveness score over the nearest neighbours of one query.

    Reads two module-level names: `hits` (one list of {'corpus_id', 'score'}
    dicts per query) and `ru_toxic_score` (offensiveness score per corpus
    position).

    Args:
        idx: position of the query in `hits`.
        top_K: maximum number of neighbours to average over.

    Returns:
        Mean offensiveness score of the neighbours actually used. NaN when no
        neighbour is available (empty hit list or top_K <= 0).
    """
    # A query may have fewer than top_K hits; slice rather than index so a short
    # list no longer raises IndexError, and divide by the true neighbour count.
    neighbours = hits[idx][:max(top_K, 0)]
    if not neighbours:
        return float("nan")

    total = 0
    for hit in neighbours:
        total += ru_toxic_score[hit["corpus_id"]]
    return total / len(neighbours)

In [31]:
# One averaged neighbour score per Jigsaw comment, in row order.
n_comments = len(df_jigsaw)
sentence_sore = np.fromiter(
    (check(row, top_K) for row in range(n_comments)),
    dtype=np.float64,
    count=n_comments,
)

In [32]:
# Work on a copy so the new column does not alter df_jigsaw.
df = df_jigsaw.copy()

In [33]:
# Attach the averaged neighbour score to each Jigsaw row (assigned by position), then preview.
df['similarity_score'] = sentence_sore
df.head()

Unnamed: 0,id,comment_text,y,similarity_score
0,0000997932d777bf,Explanation Why the edits made under my userna...,0.0,-0.1285
1,000103f0d9cfb60f,D aww He matches this background colour I m se...,0.0,-0.0366
2,000113f07ec002fd,Hey man I m really not trying to edit war It s...,0.0,0.2383
3,0001b41b1c6bb37e,More I can t make any real suggestions on impr...,0.0,-0.2054
4,0001d958c54c6e35,You sir are my hero Any chance you remember wh...,0.0,-0.5708


In [34]:
#df['score'] = df['score'].rank(method='first')
#df.head()

In [35]:
#df.drop('text', axis=1, inplace=True)
# Write the scored frame to disk without the index column.
df.to_csv("re-ranking-jigsaw-toxic-comment-classification.csv", index=False)