In [1]:
import numpy as np 
import pandas as pd
import os
import gc
import torch

#Text Cleaning
from bs4 import BeautifulSoup
import re 

from sentence_transformers import SentenceTransformer, util

In [2]:
# Start from a clean slate: collect unreachable Python objects first so that
# any GPU tensors they hold can be released by the caching allocator.
gc.collect()

# Guard the CUDA calls so the notebook also runs on a machine without a GPU.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() was dropped: it is deprecated and only
    # forwards to reset_peak_memory_stats(), which is called here.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()



In [3]:
# Load the Jigsaw comments together with their per-category toxicity scores.
jigsaw_csv = 'input/jigsaw-unintended-bias-in-toxicity-classification/unintented-bias-dataset.csv'
df_jigsaw = pd.read_csv(jigsaw_csv)

In [4]:
# Preview the first rows of the raw Jigsaw frame.
df_jigsaw.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sexual_explicit
0,1083994,He got his money... now he lies in wait till a...,0.373134,0.044776,0.089552,0.014925,0.343284,0.0,0.014925
1,650904,Mad dog will surely put the liberals in mental...,0.605263,0.013158,0.065789,0.065789,0.565789,0.092105,0.013158
2,5902188,And Trump continues his lifelong cowardice by ...,0.666667,0.015873,0.031746,0.0,0.666667,0.047619,0.0
3,7084460,"""while arresting a man for resisting arrest"".\...",0.815789,0.065789,0.552632,0.105263,0.684211,0.0,0.592105
4,5410943,Tucker and Paul are both total bad ass mofo's.,0.55,0.0375,0.3375,0.0,0.4875,0.0375,0.275


In [5]:
# Load the Ruddit data, keeping only the id, text and offensiveness columns.
ruddit_raw = pd.read_csv('input/ruddit-jigsaw-dataset/ruddit_with_text.csv')[['comment_id', 'txt', 'offensiveness_score']]

# Drop rows whose text is only the '[deleted]' / '[removed]' placeholder.
has_text = (ruddit_raw.txt != '[deleted]') & (ruddit_raw.txt != '[removed]')

# .copy() makes df_ruddit an independent frame, so assigning to its columns
# later on does not operate on a slice of ruddit_raw (SettingWithCopyWarning).
df_ruddit = ruddit_raw.loc[has_text].copy()

In [6]:
# Preview the first rows of the filtered Ruddit frame.
df_ruddit.head()

Unnamed: 0,comment_id,txt,offensiveness_score
0,cza1q49,> The difference in average earnings between m...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that's not what they're mea...,-0.083
5,cza2jj3,>Men and women are not payed less for the same...,-0.042


In [7]:
# Show the raw comment texts before any cleaning is applied.
df_jigsaw.comment_text

0          He got his money... now he lies in wait till a...
1          Mad dog will surely put the liberals in mental...
2          And Trump continues his lifelong cowardice by ...
3          "while arresting a man for resisting arrest".\...
4             Tucker and Paul are both total bad ass mofo's.
                                 ...                        
1999511    Another man shamming article. If white men did...
1999512    "no matter what is put in front of you regardi...
1999513    The Democrat party aided and abetted by it's M...
1999514    I just don't find her a very good representati...
1999515    You know the Trump fanatics are trolling the G...
Name: comment_text, Length: 1999516, dtype: object

In [8]:
# Patterns are compiled once here rather than on every call, because
# text_cleaning is applied element-wise to whole DataFrame columns.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:
    1. Removes HTML tags
    2. Removes embedded URL links
    3. Removes emojis
    4. Replaces special characters like &, #, etc. with a space
    5. Collapses repeated spaces and trims both ends

    text - Text piece to be cleaned. None and float NaN are treated as
           missing and yield ''; any other non-string value is converted
           with str() first.

    Returns the cleaned string.
    '''
    # Missing values carry no text (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if not text:
        return ''  # nothing to clean

    # Strip markup BEFORE removing links. The URL pattern consumes every
    # non-space character after "http://", so running it on raw HTML such as
    # <a href="http://x.com">label</a> would also swallow the closing quote
    # and tag, leaving broken markup and losing the visible link text.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = _URL_PATTERN.sub('', text)           # remove website links
    text = _EMOJI_PATTERN.sub('', text)         # remove emojis
    text = _NON_ALNUM_PATTERN.sub(' ', text)    # special characters -> space
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # collapse runs of spaces

    # Remove spaces at the beginning and at the end of the string.
    return text.strip()

In [9]:
# Make sure every comment is a Python string before it is cleaned.
df_jigsaw['comment_text'] = df_jigsaw['comment_text'].apply(str)

In [10]:
# Run the same cleaning routine over the text column of both datasets.
jigsaw_cleaned = df_jigsaw['comment_text'].apply(text_cleaning)
df_jigsaw['comment_text'] = jigsaw_cleaned

ruddit_cleaned = df_ruddit['txt'].apply(text_cleaning)
df_ruddit['txt'] = ruddit_cleaned



In [11]:
# Preview the Jigsaw frame again to inspect the cleaned comment_text column.
df_jigsaw.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sexual_explicit
0,1083994,He got his money now he lies in wait till afte...,0.373134,0.044776,0.089552,0.014925,0.343284,0.0,0.014925
1,650904,Mad dog will surely put the liberals in mental...,0.605263,0.013158,0.065789,0.065789,0.565789,0.092105,0.013158
2,5902188,And Trump continues his lifelong cowardice by ...,0.666667,0.015873,0.031746,0.0,0.666667,0.047619,0.0
3,7084460,while arresting a man for resisting arrest If ...,0.815789,0.065789,0.552632,0.105263,0.684211,0.0,0.592105
4,5410943,Tucker and Paul are both total bad ass mofo s,0.55,0.0375,0.3375,0.0,0.4875,0.0375,0.275


In [12]:
# Preview the Ruddit frame again to inspect the cleaned txt column.
df_ruddit.head()

Unnamed: 0,comment_id,txt,offensiveness_score
0,cza1q49,The difference in average earnings between men...,-0.083
1,cza1wdh,The myth is that the gap is entirely based on ...,-0.022
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that s not what they re mea...,-0.083
5,cza2jj3,Men and women are not payed less for the same ...,-0.042


In [13]:
# Offensiveness scores as a plain array, addressed by row position rather
# than by the (gappy) DataFrame index.
ru_toxic_score = df_ruddit.offensiveness_score.values

In [14]:
# Display the extracted Ruddit offensiveness scores.
ru_toxic_score

array([-0.083, -0.022, -0.146, ..., -0.292,  0.333, -0.625])

In [15]:
# Sentence-embedding model used to encode both datasets.
model_name = "paraphrase-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [16]:
# Show the model's configured maximum sequence length.
model.max_seq_length

512

In [17]:
#model.max_seq_length = 512

In [18]:
# Text columns that will be embedded. The Ruddit index has gaps left by the
# row filter, so it is renumbered 0..n-1 here.
sentences_jigsaw = df_jigsaw['comment_text']
sentences_ruddit = df_ruddit['txt'].reset_index(drop=True)

In [19]:
# Display the cleaned Jigsaw sentences that will be embedded.
sentences_jigsaw

0          He got his money now he lies in wait till afte...
1          Mad dog will surely put the liberals in mental...
2          And Trump continues his lifelong cowardice by ...
3          while arresting a man for resisting arrest If ...
4              Tucker and Paul are both total bad ass mofo s
                                 ...                        
1999511    Another man shamming article If white men did ...
1999512    no matter what is put in front of you regardin...
1999513    The Democrat party aided and abetted by it s M...
1999514    I just don t find her a very good representati...
1999515    You know the Trump fanatics are trolling the G...
Name: comment_text, Length: 1999516, dtype: object

In [20]:
# Display the cleaned Ruddit sentences (index renumbered from 0).
sentences_ruddit

0       The difference in average earnings between men...
1       The myth is that the gap is entirely based on ...
2       The assertion is that women get paid less for ...
3       You said in the OP that s not what they re mea...
4       Men and women are not payed less for the same ...
                              ...                        
5705    They should only censor things that talk badly...
5706    and one of them is a woman OH SHIT we better b...
5707                    how is this flared as US politics
5708    People in Hong Kong must decide if they are go...
5709    I know this is an old post but I saw him last ...
Name: txt, Length: 5710, dtype: object

In [21]:
# Print one cleaned example from each dataset, separated by a blank line.
print(sentences_jigsaw[4], sentences_ruddit[4], sep="\n\n")

Tucker and Paul are both total bad ass mofo s

Men and women are not payed less for the same job I don t think many people say this is the case though I think this is a misconception on the other side not something feminists have said Even if women have a lower average wage in all jobs than men I mean that s even bigger than individual cases This shows that women are in an inferior position more than if a woman were to get literally 30 cents less Because averages show the bigger picture Edit Changed 70 to 30


In [22]:
# Embed every Jigsaw comment. The Series is converted to a plain list first:
# encode() is documented to take a string or a list of strings, and a list
# keeps its internal positional indexing independent of the Series' index
# labels.
embeddings_jigsaw = model.encode(
    list(sentences_jigsaw),
    show_progress_bar=True,
    convert_to_tensor=True,
)

Batches:   0%|          | 0/62485 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Put the Jigsaw embeddings on the GPU when one is available. On a CPU-only
# machine they stay on the CPU instead of raising on a hard-coded 'cuda'.
jigsaw_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings_jigsaw = embeddings_jigsaw.to(jigsaw_device)
#embeddings = util.normalize_embeddings(embeddings_jigsaw)

In [None]:
# Embed every Ruddit comment. A plain list is passed because encode() is
# documented to take a string or a list of strings, and a list keeps its
# internal positional indexing independent of the Series' index labels.
embeddings_ruddit = model.encode(
    list(sentences_ruddit),
    show_progress_bar=True,
    convert_to_tensor=True,
)

In [None]:
# Put the Ruddit embeddings on the GPU when one is available. On a CPU-only
# machine they stay on the CPU instead of raising on a hard-coded 'cuda'.
ruddit_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings_ruddit = embeddings_ruddit.to(ruddit_device)
#embeddings_ruddit = util.normalize_embeddings(embeddings_ruddit)

In [None]:
# Number of nearest Ruddit comments retrieved, and later averaged, per Jigsaw comment.
top_K = 10

In [None]:
# For each Jigsaw embedding (query), retrieve the top_K closest Ruddit
# embeddings (corpus).
hits = util.semantic_search(
    query_embeddings=embeddings_jigsaw,
    corpus_embeddings=embeddings_ruddit,
    top_k=top_K,
)

In [None]:
# Inspect up to ten retrieved hits for the first Jigsaw comment.
first_query_hits = hits[0]
first_query_hits[:10]

In [None]:
def check(idx, top_K, search_hits=None, scores=None):
    '''
    Mean score of the corpus entries retrieved for one query.

    idx         - Position of the query in the search results.
    top_K       - Maximum number of retrieved hits to average over.
    search_hits - Optional search results: one list of hit dicts per query,
                  each dict holding a "corpus_id". Defaults to the
                  notebook-level `hits`.
    scores      - Optional sequence of scores indexed by corpus id. Defaults
                  to the notebook-level `ru_toxic_score`.

    Returns the average score of the hits actually available (at most
    top_K), or NaN when there are none.
    '''
    if search_hits is None:
        search_hits = hits
    if scores is None:
        scores = ru_toxic_score

    # A query can come back with fewer than top_K hits; slicing avoids the
    # IndexError that indexing positions 0..top_K-1 would raise.
    neighbours = search_hits[idx][:max(top_K, 0)]
    if not neighbours:
        return float('nan')

    total = 0
    for hit in neighbours:
        total += scores[hit["corpus_id"]]

    # Divide by the number of hits used, not by the requested top_K.
    return total / len(neighbours)

In [None]:
# One averaged Ruddit score per Jigsaw comment, in row order.
n_comments = len(df_jigsaw)
sentence_sore = np.empty(n_comments)
sentence_sore[:] = [check(row, top_K) for row in range(n_comments)]

In [None]:
# Work on an independent copy so df_jigsaw itself stays unchanged.
df = df_jigsaw.copy(deep=True)

In [None]:
# Attach the averaged neighbour score to each row and preview the result.
df['similarity_score'] = sentence_sore
df.head()

In [None]:
#df['score'] = df['score'].rank(method='first')
#df.head()

In [None]:
#df.drop('text', axis=1, inplace=True)
# Write the scored frame to disk without the index column.
output_csv = "re-ranking-jigsaw-unintended-bias.csv"
df.to_csv(output_csv, index=False)