In [1]:
import numpy as np 
import pandas as pd
import os

import pandas as pd
from sklearn.model_selection import KFold
from itertools import combinations

import gc
import torch

#Text Cleaning
from bs4 import BeautifulSoup
import re 

from sentence_transformers import SentenceTransformer, util

In [2]:
# Free Python garbage and any cached CUDA blocks left over from earlier runs
# in this kernel before the sentence-embedding model is loaded.
gc.collect()
if torch.cuda.is_available():
    # Only touch the CUDA API when a device exists; on a CPU-only machine the
    # reset/synchronize calls raise instead of being no-ops.
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats()
    # is its documented replacement and resets all peak counters.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()



# Data

Create comment pairs for the Jigsaw Unintended Bias dataset (the CSV loaded below). We preserve the original scores so that the difference between them (indicating which comment of a pair is more toxic) can be adjusted at modeling time.

In [3]:
# Random seed constant. NOTE(review): no cell visible in this notebook reads it;
# confirm whether it is still needed.
SEED = 2021

In [4]:
# Location of the raw comments CSV (path kept exactly as in the original cell).
DATA_PATH = 'input/jigsaw-unintended-bias-in-toxicity-classification/unintented-bias-dataset.csv'
df = pd.read_csv(DATA_PATH)

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sexual_explicit
0,1083994,He got his money... now he lies in wait till a...,0.373134,0.044776,0.089552,0.014925,0.343284,0.0,0.014925
1,650904,Mad dog will surely put the liberals in mental...,0.605263,0.013158,0.065789,0.065789,0.565789,0.092105,0.013158
2,5902188,And Trump continues his lifelong cowardice by ...,0.666667,0.015873,0.031746,0.0,0.666667,0.047619,0.0
3,7084460,"""while arresting a man for resisting arrest"".\...",0.815789,0.065789,0.552632,0.105263,0.684211,0.0,0.592105
4,5410943,Tucker and Paul are both total bad ass mofo's.,0.55,0.0375,0.3375,0.0,0.4875,0.0375,0.275


In [6]:
# Per-comment toxicity sub-score columns that are combined into the target `y`.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
    'sexual_explicit',
]

In [7]:
# Weight applied to each sub-score column when building the combined target.
regression_weights = dict(
    obscene=0.16,
    toxic=0.32,
    threat=1.5,
    insult=0.64,
    severe_toxic=1.5,
    identity_hate=1.5,
    sexual_explicit=0.3,
)

In [8]:
# Collapse the sub-score columns into one weighted score `y`, then discard them.
# A label missing from `regression_weights` falls back to a weight of 1.
label_weights = pd.Series({name: regression_weights.get(name, 1) for name in labels})
df['y'] = df[labels].mul(label_weights, axis='columns').sum(axis=1)
df = df.drop(columns=labels)

In [9]:
df.head()

Unnamed: 0,id,comment_text,y
0,1083994,He got his money... now he lies in wait till a...,0.447463
1,650904,Mad dog will surely put the liberals in mental...,0.826842
2,5902188,And Trump continues his lifelong cowardice by ...,0.740317
3,7084460,"""while arresting a man for resisting arrest"".\...",1.221579
4,5410943,Tucker and Paul are both total bad ass mofo's.,0.737


In [10]:
def text_cleaning(text):
    '''
    Reduce a raw comment to plain ASCII letters and digits separated by single spaces.

    Steps, in the order they are applied:
    1. Strip embedded URLs (http://, https:// and www. prefixes).
    2. Strip HTML markup, keeping only the text content.
    3. Strip emoji and other pictographic symbols.
    4. Replace every character that is not a letter or digit with a space.
    5. Collapse runs of spaces and trim both ends.

    text - the string to clean.

    Returns the cleaned string (possibly empty).
    '''
    url_re = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_re.sub(r'', text)

    # Parse as HTML and keep only the textual content.
    plain = BeautifulSoup(without_urls, 'lxml').get_text()

    emoji_re = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    without_emoji = emoji_re.sub(r'', plain)

    # Anything that is not a letter or digit becomes a space ...
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    # ... then repeated spaces are squeezed and the ends trimmed.
    return re.sub(' +', ' ', alnum_only).strip()

In [11]:
# Force every comment to a Python str so the regex-based cleaner can process it.
df['comment_text'] = df['comment_text'].map(str)

In [12]:
# Run the cleaner over every comment (element-wise, one Python call per row).
df['comment_text'] = df['comment_text'].map(text_cleaning)



In [13]:
df.shape

(1999516, 3)

In [14]:
# Keep only the first occurrence of each cleaned comment text.
df = df.drop_duplicates(subset=['comment_text'], keep='first')

In [15]:
df.shape

(1958437, 3)

In [16]:
# Temporary lower-cased copy of the text, used for case-insensitive de-duplication.
df = df.assign(comment_text_lower=df['comment_text'].astype(str).str.lower())

In [17]:
# Comments that differ only by letter case count as duplicates; keep the first one.
df = df.drop_duplicates(subset=['comment_text_lower'], keep='first')

In [18]:
# The helper column has served its purpose; remove it again.
df = df.drop(columns=['comment_text_lower'])

In [19]:
df.shape

(1955462, 3)

In [20]:
df.head()

Unnamed: 0,id,comment_text,y
0,1083994,He got his money now he lies in wait till afte...,0.447463
1,650904,Mad dog will surely put the liberals in mental...,0.826842
2,5902188,And Trump continues his lifelong cowardice by ...,0.740317
3,7084460,while arresting a man for resisting arrest If ...,1.221579
4,5410943,Tucker and Paul are both total bad ass mofo s,0.737


In [21]:
# Rebuild a contiguous 0..n-1 index after the de-duplication steps removed rows.
# A later cell looks sentences up by integer id (`sentences[row['toxic1']]`);
# presumably those ids are the positions returned by paraphrase mining, so the
# labels must match positions — TODO confirm.
df = df.reset_index(drop=True)

In [22]:
df.shape

(1955462, 3)

In [23]:
df.head()

Unnamed: 0,id,comment_text,y
0,1083994,He got his money now he lies in wait till afte...,0.447463
1,650904,Mad dog will surely put the liberals in mental...,0.826842
2,5902188,And Trump continues his lifelong cowardice by ...,0.740317
3,7084460,while arresting a man for resisting arrest If ...,1.221579
4,5410943,Tucker and Paul are both total bad ass mofo s,0.737


In [24]:
# Sentence-embedding model used for paraphrase mining.
MODEL_NAME = "paraphrase-mpnet-base-v2"
model = SentenceTransformer(MODEL_NAME)

In [25]:
# Cleaned comments, in the same order (and with the same index) as `df`.
sentences = df['comment_text']

In [26]:
# The recorded run of `util.paraphrase_mining` ended in "CUDA out of memory".
# That helper encodes with `convert_to_tensor=True`, so every embedding stays
# on the GPU: ~1.96M sentences x 768 float32 dims is roughly 6 GB (assuming
# the usual 768-dim output of this model), presumably too much for the
# 7.79 GiB card once the model itself is loaded.
#
# Encode on the model's device but collect the embeddings in host memory.
embeddings = model.encode(
    list(sentences),
    batch_size=8,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Mine the most similar pairs from the host-side embeddings, using the same
# chunking / pair limits as before. Each result is [score, id1, id2] where the
# ids are positions in `sentences`.
# NOTE(review): the similarity search now runs on the CPU, which is slower
# than on the GPU but bounded by host RAM rather than GPU memory.
paraphrases = util.paraphrase_mining_embeddings(
    torch.from_numpy(embeddings),
    corpus_chunk_size=100000,
    max_pairs=200000,
    top_k=30,
)

Batches:   0%|          | 0/244433 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 7.79 GiB total capacity; 5.53 GiB already allocated; 10.06 MiB free; 5.60 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
len(paraphrases)

In [None]:
paraphrases

In [None]:
# One row per mined pair: similarity score plus the two sentence ids.
pair_columns = ['score', 'toxic1', 'toxic2']
df_pair = pd.DataFrame(data=paraphrases, columns=pair_columns)

In [None]:
df_pair.shape

In [None]:
df_pair.head()

In [None]:
# Keep only pairs whose similarity score is below 0.95.
# `.copy()` makes the result an independent frame: later cells assign new
# columns into `df_pair`, which on a boolean-mask slice triggers pandas'
# SettingWithCopyWarning.
df_pair = df_pair[df_pair['score'] < 0.95].copy()

In [None]:
df_pair.shape

In [None]:
df_pair.head()

In [None]:
# Replace the sentence ids in `toxic1` / `toxic2` with the sentence texts.
# Done as one vectorised positional lookup instead of a row-wise
# `apply(axis=1)`: the row-wise version runs a Python call per pair and sees
# each row as a single Series, in which the integer ids are upcast to float
# alongside the float `score`.
# `assign` returns a new frame, so nothing is written into a filtered slice.
sentence_lookup = np.asarray(sentences, dtype=object)
df_pair = df_pair.assign(
    toxic1=sentence_lookup[df_pair['toxic1'].to_numpy(dtype='int64')],
    toxic2=sentence_lookup[df_pair['toxic2'].to_numpy(dtype='int64')],
)

In [None]:
# (Removed dead code: a string literal holding commented-out casts of a `txt`
# column, which neither `df` nor `df_pair` has in this notebook.)

In [None]:
df_pair.head()

In [None]:
# Attach the `id` and `y` columns of the first comment by matching on its text.
scores_by_text = df.set_index('comment_text')
df_pair = df_pair.join(scores_by_text, on='toxic1', how='left')

In [None]:
# Only the score is needed from the join; store it as the first comment's score.
df_pair = df_pair.drop(columns=['id']).rename(columns={"y": "toxic_score1"})

In [None]:
df_pair.shape

In [None]:
df_pair.head()

In [None]:
# Attach the `id` and `y` columns of the second comment by matching on its text.
scores_by_text = df.set_index('comment_text')
df_pair = df_pair.join(scores_by_text, on='toxic2', how='left')

In [None]:
# Only the score is needed from the join; store it as the second comment's score.
df_pair = df_pair.drop(columns=['id']).rename(columns={"y": "toxic_score2"})

In [None]:
df_pair.shape

In [None]:
df_pair.head()

In [None]:
# The similarity score was only needed for filtering; discard it.
df_pair = df_pair.drop(columns=['score'])

In [None]:
df_pair.head()

In [None]:
# A pair with identical scores has no "more toxic" side, so drop it.
scores_differ = df_pair['toxic_score1'].ne(df_pair['toxic_score2'])
df_pair = df_pair.loc[scores_differ]

In [None]:
# Keep the first occurrence of each (toxic1, toxic2) text combination.
df_pair = df_pair.drop_duplicates(subset=['toxic1', 'toxic2'], keep='first')

In [None]:
df_pair.shape

In [None]:
df_pair.head()

In [None]:
# Order every pair so that `toxic1` / `toxic_score1` hold the higher-scoring
# comment: rows where the first score is not greater than the second are swapped.
# Each column is selected on its own so texts stay strings and scores stay
# floats; stacking all four columns into one `np.where` call would build a
# single (4, n) object array and turn the score columns into object dtype.
first_is_more_toxic = (df_pair['toxic_score1'] > df_pair['toxic_score2']).to_numpy()
df_pair = df_pair.assign(
    toxic1=np.where(first_is_more_toxic, df_pair['toxic1'], df_pair['toxic2']),
    toxic2=np.where(first_is_more_toxic, df_pair['toxic2'], df_pair['toxic1']),
    toxic_score1=np.where(first_is_more_toxic, df_pair['toxic_score1'], df_pair['toxic_score2']),
    toxic_score2=np.where(first_is_more_toxic, df_pair['toxic_score2'], df_pair['toxic_score1']),
)

In [None]:
df_pair.head()

In [None]:
# The scores were only needed to order each pair; keep just the two texts.
# `DataFrame.rename` returns a new frame, so its result must be assigned back;
# otherwise the frame (and the CSV written from it) keeps the old
# `toxic1` / `toxic2` headers.
df_pair = (
    df_pair
    .drop(columns=['toxic_score1', 'toxic_score2'])
    .rename(columns={"toxic1": "more_toxic", "toxic2": "less_toxic"})
)
df_pair.head()

In [None]:
# Write the final pairs to disk without the index column.
OUTPUT_PATH = 'jigsaw-unintended-bias-pair.csv'
df_pair.to_csv(OUTPUT_PATH, index=False)