In [1]:
import numpy as np 
import pandas as pd
import os

import pandas as pd
from sklearn.model_selection import KFold
from itertools import combinations

import gc
import torch

#Text Cleaning
from bs4 import BeautifulSoup
import re 

from sentence_transformers import SentenceTransformer, util

In [2]:
# Free Python garbage first so tensors that are no longer referenced can be released.
gc.collect()

# Only touch the CUDA allocator when a GPU is present; synchronize() and the
# peak-stat reset raise on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() is deprecated and simply forwards to
    # reset_peak_memory_stats(), so call the supported function directly.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()



# Data

Create all pairs for the Ruddit dataset. We preserve the original scores so that the difference between them (which indicates which comment of a pair is more toxic) can be adjusted at modeling time.

In [3]:
# Presumably a random seed for reproducibility; no cell visible in this notebook reads it.
# NOTE(review): confirm whether it is still needed or should be passed to a seeded step.
SEED = 2021

In [4]:
# Load the full Ruddit CSV (all columns) for a first look; the same file is re-read
# with only the needed columns in a later cell.
df = pd.read_csv('input/ruddit-jigsaw-dataset/ruddit_with_text.csv')

In [5]:
df.head()

Unnamed: 0,post_id,comment_id,txt,url,offensiveness_score
0,42g75o,cza1q49,> The difference in average earnings between m...,https://www.reddit.com/r/changemyview/comments...,-0.083
1,42g75o,cza1wdh,"The myth is that the ""gap"" is entirely based o...",https://www.reddit.com/r/changemyview/comments...,-0.022
2,42g75o,cza23qx,[deleted],https://www.reddit.com/r/changemyview/comments...,0.167
3,42g75o,cza2bw8,The assertion is that women get paid less for ...,https://www.reddit.com/r/changemyview/comments...,-0.146
4,42g75o,cza2iji,You said in the OP that's not what they're mea...,https://www.reddit.com/r/changemyview/comments...,-0.083


In [6]:
# ruddit: keep only the id, text and score columns, and discard comments whose
# text is just the '[deleted]' / '[removed]' placeholder
ruddit_columns = ['comment_id', 'txt', 'offensiveness_score']
df = pd.read_csv('input/ruddit-jigsaw-dataset/ruddit_with_text.csv')[ruddit_columns]
is_placeholder = df['txt'].isin(['[deleted]', '[removed]'])
# Renumber rows 0..n-1 after filtering.
df = df.loc[~is_placeholder].reset_index(drop=True)
df.head()

Unnamed: 0,comment_id,txt,offensiveness_score
0,cza1q49,> The difference in average earnings between m...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
2,cza2bw8,The assertion is that women get paid less for ...,-0.146
3,cza2iji,You said in the OP that's not what they're mea...,-0.083
4,cza2jj3,>Men and women are not payed less for the same...,-0.042


In [7]:
def text_cleaning(text):
    '''
    Reduces a raw comment to plain alphanumeric words separated by single spaces.
    The steps are applied in this order:-
    1. Removes embedded URL links (http://, https:// and www. forms)
    2. Removes HTML tags (the text is parsed with BeautifulSoup/lxml and only its text is kept)
    3. Removes characters that fall in the emoji / pictograph ranges listed below
    4. Replaces every character that is not a letter a-z/A-Z or a digit with a space
    5. Collapses repeated spaces and trims both ends

    text - Text piece to be cleaned.

    Returns the cleaned text.
    '''
    url_regex = r'https?://\S+|www\.\S+'
    emoji_regex = ("["
                   u"\U0001F600-\U0001F64F"  # emoticons
                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                   u"\U00002702-\U000027B0"
                   u"\U000024C2-\U0001F251"
                   "]+")

    # Links go first, before the markup is parsed.
    without_links = re.sub(url_regex, r'', text)

    # Keep only the textual content of any HTML markup.
    plain = BeautifulSoup(without_links, 'lxml').get_text()

    # Emoji are deleted outright (no space is inserted in their place).
    without_emoji = re.sub(emoji_regex, r'', plain, flags=re.UNICODE)

    # Every remaining non-alphanumeric character becomes a space ...
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)

    # ... then runs of spaces are squeezed and the ends trimmed.
    return re.sub(' +', ' ', alnum_only).strip()

In [8]:
# Clean every comment text element-wise; the column is overwritten with the cleaned version.
df['txt'] = df['txt'].map(text_cleaning)

In [9]:
df.shape

(5710, 3)

In [10]:
# Sentence-embedding model handed to util.paraphrase_mining in a later cell.
# NOTE(review): the model is referenced by name only; presumably it is fetched on first use — confirm for offline runs.
model = SentenceTransformer("paraphrase-mpnet-base-v2")

In [11]:
# Cleaned comment texts as a Series. Its index was reset to 0..n-1 after filtering,
# which is what lets the integer ids returned by paraphrase mining be looked up in it later.
sentences = df.txt

In [12]:
# Mine the most similar comment pairs. The result is a list of [score, i, j]
# entries sorted by decreasing similarity, where i and j identify the two sentences.
# Pass a plain list of strings (what the API expects) rather than the Series,
# so the call does not depend on the Series having a 0..n-1 index.
paraphrases = util.paraphrase_mining(
    model,
    sentences.tolist(),
    show_progress_bar=True,
    corpus_chunk_size=len(sentences),  # compare against the whole corpus in one chunk
    max_pairs=4000,
    top_k=30,
)

Batches:   0%|          | 0/179 [00:00<?, ?it/s]

In [13]:
len(paraphrases)

2000

In [14]:
# Preview only the strongest matches instead of dumping the whole list into the output.
paraphrases[:10]

[[0.9831713438034058, 2611, 2632],
 [0.9762998819351196, 2734, 2735],
 [0.9759928584098816, 5365, 5366],
 [0.9724558591842651, 3158, 3375],
 [0.9507856369018555, 2206, 4945],
 [0.9316749572753906, 142, 145],
 [0.9258538484573364, 1319, 1344],
 [0.9218746423721313, 2206, 4327],
 [0.9186232686042786, 5079, 5084],
 [0.9120160341262817, 2555, 2558],
 [0.9118480086326599, 3379, 3388],
 [0.9091827273368835, 162, 165],
 [0.9091631174087524, 1847, 1851],
 [0.9086726903915405, 4327, 4945],
 [0.9075998663902283, 1859, 1861],
 [0.905371904373169, 4891, 4892],
 [0.9038047194480896, 5192, 5193],
 [0.8919681310653687, 138, 139],
 [0.8904440402984619, 1361, 1376],
 [0.8857927918434143, 242, 256],
 [0.8838163018226624, 2, 7],
 [0.878718376159668, 3238, 3248],
 [0.8777272701263428, 34, 43],
 [0.8765317797660828, 256, 263],
 [0.8736593127250671, 4328, 4329],
 [0.8718640804290771, 4186, 4187],
 [0.8698402643203735, 135, 141],
 [0.8695032000541687, 191, 219],
 [0.8688641786575317, 188, 191],
 [0.866674065

In [15]:
# One row per mined pair: the similarity score plus the two sentence ids
# (the ids are replaced by the comment texts in a following cell).
df_pair = pd.DataFrame(paraphrases, columns=['score', 'toxic1', 'toxic2'])

In [16]:
df_pair.head()

Unnamed: 0,score,toxic1,toxic2
0,0.983171,2611,2632
1,0.9763,2734,2735
2,0.975993,5365,5366
3,0.972456,3158,3375
4,0.950786,2206,4945


In [17]:
# The ids produced by paraphrase mining are positions in `sentences`, so resolve
# them positionally and in one vectorised step. (A row-wise apply casts each row
# to float because of the `score` column and then relies on label lookup.)
# .to_numpy() on the result assigns by position instead of aligning on the index.
# Re-running this cell after the ids were replaced by text fails loudly.
df_pair['toxic1'] = sentences.iloc[df_pair['toxic1'].to_numpy()].to_numpy()
df_pair['toxic2'] = sentences.iloc[df_pair['toxic2'].to_numpy()].to_numpy()

In [18]:
# Disabled experiment kept as a bare string literal (the notebook echoes it as the cell output).
# NOTE(review): df_pair is built with the columns score/toxic1/toxic2 and has no `txt`
# column, so the second statement would fail if re-enabled; consider deleting this cell.
'''df.txt = df.txt.apply(str)
df_pair.txt = df_pair.txt.apply(str)
df_pair.toxic2 = df_pair.toxic2.apply(str)'''

'df.txt = df.txt.apply(str)\ndf_pair.txt = df_pair.txt.apply(str)\ndf_pair.toxic2 = df_pair.toxic2.apply(str)'

In [19]:
df_pair.head()

Unnamed: 0,score,toxic1,toxic2
0,0.983171,598s or 600s for gaming I spend about 70 of my...,Posting this again for more input 598s or 600s...
1,0.9763,if by invade you mean get ganked 4v1 with the ...,if by invade you mean get ganked 4v1 with the ...
2,0.975993,Mark 10 25 It is easier for a camel to go thro...,Mark 10 25 It is easier for a camel to go thro...
3,0.972456,As a reminder this subreddit is for civil disc...,As a reminder this subreddit is for civil disc...
4,0.950786,what what the fuck did i just read,What the hell did I just read


In [20]:
# Discard the pairs whose similarity score is 0.95 or higher.
df_pair = df_pair.loc[df_pair['score'].lt(0.95)]

In [21]:
df_pair.head()

Unnamed: 0,score,toxic1,toxic2
5,0.931675,Banning guns and explosives doesn t make a sta...,Banning guns and explosives doesn t make a sta...
6,0.925854,I m saying it hasn t been researched and yet t...,I m saying it hasn t been researched and yet t...
7,0.921875,what what the fuck did i just read,The fuck did I just read
8,0.918623,Leading Britain off a cliff and cheating their...,It s almost as if these leave voting money gru...
9,0.912016,Bulgaria Croatia Czech Republic Denmark Hungar...,Bulgaria Croatia Czech Republic Denmark Hungar...


In [22]:
# Attach the Ruddit columns of the first comment by matching on its cleaned text.
# The join key must be unique: if two comments share the same cleaned text, a plain
# join emits one output row per match and silently multiplies the pair. Keep the
# first occurrence of each text so every pair stays exactly one row.
# NOTE(review): matching on text cannot tell identical texts apart; looking the
# score up by row id before the ids are replaced with text would be exact.
df_pair = df_pair.join(
    df.drop_duplicates(subset='txt').set_index('txt'),
    on=['toxic1'],
    how='left',
)

In [23]:
# Drop the joined comment id and label the joined score as belonging to the first comment.
df_pair = (
    df_pair
    .drop(columns=['comment_id'])
    .rename(columns={"offensiveness_score": "toxic_score1"})
)

In [24]:
df_pair.head()

Unnamed: 0,score,toxic1,toxic2,toxic_score1
5,0.931675,Banning guns and explosives doesn t make a sta...,Banning guns and explosives doesn t make a sta...,0.188
6,0.925854,I m saying it hasn t been researched and yet t...,I m saying it hasn t been researched and yet t...,0.042
7,0.921875,what what the fuck did i just read,The fuck did I just read,0.667
8,0.918623,Leading Britain off a cliff and cheating their...,It s almost as if these leave voting money gru...,0.646
9,0.912016,Bulgaria Croatia Czech Republic Denmark Hungar...,Bulgaria Croatia Czech Republic Denmark Hungar...,-0.146


In [25]:
# Attach the Ruddit columns of the second comment by matching on its cleaned text.
# The join key must be unique: if two comments share the same cleaned text, a plain
# join emits one output row per match and silently multiplies the pair. Keep the
# first occurrence of each text so every pair stays exactly one row.
# NOTE(review): matching on text cannot tell identical texts apart; looking the
# score up by row id before the ids are replaced with text would be exact.
df_pair = df_pair.join(
    df.drop_duplicates(subset='txt').set_index('txt'),
    on=['toxic2'],
    how='left',
)

In [26]:
# Drop the joined comment id and label the joined score as belonging to the second comment.
df_pair = (
    df_pair
    .drop(columns=['comment_id'])
    .rename(columns={"offensiveness_score": "toxic_score2"})
)

In [27]:
df_pair.head()

Unnamed: 0,score,toxic1,toxic2,toxic_score1,toxic_score2
5,0.931675,Banning guns and explosives doesn t make a sta...,Banning guns and explosives doesn t make a sta...,0.188,0.292
6,0.925854,I m saying it hasn t been researched and yet t...,I m saying it hasn t been researched and yet t...,0.042,0.277
7,0.921875,what what the fuck did i just read,The fuck did I just read,0.667,0.75
8,0.918623,Leading Britain off a cliff and cheating their...,It s almost as if these leave voting money gru...,0.646,0.792
9,0.912016,Bulgaria Croatia Czech Republic Denmark Hungar...,Bulgaria Croatia Czech Republic Denmark Hungar...,-0.146,-0.292


In [28]:
df_pair.shape

(1995, 5)

In [29]:
# The similarity score has served its purpose (filtering); remove the column.
df_pair = df_pair.drop(columns=['score'])

In [30]:
df_pair.head()

Unnamed: 0,toxic1,toxic2,toxic_score1,toxic_score2
5,Banning guns and explosives doesn t make a sta...,Banning guns and explosives doesn t make a sta...,0.188,0.292
6,I m saying it hasn t been researched and yet t...,I m saying it hasn t been researched and yet t...,0.042,0.277
7,what what the fuck did i just read,The fuck did I just read,0.667,0.75
8,Leading Britain off a cliff and cheating their...,It s almost as if these leave voting money gru...,0.646,0.792
9,Bulgaria Croatia Czech Republic Denmark Hungar...,Bulgaria Croatia Czech Republic Denmark Hungar...,-0.146,-0.292


In [31]:
# Keep only the pairs whose two offensiveness scores differ.
df_pair = df_pair.loc[df_pair['toxic_score1'].ne(df_pair['toxic_score2'])]

In [32]:
# Keep a single row per (toxic1, toxic2) text combination; with pandas' default
# keep='first' the earliest row of each duplicate group is the one retained.
df_pair = df_pair.drop_duplicates(subset=['toxic1', 'toxic2'])

In [33]:
df_pair.shape

(1950, 4)

In [34]:
# Order every pair so that toxic1 / toxic_score1 hold the comment with the higher score.
# Each column is selected separately: feeding tuples of mixed text/float Series to a
# single np.where builds one object array and leaves the score columns as object dtype.
# Rebinding through assign() also avoids writing into a filtered slice of a DataFrame.
first_is_more_toxic = (df_pair['toxic_score1'] > df_pair['toxic_score2']).to_numpy()
df_pair = df_pair.assign(
    toxic1=np.where(first_is_more_toxic, df_pair['toxic1'], df_pair['toxic2']),
    toxic2=np.where(first_is_more_toxic, df_pair['toxic2'], df_pair['toxic1']),
    toxic_score1=np.where(first_is_more_toxic, df_pair['toxic_score1'], df_pair['toxic_score2']),
    toxic_score2=np.where(first_is_more_toxic, df_pair['toxic_score2'], df_pair['toxic_score1']),
)

In [35]:
df_pair.head()

Unnamed: 0,toxic1,toxic2,toxic_score1,toxic_score2
5,Banning guns and explosives doesn t make a sta...,Banning guns and explosives doesn t make a sta...,0.292,0.188
6,I m saying it hasn t been researched and yet t...,I m saying it hasn t been researched and yet t...,0.277,0.042
7,The fuck did I just read,what what the fuck did i just read,0.75,0.667
8,It s almost as if these leave voting money gru...,Leading Britain off a cliff and cheating their...,0.792,0.646
9,Bulgaria Croatia Czech Republic Denmark Hungar...,Bulgaria Croatia Czech Republic Denmark Hungar...,-0.146,-0.292


In [36]:
# The pairs are already ordered, so the scores can go; name the two text columns by role.
df_pair = (
    df_pair
    .drop(columns=['toxic_score1', 'toxic_score2'])
    .rename(columns={"toxic1": "more_toxic", "toxic2": "less_toxic"})
)

In [37]:
# Dump the (more_toxic, less_toxic) pairs to a CSV in the working directory;
# index=False keeps the DataFrame index out of the file.
df_pair.to_csv('ruddit_pairs.csv', index = False)