In [1]:
import numpy as np 
import pandas as pd
import os

import pandas as pd
from sklearn.model_selection import KFold
from itertools import combinations

import gc
import torch

#Text Cleaning
from bs4 import BeautifulSoup
import re 

from sentence_transformers import SentenceTransformer, util

In [2]:
gc.collect()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_max_memory_allocated()
torch.cuda.synchronize()
torch.cuda.empty_cache()

# Data

Create all pairs for the Ruddit dataset - we preserve the original scores, so that a difference (pointing which one is more toxic) can be adjusted at modeling time. 

In [3]:
SEED = 2021

In [4]:
def get_classification_dataset_as(task="binary", regression_weights=None):
    """
    Args:
        task: ['binary', 'regression']
        regression_weights: dictionary {label => weights} for each label or None, if task=='regression'
        
    """
    assert task in ['binary', 'regression']
    
    df = pd.read_csv('input/jigsaw-toxic-comment-classification-challenge/classification-dataset.csv')

    
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    if task == 'binary':    
        df['y'] = (df[labels].sum(axis=1) > 0).astype(int)
        df = df.drop(labels, axis=1)
    elif task == 'regression':
        if regression_weights is None:
            df['y'] = df[labels].sum(axis=1)
        else:
            weighed_columns = [regression_weights.get(l, 1) * df[l] for l in labels]
            df['y'] = pd.concat(weighed_columns, axis=1).sum(axis=1)
        df = df.drop(labels, axis=1)
    return df

In [5]:
df = get_classification_dataset_as('regression', regression_weights={'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5})

In [6]:
df.head()

Unnamed: 0,id,comment_text,y
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0


In [7]:
def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text

In [8]:
df.comment_text = df.comment_text.apply(text_cleaning)

In [9]:
df.shape

(223549, 3)

In [10]:
model = SentenceTransformer("paraphrase-mpnet-base-v2")

In [11]:
sentences = df.comment_text

In [12]:
paraphrases = util.paraphrase_mining(model, sentences, show_progress_bar = True, query_chunk_size = 10000, corpus_chunk_size=90000, max_pairs = 150000, top_k=5)

Batches:   0%|          | 0/6986 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 3.35 GiB (GPU 0; 7.79 GiB total capacity; 4.69 GiB already allocated; 758.62 MiB free; 4.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
len(paraphrases)

In [None]:
paraphrases

In [None]:
df_pair = pd.DataFrame(paraphrases, columns=['score', 'toxic1', 'toxic2'])

In [None]:
df_pair.shape

In [None]:
df_pair.head()

In [None]:
df_pair['toxic1'] = df_pair.apply(lambda row: sentences[row['toxic1']] , axis=1)
df_pair['toxic2'] = df_pair.apply(lambda row: sentences[row['toxic2']] , axis=1)

In [None]:
df_pair = df_pair[df_pair.score < 0.95]

In [None]:
'''df.txt = df.txt.apply(str)
df_pair.txt = df_pair.txt.apply(str)
df_pair.toxic2 = df_pair.toxic2.apply(str)'''

In [None]:
df_pair.head()

In [None]:
df_pair.shape

In [None]:
df_pair.join(df.set_index('comment_text'), on=['toxic1'], how='left').shape

In [None]:
df_pair = df_pair.join(df.set_index('comment_text'), on=['toxic1'], how='left')

In [None]:
df_pair.drop(['id'], axis=1, inplace=True)
df_pair.rename(columns={"y": "toxic_score1"}, inplace=True)

In [None]:
df_pair.head()

In [None]:
df_pair.shape

In [None]:
df_pair = df_pair.join(df.set_index('comment_text'), on=['toxic2'], how='left')

In [None]:
df_pair.shape

In [None]:
df_pair.drop(['id'], axis=1, inplace=True)
df_pair.rename(columns={"y": "toxic_score2"}, inplace=True)

In [None]:
df_pair.head()

In [None]:
df_pair.drop(['score'], axis=1, inplace=True)

In [None]:
df_pair.head()

In [None]:
df_pair['toxic1'], df_pair['toxic2'], df_pair['toxic_score1'], df_pair['toxic_score2']=np.where(df_pair['toxic_score1'] > df_pair['toxic_score2'],(df_pair['toxic1'], df_pair['toxic2'], df_pair['toxic_score1'], df_pair['toxic_score2']),(df_pair['toxic2'],df_pair['toxic1'], df_pair['toxic_score2'], df_pair['toxic_score1']))

In [None]:
df_pair.head()

In [None]:
df_pair.drop(['toxic_score1', 'toxic_score2'], axis=1, inplace=True)
df_pair.rename(columns={"toxic1": "more_toxic", "toxic2": "less_toxic"})

In [None]:
# dump to file
df_pair.to_csv('jigsaw-toxic-commenr-classification-pair.csv', index = False)