In [1]:
!pip install ../input/sentence-transformers/sentence_transformers-2.1.0-py3-none-any.whl

Processing /kaggle/input/sentence-transformers/sentence_transformers-2.1.0-py3-none-any.whl
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.1.0


In [2]:
import numpy as np 
import pandas as pd
import os

#Text Cleaning
from bs4 import BeautifulSoup
import re 

from sentence_transformers import SentenceTransformer, util

In [3]:
# Load the Jigsaw comments that have to be scored (shown below with columns comment_id and text).
df_jigsaw = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [4]:
# Preview the first rows of the raw Jigsaw comments before any cleaning.
df_jigsaw.head()

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


In [5]:
# Load the Ruddit reference comments, keeping only the id, text and offensiveness columns.
df_ruddit = pd.read_csv('../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv')[['comment_id', 'txt', 'offensiveness_score']]
# Discard rows whose text is only a placeholder left behind by a deleted or removed comment.
df_ruddit = df_ruddit.loc[~df_ruddit.txt.isin(['[deleted]', '[removed]'])]

In [6]:
# Preview the first rows of the filtered Ruddit comments before any cleaning.
df_ruddit.head()

Unnamed: 0,comment_id,txt,offensiveness_score
0,cza1q49,> The difference in average earnings between m...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that's not what they're mea...,-0.083
5,cza2jj3,>Men and women are not payed less for the same...,-0.042


In [7]:
# Patterns are compiled once here instead of on every call of text_cleaning.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"  # very broad range, not only emoji
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations are applied in this order:
    1. Removes embedded URL links (http://, https:// and www. prefixes)
    2. Removes HTML tags (keeps only the text content)
    3. Removes emojis (deleted without leaving a space behind)
    4. Replaces every character that is not a letter a-z/A-Z or a digit with a space
    5. Collapses runs of spaces and strips leading/trailing spaces

    text - Text piece to be cleaned. Missing values (None or NaN) yield an
           empty string; any other non-string value is converted with str().

    Returns the cleaned string.
    '''
    # pandas represents empty CSV cells as float NaN; NaN is the only float
    # that is not equal to itself. Treat missing text as empty instead of
    # failing inside the regex call.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = _URL_PATTERN.sub('', text)

    # Parse as HTML and keep only the text nodes, which drops the tags.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = _EMOJI_PATTERN.sub('', text)

    # Punctuation (including apostrophes) becomes a space, so "that's" turns
    # into "that s". \d also accepts non-ASCII decimal digits.
    text = _NON_ALNUM_PATTERN.sub(' ', text)
    text = _MULTI_SPACE_PATTERN.sub(' ', text)

    return text.strip()

In [8]:
# Replace the raw text of both frames with its cleaned form, element by element.
df_jigsaw['text'] = df_jigsaw['text'].map(text_cleaning)
df_ruddit['txt'] = df_ruddit['txt'].map(text_cleaning)

In [9]:
# Preview the Jigsaw comments after text_cleaning has been applied to the text column.
df_jigsaw.head()

Unnamed: 0,comment_id,text
0,114890,Gjalexei you asked about whether there is an a...
1,732895,Looks like be have an abuser can you please lo...
2,1139051,I confess to having complete and apparently bl...
3,1434512,Freud s ideas are certainly much discussed tod...
4,2084821,It is not just you This is a laundry list of s...


In [10]:
# Preview the Ruddit comments after text_cleaning has been applied to the txt column.
df_ruddit.head()

Unnamed: 0,comment_id,txt,offensiveness_score
0,cza1q49,The difference in average earnings between men...,-0.083
1,cza1wdh,The myth is that the gap is entirely based on ...,-0.022
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that s not what they re mea...,-0.083
5,cza2jj3,Men and women are not payed less for the same ...,-0.042


In [11]:
# Offensiveness scores as a plain array, in the same row order as df_ruddit,
# so they can be looked up by position.
ru_toxic_score = df_ruddit.offensiveness_score.to_numpy()

In [12]:
# Show the extracted Ruddit offensiveness scores.
ru_toxic_score

array([-0.083, -0.022, -0.146, ..., -0.292,  0.333, -0.625])

In [13]:
# Load the sentence-embedding model from the local Kaggle input directory (no download).
model = SentenceTransformer('../input/paraphrase-mpnet-base-v2/paraphrase-mpnet-base-v2')

In [14]:
# Cleaned comment texts that will be embedded.
sentences_jigsaw = df_jigsaw['text']
# Rows were filtered out of df_ruddit earlier, so renumber from 0 to make
# positions and labels line up again.
sentences_ruddit = df_ruddit['txt'].reset_index(drop=True)

In [15]:
# Inspect the cleaned Jigsaw sentences that will be encoded.
sentences_jigsaw

0       Gjalexei you asked about whether there is an a...
1       Looks like be have an abuser can you please lo...
2       I confess to having complete and apparently bl...
3       Freud s ideas are certainly much discussed tod...
4       It is not just you This is a laundry list of s...
                              ...                        
7532                          Go away you annoying vandal
7533                                This user is a vandal
7534    Sorry to sound like a pain but one by followin...
7535    Well it s pretty fucking irrelevant now I m un...
7536    The team name is Great Britain and Northern Ir...
Name: text, Length: 7537, dtype: object

In [16]:
# Inspect the cleaned, re-indexed Ruddit sentences that will be encoded.
sentences_ruddit

0       The difference in average earnings between men...
1       The myth is that the gap is entirely based on ...
2       The assertion is that women get paid less for ...
3       You said in the OP that s not what they re mea...
4       Men and women are not payed less for the same ...
                              ...                        
5705    They should only censor things that talk badly...
5706    and one of them is a woman OH SHIT we better b...
5707                    how is this flared as US politics
5708    People in Hong Kong must decide if they are go...
5709    I know this is an old post but I saw him last ...
Name: txt, Length: 5710, dtype: object

In [17]:
# Show one full cleaned example from each dataset, separated by a blank line.
print(sentences_jigsaw[4], sentences_ruddit[4], sep="\n\n")

It is not just you This is a laundry list of stupid allegations scooped up from god knows where Probably two thirds of it has little basis in fact

Men and women are not payed less for the same job I don t think many people say this is the case though I think this is a misconception on the other side not something feminists have said Even if women have a lower average wage in all jobs than men I mean that s even bigger than individual cases This shows that women are in an inferior position more than if a woman were to get literally 30 cents less Because averages show the bigger picture Edit Changed 70 to 30


In [18]:
# Embed every cleaned Jigsaw comment; convert_to_tensor=True asks for a tensor
# result, which the following cell moves with .to('cuda').
embeddings_jigsaw = model.encode(sentences_jigsaw, convert_to_tensor=True)

Batches:   0%|          | 0/236 [00:00<?, ?it/s]

In [19]:
embeddings_jigsaw = embeddings_jigsaw.to('cuda')
# Store the normalized vectors under the name the search cell actually reads.
# Previously they went to an otherwise unused variable, so the search ran on
# un-normalized Jigsaw vectors against normalized Ruddit vectors.
# semantic_search's default score function is cosine similarity, so this
# should not change the ranking.
embeddings_jigsaw = util.normalize_embeddings(embeddings_jigsaw)
# Old name kept as an alias in case any other cell still refers to it.
embeddings = embeddings_jigsaw

In [20]:
# Embed every cleaned Ruddit comment; convert_to_tensor=True asks for a tensor
# result, which the following cell moves with .to('cuda').
embeddings_ruddit = model.encode(sentences_ruddit, convert_to_tensor=True)

Batches:   0%|          | 0/179 [00:00<?, ?it/s]

In [21]:
# Move the Ruddit embeddings to the GPU and normalize them in one step.
embeddings_ruddit = util.normalize_embeddings(embeddings_ruddit.to('cuda'))

In [22]:
# Number of nearest Ruddit comments retrieved per Jigsaw comment and averaged in check().
top_K = 10

In [23]:
# Jigsaw embeddings are the queries and Ruddit embeddings the corpus. Each entry
# of hits is a list of {'corpus_id', 'score'} dicts (see the sample output below),
# limited to top_K per query.
hits = util.semantic_search(embeddings_jigsaw, embeddings_ruddit, top_k=top_K)

In [24]:
# Inspect the retrieved Ruddit neighbours of the first Jigsaw comment.
hits[0][0:10]

[{'corpus_id': 533, 'score': 0.5917389392852783},
 {'corpus_id': 3375, 'score': 0.5538870692253113},
 {'corpus_id': 3933, 'score': 0.5530230402946472},
 {'corpus_id': 5310, 'score': 0.5463874936103821},
 {'corpus_id': 845, 'score': 0.5418612360954285},
 {'corpus_id': 5318, 'score': 0.5384334325790405},
 {'corpus_id': 3158, 'score': 0.5310785174369812},
 {'corpus_id': 506, 'score': 0.521647572517395},
 {'corpus_id': 4959, 'score': 0.5050034523010254},
 {'corpus_id': 645, 'score': 0.5027069449424744}]

In [25]:
def check(idx, top_K, search_hits=None, scores=None):
    '''
    Average the reference scores of the nearest neighbours of one query.

    idx         - Position of the query in the search results.
    top_K       - Maximum number of neighbours to average over.
    search_hits - Per-query lists of dicts carrying a "corpus_id" key.
                  Defaults to the notebook-level `hits`.
    scores      - Sequence of reference scores indexed by corpus_id.
                  Defaults to the notebook-level `ru_toxic_score`.

    Returns the mean score of the neighbours actually available, so a query
    with fewer than top_K hits is averaged over what it has rather than
    raising IndexError. Raises ValueError when there is nothing to average.
    '''
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if search_hits is None:
        search_hits = hits
    if scores is None:
        scores = ru_toxic_score

    neighbours = search_hits[idx][:top_K]
    if not neighbours:
        raise ValueError("no neighbours to average for query %d" % idx)

    # Accumulate left to right from 0, like the original running sum did.
    total = sum(scores[hit["corpus_id"]] for hit in neighbours)
    return total / len(neighbours)

In [26]:
# One averaged neighbour score per Jigsaw comment, in row order.
n_comments = len(df_jigsaw)
sentence_sore = np.fromiter(
    (check(row, top_K) for row in range(n_comments)),
    dtype=float,
    count=n_comments,
)

In [27]:
# Work on a copy so that adding and dropping columns below leaves df_jigsaw untouched.
df = df_jigsaw.copy()

In [28]:
# Attach the averaged neighbour scores as the score column and preview the result.
df = df.assign(score=sentence_sore)
df.head()

Unnamed: 0,comment_id,text,score
0,114890,Gjalexei you asked about whether there is an a...,-0.0351
1,732895,Looks like be have an abuser can you please lo...,-0.0146
2,1139051,I confess to having complete and apparently bl...,0.0105
3,1434512,Freud s ideas are certainly much discussed tod...,-0.1157
4,2084821,It is not just you This is a laundry list of s...,0.2928


In [29]:
# Turn the raw scores into ranks; method='first' breaks ties by order of
# appearance, so every row receives a distinct rank.
df = df.assign(score=df['score'].rank(method='first'))
df.head()

Unnamed: 0,comment_id,text,score
0,114890,Gjalexei you asked about whether there is an a...,1197.0
1,732895,Looks like be have an abuser can you please lo...,1399.0
2,1139051,I confess to having complete and apparently bl...,1698.0
3,1434512,Freud s ideas are certainly much discussed tod...,575.0
4,2084821,It is not just you This is a laundry list of s...,5018.0


In [30]:
# Build the submission from a derived frame instead of dropping the column in
# place. The in-place drop made this cell fail with KeyError on a second run.
submission = df.drop(columns='text')
submission.to_csv("submission.csv", index=False)