# Import Libraries

In [2]:
from sentence_transformers import SentenceTransformer
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import time
import pandas as pd
import numpy as np
import fastparquet

  from tqdm.autonotebook import tqdm, trange





# Convert Train Data's comment and parent_comment to BERT Embeddings

In [None]:
df = pd.read_csv('../Datasets/train-balanced-sarcasm.csv')
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [4]:
filtered_df = df[df['subreddit'].isin(['news', 'politics', 'worldnews'])]

In [5]:
filtered_df.loc[:, 'comment'] = filtered_df['comment'].apply(lambda x: str(x).lower() if x is not None else x)
filtered_df.loc[:, 'parent_comment'] = filtered_df['parent_comment'].apply(lambda x: str(x).lower() if x is not None else x)

In [None]:
filtered_df["seq_len"] = filtered_df["comment"].apply(lambda x: len(x.split(" ")))
np.percentile(filtered_df["seq_len"], 75)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["seq_len"] = filtered_df["comment"].apply(lambda x: len(x.split(" ")))


15.0

In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [28]:
def make_embeddings(text, tokenizer, model):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=15)
    
    # Run the input through BERT
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the token embeddings from the last hidden state
    token_embeddings = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

    # Get attention mask to identify padded tokens
    attention_mask = inputs['attention_mask']  # Shape: (batch_size, sequence_length)

    # Mask out the padding tokens (zero out embeddings for padding)
    attention_mask = attention_mask.unsqueeze(-1)  # Add a singleton dimension for broadcasting
    token_embeddings = token_embeddings * attention_mask  # Apply mask

    # Convert to numpy if needed (depending on further use, you can keep as tensor)
    return token_embeddings.numpy()

In [30]:
filtered_df["bert_comment"] = filtered_df['comment'].apply(lambda x: make_embeddings(x, tokenizer, model))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["bert_comment"] = filtered_df['comment'].apply(lambda x: make_embeddings(x, tokenizer, model))


In [None]:
filtered_df.to_pickle('../Datasets/bert_embeddings_no_pooling_train.pkl')

In [None]:
load_train_df_from_pickle = pd.read_pickle('bert_embeddings_no_pooling_train.pkl')

In [33]:
load_train_df_from_pickle.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,bert_comment,seq_len
0,0,nc and nh.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"yeah, i get that argument. at this point, i'd ...","[[[-0.26591256, -0.29882812, -0.040224716, -0....",3
10,0,i think a significant amount would be against ...,ThisIsNotKimJongUn,politics,92,92,0,2016-09,2016-09-20 17:53:52,i bet if that money was poured into college de...,"[[[0.075774364, 0.03500098, -0.04244519, -0.05...",15
17,0,because it's what really bothers him... and it...,kozmo1313,politics,15,-1,-1,2016-12,2016-12-26 20:10:45,he actually acts like a moody emo girl on twit...,"[[[0.03299582, 0.04939469, -0.08288911, -0.163...",12
22,0,conservatism as an ideology is for sure a reac...,MayorMcCheese59,politics,1,-1,-1,2016-12,2016-12-24 00:04:06,"i still doubt that ""all conservatives stand fo...","[[[-0.1579521, -0.00796949, -0.33657235, -0.09...",29
23,0,"maybe not control, but certainly that is evide...",SunTzu-,politics,1,-1,-1,2016-10,2016-10-13 20:48:14,today russian media tweeted out that wikileaks...,"[[[-0.093258426, -0.08815382, -0.058376268, -0...",10


In [None]:
test_df = pd.read_csv('../Datasets/test-balanced.csv', delimiter='\t', header = None)
test_df.columns = ['label', 'comment', 'author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment']
test_df = test_df[test_df['subreddit'].isin(['news', 'politics', 'worldnews'])]
test_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,Actually most of her supporters and sane peopl...,Quinnjester,politics,3,3,0,2016-09,1473569605,Hillary's Surrogotes Told to Blame Media for '...
6,0,"""Four Score and Seven Gropes Ago...""",Kanzisbuddy,politics,-1,-1,-1,2016-10,1477159141,Gettysburg Address: The First 100 Days Of A Tr...
9,0,"Yes, because making sure the party in power do...",rydan,politics,-1,-1,0,2016-09,1472954129,"He's already encouraged his supporters to ""obs..."
33,0,"Yes you WILL, democrats cave and compromise ev...",o0flatCircle0o,politics,1,-1,-1,2016-11,1479345713,New Top Judiciary Dem Warns Trump: We Won't Fo...
54,0,You would think that as much as everyone blame...,Old_Army90,politics,3,-1,-1,2016-11,1480531475,This is pure victim-blaming. After the Republi...


In [None]:
test_df.loc[:, 'comment'] = test_df['comment'].apply(lambda x: str(x).lower() if x is not None else x)
test_df.loc[:, 'parent_comment'] = test_df['parent_comment'].apply(lambda x: str(x).lower() if x is not None else x)

In [None]:
test_df.loc[:, "bert_comment"] = test_df['comment'].apply(make_embeddings)

In [None]:
test_df.loc[:, "bert_parent_comment"] = test_df['parent_comment'].apply(make_embeddings)

In [None]:
test_df.to_pickle('../Datasets/bert_embeddings_test.pkl')

In [None]:
load_test_df_from_pickle = pd.read_pickle('../Datasets/bert_embeddings_test.pkl')

In [None]:
load_test_df_from_pickle.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,bert_comment,bert_parent_comment
0,0,actually most of her supporters and sane peopl...,Quinnjester,politics,3,3,0,2016-09,1473569605,hillary's surrogotes told to blame media for '...,"[[0.2728637, -0.17651713, 0.13503198, 0.139170...","[[0.11118117, -0.12834927, 0.066489935, 0.0073..."
6,0,"""four score and seven gropes ago...""",Kanzisbuddy,politics,-1,-1,-1,2016-10,1477159141,gettysburg address: the first 100 days of a tr...,"[[0.03881643, -0.106122, 0.26482913, -0.057139...","[[-0.1293082, -0.0901207, 0.18084918, -0.03291..."
9,0,"yes, because making sure the party in power do...",rydan,politics,-1,-1,0,2016-09,1472954129,"he's already encouraged his supporters to ""obs...","[[-0.05180453, -0.12974288, 0.06564042, 0.1824...","[[0.15203327, -0.10598624, 0.24455981, 0.03325..."
33,0,"yes you will, democrats cave and compromise ev...",o0flatCircle0o,politics,1,-1,-1,2016-11,1479345713,new top judiciary dem warns trump: we won't fo...,"[[-0.10038751, -0.13003553, 0.2965448, 0.18923...","[[0.041260313, -0.3143371, 0.06217872, 0.21281..."
54,0,you would think that as much as everyone blame...,Old_Army90,politics,3,-1,-1,2016-11,1480531475,this is pure victim-blaming. after the republi...,"[[0.042104598, 0.12200161, -0.00376096, 0.1008...","[[-0.36502808, -0.2924202, -0.05986113, 0.1534..."
