In [10]:
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Notebook-wide configuration.
# Location of the CSV that is read into `df`, relative to the notebook.
PATH_TO_DATA = "../data/training_cleanded.csv"
# Checkpoint identifier handed to `from_pretrained` when the tokenizer is built.
MODEL_NAME = 'bert-base-uncased'

In [3]:
# Load the CSV found at PATH_TO_DATA into a DataFrame, then show it as the
# cell's output (pandas abbreviates long frames when rendering them).
df = pd.read_csv(PATH_TO_DATA)
df

Unnamed: 0,Id,Entity,Sentiment,Text,Word_count
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,11
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,13
...,...,...,...,...,...
71651,9200,Nvidia,Positive,Just realized that the Windows partition of my...,26
71652,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,24
71653,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,27
71654,9200,Nvidia,Positive,Just realized between the windows partition of...,32


In [4]:
# Strip leading/trailing whitespace from the column names and write the cleaned
# names back onto the frame. Without the assignment the stripped names only
# lived in `df_columns`, so later lookups such as df['Sentiment'] would still
# fail on a padded CSV header. Re-running this cell is harmless.
df_columns = df.columns.str.strip()
df.columns = df_columns
df_columns

Index(['Id', 'Entity', 'Sentiment', 'Text', 'Word_count'], dtype='object')

#### Some preprocessing

In [5]:
# Encode sentiments as integers
sentiment_mapping = {'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3}
# Only encode while the column still holds the raw string labels. Series.map
# yields NaN for every value that is not a key of the mapping, so re-running
# this cell on the already-encoded integers would silently erase every label.
if not pd.api.types.is_numeric_dtype(df['Sentiment']):
    df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)
# Preview the first rows to confirm the encoding.
df.head(4)

Unnamed: 0,Id,Entity,Sentiment,Text,Word_count
0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,11
1,2401,Borderlands,1,I am coming to the borders and I will kill you...,13
2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,11
3,2401,Borderlands,1,im coming on borderlands and i will murder you...,11


In [9]:
# Hold out 10% of the rows for validation. Stratifying on the encoded label
# column is meant to keep the class proportions alike in both parts, and the
# fixed seed makes the split repeatable.
train_data, val_data = train_test_split(
    df,
    test_size=0.1,
    stratify=df['Sentiment'],
    random_state=42,
)

In [4]:
# Instantiate the fast BERT tokenizer for the checkpoint named by MODEL_NAME.
tokenizer = BertTokenizerFast.from_pretrained(
    MODEL_NAME,
)

In [37]:
# Two sample texts used to exercise the tokenizer: one long tweet-like input
# and one short input.
test_input_1="im getting on borderlands and i will murder you all ,8 So I spent a few hours doing something for fun... If you don't know I'm a HUGE @ Borderlands fan and Maya is one of my favorite characters."
test_input_2='How are you doing zwp?'
inputs=[test_input_1, test_input_2]

# Whitespace-separated word counts, for comparison with the token counts the
# tokenizer produces.
print(len(test_input_1.split()))
print(len(test_input_2.split()))
# tokenize() works on a single string, not on a batch: the earlier call with
# the two-element list only ran because a pair of strings is accepted as a
# (text, text_pair) input. Tokenize the first text explicitly and preview its
# first ten tokens.
tokenizer.tokenize(test_input_1)[:10]

39
5


['im',
 'getting',
 'on',
 'border',
 '##lands',
 'and',
 'i',
 'will',
 'murder',
 'you']

In [35]:
# Largest value found in the Word_count column; shown as the cell output.
# NOTE(review): this value is later passed as the tokenizer's max_length.
# Word_count presumably counts words, while the tokenizer counts sub-word
# tokens plus special tokens (the 39-word sample above encodes to 52 ids) —
# confirm the two units are meant to be interchangeable.
max_tweet_length = df.loc[:, 'Word_count'].max()
max_tweet_length

198

In [30]:
# Encode both sample texts as one batch: pad to the longest sequence in the
# batch and truncate anything longer than max_length.
# max_length comes from the data, so it is cast to a plain int and capped at
# the tokenizer's own limit (tokenizer.model_max_length); a larger value would
# let truncation produce sequences the model cannot accept.
output=tokenizer(
    inputs,
    padding=True,
    truncation=True,
    max_length=min(int(max_tweet_length), tokenizer.model_max_length),
)
print(output)

{'input_ids': [[101, 10047, 2893, 2006, 3675, 8653, 1998, 1045, 2097, 4028, 2017, 2035, 1010, 1022, 2061, 1045, 2985, 1037, 2261, 2847, 2725, 2242, 2005, 4569, 1012, 1012, 1012, 2065, 2017, 2123, 1005, 1056, 2113, 1045, 1005, 1049, 1037, 4121, 1030, 3675, 8653, 5470, 1998, 9815, 2003, 2028, 1997, 2026, 5440, 3494, 1012, 102], [101, 2129, 2024, 2017, 2725, 1062, 2860, 2361, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [33]:
# Number of ids in the second encoded sequence (the short text, after padding).
len(output['input_ids'][1])

52

In [42]:
# Sanity check: turn the ids of the first encoded sequence back into text and
# display the result.
cross_check = tokenizer.decode(
    output.input_ids[0]
)
cross_check

"[CLS] im getting on borderlands and i will murder you all, 8 so i spent a few hours doing something for fun... if you don't know i'm a huge @ borderlands fan and maya is one of my favorite characters. [SEP]"