In [27]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f'Device: {device}')

Device: cuda:0


In [10]:
# Load a pre-trained Llama 2 7B checkpoint (Hugging Face format, read from
# local disk) with a sequence-classification head, plus its matching tokenizer.
# num_labels=3 sizes the head for three classes. The load log reports
# `score.weight` as newly initialized, so the head is untrained at this point.
# NOTE(review): model_name is an absolute, user-specific path — consider making
# it configurable.
# Alternative checkpoint, currently disabled:
#mod = "mistralai/Mistral-7B-v0.1"
model_name = "/home/patrick.araujo/llama2/llama/llama-2-7b-hf"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████████████| 3/3 [01:59<00:00, 39.81s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /home/patrick.araujo/llama2/llama/llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Sample sentence used by the following cells to demonstrate the tokenizer.
text = "Hi. How are you?"

In [12]:
# Display the sample sentence as the cell output.
text

'Hi. How are you?'

In [13]:
# Tokenize the sample sentence and keep only its token ids (the "input_ids"
# entry of the tokenizer result).
encoded_text = tokenizer(text)["input_ids"]

In [14]:
# Display the token ids. The recorded output starts with id 1, which the
# decode cell below renders as "<s>".
encoded_text

[1, 6324, 29889, 1128, 526, 366, 29973]

In [15]:
# Round-trip check: turn the token ids back into text.
# The recorded output keeps the leading "<s>" marker because decode() is called
# without skip_special_tokens=True.
decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into text: ", decoded_text)

Decoded tokens back into text:  <s> Hi. How are you?


In [16]:
# Tokenize a batch: passing a list of texts yields one id list per text.
# No padding is requested, so the lists have different lengths
# (7, 5 and 2 ids in the recorded output).
list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])

Encoded several texts:  [[1, 6324, 29892, 920, 526, 366, 29973], [1, 306, 29915, 29885, 1781], [1, 3869]]


In [17]:
# Reuse the end-of-sequence token as the padding token, then pad every text in
# the batch to the length of the longest one (padding=True).
# In the recorded output the pad id (2) is inserted on the LEFT of the shorter
# texts.
# NOTE(review): the pad_token assignment changes the shared tokenizer object,
# so it stays in effect for every later cell.
tokenizer.pad_token = tokenizer.eos_token 
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

Using padding:  [[1, 6324, 29892, 920, 526, 366, 29973], [2, 2, 1, 306, 29915, 29885, 1781], [2, 2, 2, 2, 2, 1, 3869]]


In [18]:
# Cap each text at 3 token ids (max_length=3 with truncation=True).
# The recorded output keeps the first ids of each text, i.e. the right-hand
# side is cut off; texts that are already short enough are left unchanged.
encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

Using truncation:  [[1, 6324, 29892], [1, 306, 29915], [1, 3869]]


In [19]:
# Switch truncation to the left side: the recorded output keeps the leading
# id 1 plus the LAST two ids of each over-long text.
# NOTE(review): truncation_side is set on the tokenizer object itself, so it
# remains "left" for later cells (the padding+truncation cell that follows is
# also left-truncated in its recorded output).
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])

Using left-side truncation:  [[1, 366, 29973], [1, 29885, 1781], [1, 3869]]


In [20]:
# Combine both options: truncate to at most 3 ids, then pad the shorter texts
# up to the longest remaining one. In the recorded output the 2-id text gets
# one pad id (2) on the left, so all three lists have length 3.
encoded_texts_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

Using both padding and truncation:  [[1, 366, 29973], [1, 29885, 1781], [2, 1, 3869]]


In [21]:
# Load the training CSV into a DataFrame.
# NOTE(review): absolute, user-specific path — consider a configurable data
# directory so the notebook runs on other machines.
dataset = pd.read_csv('/home/patrick.araujo/llama2/datasets/balanced_output_train.csv')

In [22]:
# Display the frame (the recorded output abbreviates the middle rows with "...").
dataset

Unnamed: 0.1,level_0,index,Unnamed: 0,reviewId,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sentiment,lengthContent,Language
0,1007,1527,1527,7d6a4b7f-0f17-47c6-8010-3187dd1c86a7,I've been using my visa gift card. All the inf...,1,2,26.23.4.100,2023-12-28 21:20:56,,,26.23.4.100,0,224,en
1,8622,12941,12941,227c23e5-e178-4d69-bb99-3fe1445dc035,the PRIME PRICE IS WAY TO HIGH!!,5,0,26.19.2.100,2023-10-02 12:09:02,,,26.19.2.100,2,32,en
2,7774,11420,11420,bc748a49-bbd9-4773-a63f-f0950ad66310,easy to use and fast free delivery,5,0,26.20.0.100,2023-10-25 03:46:34,,,26.20.0.100,2,34,en
3,5478,8135,8135,0b2c1d90-3026-427a-bea9-918d1e067e8f,The Shopping On Amazon Is The Greatest Of All....,5,0,,2023-11-12 01:54:20,,,,2,111,en
4,13393,19072,19072,bee9613a-352d-4328-abaa-986316fe788d,I am rarely not satisfied seems I'm always sat...,4,0,24.22.0.100,2023-01-10 09:30:07,,,24.22.0.100,2,64,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9307,7444,10822,10822,e95c926d-6510-4b97-8ad7-7b11885a46a7,Amazon is always my go to for everything!!,5,0,26.21.0.100,2023-11-03 01:06:14,,,26.21.0.100,2,42,en
9308,17040,24320,24320,f26c2ae2-6868-49ab-9845-963ce24f14f8,This is a great app but I didn't get my neckla...,3,1,22.14.0.100,2022-06-01 13:35:57,,,22.14.0.100,1,240,en
9309,14542,20557,20557,cd102f54-3164-494c-8287-b2064c82401b,11-3-22- I have 107 S&S items. It is impossibl...,4,9,24.20.2.100,2022-11-03 11:59:23,,,24.20.2.100,2,320,en
9310,16186,22964,22964,d8372268-d77e-4666-a2dd-166decb97443,"App sucks! Glitchy, slow, hurts my eyes cuz im...",3,0,24.12.6.100,2022-07-21 14:35:03,,,24.12.6.100,1,231,en


In [23]:
# Report the frame size as (rows, columns).
dataset.shape

(9312, 15)

In [28]:
# Create train and validation datasets.
# Keep only the comment text ('content') and its label ('sentiment'), then hold
# out 20% of the rows for validation; random_state=42 fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified on 'sentiment' — confirm the class
# balance is acceptable in both parts.
comments = dataset[['content', 'sentiment']]
train_data, val_data = train_test_split(comments, test_size=0.2, random_state=42)

In [31]:
# Sanity-check the split sizes: 7449 training rows and 1863 validation rows in
# the recorded output, 9312 in total, each with the two selected columns.
print(train_data.shape)
print(val_data.shape)

(7449, 2)
(1863, 2)


In [50]:
def returnSentiment(sentiment):
    """Translate a numeric sentiment code into its text label.

    Mapping: 0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'.

    Args:
        sentiment: The sentiment code; compared with ``==`` against 0, 1, 2
            in that order.

    Returns:
        The matching label string, or None when the code is not 0, 1 or 2.
    """
    # The position of each label in the tuple is the code it stands for.
    for code, label in enumerate(('negative', 'neutral', 'positive')):
        if sentiment == code:
            return label
    return None

def generate_prompt(data_point):
    """Build the labelled training prompt for one comment.

    Args:
        data_point: Row-like object indexable by ``"content"`` (the comment
            text) and ``"sentiment"`` (the numeric code handed to
            ``returnSentiment``), e.g. a row passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        str: The instruction text followed by ``[<comment>] = <label>``.
    """
    comment = data_point["content"]
    label = returnSentiment(data_point["sentiment"])

    # The 12-space indent and the trailing spaces after "brackets," and
    # "answer as" are part of the prompt text, so they are spelled out here.
    indent = " " * 12
    template_lines = [
        "",
        indent + "Analyze the sentiment of the comment enclosed in square brackets, ",
        indent + "determine if it is positive, neutral, or negative, and return the answer as ",
        indent + 'the corresponding sentiment label "positive" or "neutral" or "negative".',
        "",
        indent + f"[{comment}] = {label}",
        indent,
    ]
    # strip() removes the leading newline/indent and the trailing newline/indent.
    return "\n".join(template_lines).strip()

In [51]:
def generate_test_prompt(data_point):
    """Build the inference-time prompt for one comment, leaving the label blank.

    Uses the same instruction text as ``generate_prompt`` but stops right
    after ``=`` so the model has to supply the sentiment label.

    Args:
        data_point: Row-like object (dict, pandas Series, ...) holding the
            comment under ``"text"`` or, failing that, ``"content"``.

    Returns:
        str: The prompt, ending in ``[<comment>] =``.

    Raises:
        KeyError: If neither ``"text"`` nor ``"content"`` is present.
    """
    # The frames built in this notebook store the comment under "content"
    # (the key generate_prompt reads), so looking up only "text" raised
    # KeyError on them. "text" is still tried first so existing callers that
    # pass a "text" field keep getting the same result.
    try:
        comment = data_point["text"]
    except KeyError:
        comment = data_point["content"]

    # The 12-space indent and the trailing spaces after "brackets," and
    # "answer as" are kept so this prompt matches the training prompt format.
    indent = " " * 12
    template_lines = [
        "",
        indent + "Analyze the sentiment of the comment enclosed in square brackets, ",
        indent + "determine if it is positive, neutral, or negative, and return the answer as ",
        indent + 'the corresponding sentiment label "positive" or "neutral" or "negative".',
        "",
        indent + f"[{comment}] = ",
    ]
    # strip() removes the leading newline/indent and the space after "=".
    return "\n".join(template_lines).strip()

In [52]:
# Build the training prompts: apply generate_prompt to every row (axis=1) and
# store the resulting strings in a one-column frame named 'content'.
X_train = pd.DataFrame(train_data.apply(generate_prompt, axis=1), columns=['content'])

In [53]:
# Inspect the first generated prompt. The recorded output shows that the
# template's source indentation is carried into the prompt text (every line
# after the first starts with 12 spaces).
print(X_train.iloc[0]['content'])

Analyze the sentiment of the comment enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [Usually love the app but lately if I try to use the baby registry and remove things from the registry the app freezes and I have to close it and start over. It's super annoying please fix it.] = neutral
