In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
import pandas as pd
import joblib
from transformers import BertTokenizer, BertModel

In [2]:
# Load the serialized channel classifier (presumably an SVM, per the file name);
# it is applied to the BERT review embeddings further down.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
channels_model = joblib.load("./models/channels_svm_model.pkl")

In [3]:
# Load the serialized issue classifier (presumably an SVM, per the file name);
# it is applied to the same BERT review embeddings as the channel classifier.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
issues_model = joblib.load("./models/issues_svm_model.pkl")

In [4]:
# Tokenizer for the sentiment model, loaded from the same hub checkpoint name
# that sentiment_model is built from.
sentiment_tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [8]:
# Sequence-classification model built from the pretrained hub checkpoint.
# Its weights are overwritten further down by load_state_dict from a local
# checkpoint file.
sentiment_model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [9]:
# Optimizer over the sentiment model's parameters. In the visible cells it is
# only used as a target for restoring the optimizer state saved in the
# checkpoint; no training step is run.
# NOTE(review): the `AdamW` exported by transformers is deprecated in favor of
# torch.optim.AdamW — confirm which library versions this notebook must support
# before switching, since the saved optimizer state was produced by this class.
optimizer = AdamW(sentiment_model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [10]:
# Restore model and optimizer state from a local checkpoint, mapping all
# tensors onto the CPU. The checkpoint is read as a mapping with
# 'model_state_dict' and 'optimizer_state_dict' entries.
# NOTE(review): torch.load unpickles the file — only load checkpoints that come
# from a trusted source.
output_file = "./models/model_bert_sentiment2.pth"
checkpoint = torch.load(output_file, map_location='cpu')
sentiment_model.load_state_dict(checkpoint['model_state_dict'])
# Presumably only needed if training is resumed; nothing in the visible cells
# steps the optimizer.
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [11]:
# print(sentiment_model)

In [13]:
def sentiment_score(review):
    """Return the predicted sentiment class of ``review`` as a 1-based integer.

    The text is encoded with ``sentiment_tokenizer``, passed through
    ``sentiment_model``, and the index of the largest logit is returned,
    shifted by one so the lowest class is 1.

    Args:
        review: Text to score.

    Returns:
        int: ``argmax`` over the model logits, plus one.
    """
    # Truncate at the token level. A character-based cut made by the caller
    # does not bound the token count: 512 single-character tokens plus the
    # special tokens already exceed the 512 positions a BERT-base model has.
    tokens = sentiment_tokenizer.encode(
        review, return_tensors='pt', truncation=True, max_length=512
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        result = sentiment_model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [14]:
# Initialize the BERT tokenizer and the bare BERT encoder (BertModel, no task
# head) that generate_bert_embeddings uses to turn review text into vectors.
classification_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
classification__model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Tokenize and generate BERT embeddings for your reviews
def generate_bert_embeddings(text):
    """Embed ``text`` by mean-pooling the last hidden state of the BERT encoder.

    Args:
        text: A single string, or a list of strings to embed as one batch.
            Inputs are truncated to at most 512 tokens.

    Returns:
        torch.Tensor: One row per input text (a single string yields one row),
        each row being the average of that text's token vectors.
    """
    inputs = classification_tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = classification__model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight every position by the attention mask so that padding added for
    # batched input does not dilute the average. For a single string the mask
    # is all ones and this equals a plain mean over the sequence dimension
    # (up to floating-point rounding).
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (hidden * mask).sum(dim=1) / token_counts  # Masked average pooling

In [16]:
# Load the reviews to classify. The displayed frame has an "Unnamed: 0" column,
# which suggests the workbook was saved with its index — NOTE(review): confirm
# whether that column should be dropped or read as the index.
df = pd.read_excel("./data/reviews_for_classification.xlsx")

In [17]:
df

Unnamed: 0,Customer,Country,Date,Rating,Title,Reviews
0,xxx xxx,GB,"Updated Apr 4, 2022",5,The best in all that matters,The best in all that matters! It's a great pla...
1,Javier Setovich,US,"Mar 8, 2022",5,Celsius Network ROCKS!,If you are looking for the best #HomeForCrypto...
2,Andrei Franco,VN,"Mar 23, 2022",1,I despise it so much,I despise it so much. Transferring to other wa...
3,Isai Garcia,US,"Feb 23, 2022",1,Worst customer service and worst…,Worst customer service and worst company to de...
4,Tyler McMurray,US,"Mar 22, 2022",5,Celsius is the BEST in Crypto,Celsius is the most transparent and responsive...
...,...,...,...,...,...,...
21961,Ujjval,US,"Dec 12, 2019",1,Worst service ever,Worst service ever. I was emailed to contact t...
21962,Corey,US,"Oct 15, 2019",5,Best banking service I ever had!,
21963,Pat,US,"Sep 25, 2019",5,Preferred bank for over 25+ year,PNC has been my banking partner for over 25+ y...
21964,Robin,US,"Feb 13, 2019",5,My only most trusted bank in 11 years,"My only most trusted bank in 11 years, please ..."


In [18]:
# Review body text column; iterated below to measure tokenized lengths.
reviews = df.Reviews

In [19]:
# Longest tokenized review, counting the `[CLS]` and `[SEP]` special tokens.
# Nothing is truncated here, so the tokenizer may warn about sequences that are
# longer than the model's maximum length.
max_len = max(
    (
        len(classification_tokenizer.encode(text, add_special_tokens=True))
        for text in reviews
    ),
    default=0,  # matches the initial value used when there are no reviews
)

print('Max review length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (1170 > 512). Running this sequence through the model will result in indexing errors


Max review length:  1170


In [20]:
# Embed every review one at a time; each element of the resulting Series is the
# tensor returned by generate_bert_embeddings for that row.
embeddings = df.Reviews.apply(generate_bert_embeddings)

In [21]:
embeddings

0        [[tensor(-0.0825), tensor(0.0907), tensor(0.40...
1        [[tensor(0.0407), tensor(0.0364), tensor(0.622...
2        [[tensor(0.1009), tensor(0.1393), tensor(0.376...
3        [[tensor(0.0776), tensor(0.0434), tensor(0.342...
4        [[tensor(-0.0458), tensor(0.0515), tensor(0.26...
                               ...                        
21961    [[tensor(0.2661), tensor(-0.0313), tensor(0.36...
21962    [[tensor(0.0632), tensor(0.0832), tensor(-0.20...
21963    [[tensor(0.1839), tensor(-0.0719), tensor(0.48...
21964    [[tensor(0.0011), tensor(-0.1408), tensor(0.38...
21965    [[tensor(-0.2000), tensor(-0.2570), tensor(0.2...
Name: Reviews, Length: 21966, dtype: object

In [22]:
# Stack the per-review embedding tensors along the first dimension and convert
# the result to a NumPy array for the classifiers' predict calls.
X = torch.cat(list(embeddings), dim=0).numpy()

In [23]:
# Predict a channel label for every review from its embedding row in X.
df["Channels"] = channels_model.predict(X)

In [24]:
# Predict an issue label for every review from its embedding row in X.
df["Issues"] = issues_model.predict(X)

In [25]:
# Score each review on its title plus body. The two parts are joined with a
# space so the last word of the title is not fused with the first word of the
# body, and missing cells are treated as empty text instead of turning the
# whole concatenation into NaN (which cannot be sliced). The 512-character
# slice is kept as a cheap cap on the text handed to sentiment_score.
df["Sentiment"] = (
    df['Title'].fillna('').astype(str) + ' ' + df['Reviews'].fillna('').astype(str)
).apply(lambda x: sentiment_score(x[:512]))

In [26]:
testReview = "The platform is decent. However, customer care and relations are awful! I got banned from their Twitter account for asking for help since I was locked out of my account. They banned me because I demanded urgent help on my support ticket since it was related to my account security. They decided to ban me so they can keep their dignity. They always say that they are doing it for the clients, but the moment someone speaks up they ban him so they can keep everything perfectly looking for the next victim. I managed to get into my account and I withdrew everything. I am not saying they are scammers, but they for sure don't know how to handle clients."

In [27]:
# Spot-check the scorer on the sample review, using the same 512-character
# cap that is applied to the DataFrame rows.
sentiment = sentiment_score(testReview[:512])

In [28]:
sentiment

1

In [29]:
df

Unnamed: 0,Customer,Country,Date,Rating,Title,Reviews,Channels,Issues,Sentiment
0,xxx xxx,GB,"Updated Apr 4, 2022",5,The best in all that matters,The best in all that matters! It's a great pla...,chatbot or live agent,0,5
1,Javier Setovich,US,"Mar 8, 2022",5,Celsius Network ROCKS!,If you are looking for the best #HomeForCrypto...,0,0,5
2,Andrei Franco,VN,"Mar 23, 2022",1,I despise it so much,I despise it so much. Transferring to other wa...,ib or mb,content-clarity,1
3,Isai Garcia,US,"Feb 23, 2022",1,Worst customer service and worst…,Worst customer service and worst company to de...,mobile banking,0,1
4,Tyler McMurray,US,"Mar 22, 2022",5,Celsius is the BEST in Crypto,Celsius is the most transparent and responsive...,ib or mb,0,5
...,...,...,...,...,...,...,...,...,...
21961,Ujjval,US,"Dec 12, 2019",1,Worst service ever,Worst service ever. I was emailed to contact t...,cctr,0,1
21962,Corey,US,"Oct 15, 2019",5,Best banking service I ever had!,,0,0,5
21963,Pat,US,"Sep 25, 2019",5,Preferred bank for over 25+ year,PNC has been my banking partner for over 25+ y...,0,0,5
21964,Robin,US,"Feb 13, 2019",5,My only most trusted bank in 11 years,"My only most trusted bank in 11 years, please ...",0,0,5


In [30]:
# Save the DataFrame, now including the Channels, Issues and Sentiment columns,
# to an Excel file in the current working directory.
df.to_excel('processed_reviews2.xlsx', index=False)  # index=False: do not write the row index as a column
