In [28]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd

In [30]:
df = pd.read_csv("/content/hogwarts_legacy_reviews.csv")
df = df.drop(['Unnamed: 0'], axis=1)

In [31]:
df = df.head(100)

In [32]:
df.head()

Unnamed: 0,Playtime,Feedback,Review
0,16,Positive,Greattt Game!
1,26,Positive,9/10Fantastic experience. A true Wizarding Wor...
2,29,Positive,worth it
3,24,Positive,I've been waiting 84 YEARSSSSSSSS.The game is ...
4,7,Positive,very fun game (it is not transphobic at all)


In [33]:
example = df.iloc[80]['Review']

In [34]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [35]:
def polarity_scores_roberta(example):
  encoded_text = tokenizer(example, return_tensors='pt')
  output = model(**encoded_text)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)
  scores_dict = {
      #'id': df.index[df['Review'] == example][0],
      'roberta_neg' : scores[0],
      'roberta_neu' : scores[1],
      'roberta_pos' : scores[2],
      'Review' : example
  }
  return scores_dict

In [36]:
roberta_result = {}

for i, row in df.iterrows():
  try:
    text = row['Review']
    roberta_result[i] = polarity_scores_roberta(text)
  except RuntimeError:
    print(f'error at', i)

error at 30


In [37]:
roberta_result

{0: {'roberta_neg': 0.0026876328,
  'roberta_neu': 0.024787739,
  'roberta_pos': 0.9725246,
  'Review': 'Greattt Game!'},
 1: {'roberta_neg': 0.0027845881,
  'roberta_neu': 0.029467132,
  'roberta_pos': 0.96774834,
  'Review': '9/10Fantastic experience. A true Wizarding World experience. Play it and experience it, the castle alone is the worth it. And thats only a third of the game.'},
 2: {'roberta_neg': 0.11699291,
  'roberta_neu': 0.573539,
  'roberta_pos': 0.30946803,
  'Review': 'worth it'},
 3: {'roberta_neg': 0.008601023,
  'roberta_neu': 0.033361707,
  'roberta_pos': 0.95803726,
  'Review': "I've been waiting 84 YEARSSSSSSSS.The game is everything I could have hoped for and more."},
 4: {'roberta_neg': 0.009220093,
  'roberta_neu': 0.06757006,
  'roberta_pos': 0.92320985,
  'Review': 'very fun game (it is not transphobic at all)'},
 5: {'roberta_neg': 0.17814165,
  'roberta_neu': 0.34829932,
  'roberta_pos': 0.47355905,
  'Review': 'Better than expected! But bad optimization.'}

In [38]:
result_df = pd.DataFrame(roberta_result)

In [39]:
result_df = result_df.T
result_df.head()

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos,Review
0,0.002688,0.024788,0.972525,Greattt Game!
1,0.002785,0.029467,0.967748,9/10Fantastic experience. A true Wizarding Wor...
2,0.116993,0.573539,0.309468,worth it
3,0.008601,0.033362,0.958037,I've been waiting 84 YEARSSSSSSSS.The game is ...
4,0.00922,0.06757,0.92321,very fun game (it is not transphobic at all)


In [40]:
combined_dataframe = pd.merge(result_df, df, on='Review', how='inner')

In [41]:
combined_dataframe.head()

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos,Review,Playtime,Feedback
0,0.002688,0.024788,0.972525,Greattt Game!,16,Positive
1,0.002785,0.029467,0.967748,9/10Fantastic experience. A true Wizarding Wor...,26,Positive
2,0.116993,0.573539,0.309468,worth it,29,Positive
3,0.008601,0.033362,0.958037,I've been waiting 84 YEARSSSSSSSS.The game is ...,24,Positive
4,0.00922,0.06757,0.92321,very fun game (it is not transphobic at all),7,Positive
