In [1]:
!pip install --quiet keybert
print('pip install keybert complete')

pip install keybert complete


In [2]:
from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning)

In [3]:
import pandas as pd

GTAV = '/kaggle/input/grand-theft-auto-v/GTAV_Steam_Reviews.csv'

# Read the review dump indexed by review id, parse the two timestamp columns,
# and drop the two columns this notebook does not use.
df = (
    pd.read_csv(
        filepath_or_buffer=GTAV,
        parse_dates=['created', 'author_last_played'],
        index_col=['id'],
    )
    .drop(columns=['language', 'written_during_early_access'])
)

# Two length measures per review: whitespace-separated tokens and raw characters.
# NOTE(review): the counts display as floats, presumably because some reviews
# are missing and yield NaN -- confirm.
review_text = df['review']
df['token count'] = review_text.str.split().str.len()
df['char count'] = review_text.str.len()
df.head()

Unnamed: 0_level_0,review,created,voted_up,votes_up,comment_count,steam_purchase,recieved_for_free,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,token count,char count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
157337410,"Games good, But Rockstar Your a BILLION DOLLAR...",2024-02-01 16:00:22,True,0,0,True,False,0,9,9105,131,9105,2024-01-30 21:04:02,39.0,187.0
157337371,modders make it better and free everything,2024-02-01 15:59:57,True,0,0,True,True,0,1,2530,1728,2530,2024-02-01 16:01:41,7.0,42.0
157337210,great game,2024-02-01 15:57:48,True,0,0,False,False,0,1,281457,4523,281457,2024-02-01 00:48:38,2.0,10.0
157336468,best,2024-02-01 15:47:51,True,0,0,True,False,34,3,2842,431,2816,2024-02-01 15:50:12,1.0,4.0
157335380,sed,2024-02-01 15:32:26,True,0,0,True,False,11,2,2055,79,2021,2024-02-01 16:12:38,1.0,3.0


In [4]:
# Shape of the subset of reviews that have more than 10 tokens.
df.loc[df['token count'].gt(10)].shape

(10680, 15)

In [5]:
from plotly.express import histogram

# When were the reviews with more than 10 tokens written? The title and axis
# label let the figure stand on its own when the notebook is skimmed.
histogram(
    data_frame=df[df['token count'] > 10],
    x='created',
    title='Reviews with more than 10 tokens, by creation date',
    labels={'created': 'review created'},
)

In [6]:
# Distribution of review length in characters over the whole dataset; the
# y axis is log-scaled so that sparsely populated bins remain visible.
histogram(
    data_frame=df,
    x='char count',
    log_y=True,
    title='Review length in characters (log-scaled counts)',
    labels={'char count': 'characters per review'},
)

We need to cut our dataset down to reviews that are long enough to capture their sentiment but not so long that they exceed the input length our sentiment model can handle (512 tokens) and make it fail.

In [7]:
import pandas as pd
from arrow import now
from transformers import pipeline

MODEL = 'bhadresh-savani/distilbert-base-uncased-emotion'

# Restrict to reviews with more than 10 whitespace-separated tokens.
sentiment_df = df[df['token count'] > 10]
print('we have {} reviews for which we can determine sentiment'.format(len(sentiment_df)))
# Plain list of the review texts; not used below, kept in case other cells read it.
sentences = sentiment_df['review'].values.tolist()

# this takes about a minute per 1000-1200 sentences
time_start = now()
pipe = pipeline(task='sentiment-analysis', model=MODEL)

# Probe each review on its own and record the ids the model cannot process.
# Only the review column is iterated, so no per-row Series is built, and the
# prediction itself is discarded: this pass only looks for failures.
# NOTE(review): the RuntimeError is presumably raised for reviews that exceed
# the model's maximum sequence length -- confirm.
broken = []
for review_id, review_text in sentiment_df['review'].items():
    try:
        pipe(review_text)
    except RuntimeError:
        broken.append(review_id)

print(broken)
print('done in {}'.format(now() - time_start))

2024-02-19 14:37:24.791352: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 14:37:24.791512: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 14:37:24.961926: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


we have 10680 reviews for which we can determine sentiment


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]


TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()



tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (630 > 512). Running this sequence through the model will result in indexing errors


[157176411, 157008817, 156818194, 156716587, 156658710, 156639546, 156590645, 156429244, 156427962, 156393027, 156267254, 156138573, 156117565, 156012075, 155873125, 155846468, 155522133, 155521709, 155121035, 155093561, 154499400, 154487011, 154470938, 154198922, 154152700, 154068646, 153858013, 153842921, 153836037, 153825974, 153776712, 153653768, 153630030, 153562353, 153540929, 153504770, 153442207, 153252351, 153085697, 153038207, 152795763, 152350894, 152030996, 151998130, 151964455, 151735360, 150533300, 150491242, 150402060, 150383319, 150186592, 150092175, 149890740, 149787616, 149787549, 149780595, 149520621, 149134799, 149004400, 148872458, 148821599, 148772246, 148580390, 148514501, 148493048, 148491827, 148199074, 147899731, 147860402, 147801990, 147785368, 147717356, 147635548, 147516331, 147488085, 147473810, 147290512, 147241897, 147161507, 146975033, 146812946, 146810848, 146486115, 146460833, 146393731, 146335484, 146229968, 146145237, 146080741, 146069885, 145710265