In [1]:
!pip install --quiet keybert
print('pip install keybert complete')

pip install keybert complete


In [2]:
from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning)

In [3]:
import pandas as pd

# Location of the review export inside the Kaggle input directory.
GTAV = '/kaggle/input/grand-theft-auto-v/GTAV_Steam_Reviews.csv'

# Read the CSV with the review id as the index and both timestamp columns
# parsed as dates, then drop the two columns this notebook does not keep.
df = pd.read_csv(
    filepath_or_buffer=GTAV,
    parse_dates=['created', 'author_last_played'],
    index_col=['id'],
)
df = df.drop(columns=['language', 'written_during_early_access'])

# Length of each review in whitespace-separated tokens.
# NOTE(review): the column displays as float, presumably because some
# reviews are missing and yield NaN -- confirm.
df['token count'] = df['review'].str.split().str.len()
df.head()

Unnamed: 0_level_0,review,created,voted_up,votes_up,comment_count,steam_purchase,recieved_for_free,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,token count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
157337410,"Games good, But Rockstar Your a BILLION DOLLAR...",2024-02-01 16:00:22,True,0,0,True,False,0,9,9105,131,9105,2024-01-30 21:04:02,39.0
157337371,modders make it better and free everything,2024-02-01 15:59:57,True,0,0,True,True,0,1,2530,1728,2530,2024-02-01 16:01:41,7.0
157337210,great game,2024-02-01 15:57:48,True,0,0,False,False,0,1,281457,4523,281457,2024-02-01 00:48:38,2.0
157336468,best,2024-02-01 15:47:51,True,0,0,True,False,34,3,2842,431,2816,2024-02-01 15:50:12,1.0
157335380,sed,2024-02-01 15:32:26,True,0,0,True,False,11,2,2055,79,2021,2024-02-01 16:12:38,1.0


If we throw away short reviews (ten tokens or fewer), how much data do we have left?

In [4]:
# (rows, columns) of the subset whose reviews have more than ten tokens.
df.loc[df['token count'].gt(10)].shape

(10680, 14)

What does the time series look like? Are all of the reviews bunched up into a few days?

In [5]:
from plotly.express import histogram

# Distribution of creation timestamps for reviews longer than ten tokens.
histogram(
    data_frame=df.loc[df['token count'].gt(10)],
    x='created',
)

We have a bimodal distribution, more or less, so there's a good chance not all of the reviews will be talking about the same thing.

We need to cut our dataset down to reviews that are long enough to capture their sentiment. We've also identified 163 reviews that meet our length criterion but break our sentiment model, so we're going to drop those too.

In [6]:
import pandas as pd
from arrow import now
from transformers import pipeline

EMOTION = 'bhadresh-savani/distilbert-base-uncased-emotion'

# Ids of the 163 reviews that pass the length filter but are removed
# before the sentiment model is run.
UNSCORABLE_IDS = [
    157176411, 157008817, 156818194, 156716587, 156658710, 156639546, 156590645, 156429244, 156427962, 156393027,
    156267254, 156138573, 156117565, 156012075, 155873125, 155846468, 155522133, 155521709, 155121035, 155093561,
    154499400, 154487011, 154470938, 154198922, 154152700, 154068646, 153858013, 153842921, 153836037, 153825974,
    153776712, 153653768, 153630030, 153562353, 153540929, 153504770, 153442207, 153252351, 153085697, 153038207,
    152795763, 152350894, 152030996, 151998130, 151964455, 151735360, 150533300, 150491242, 150402060, 150383319,
    150186592, 150092175, 149890740, 149787616, 149787549, 149780595, 149520621, 149134799, 149004400, 148872458,
    148821599, 148772246, 148580390, 148514501, 148493048, 148491827, 148199074, 147899731, 147860402, 147801990,
    147785368, 147717356, 147635548, 147516331, 147488085, 147473810, 147290512, 147241897, 147161507, 146975033,
    146812946, 146810848, 146486115, 146460833, 146393731, 146335484, 146229968, 146145237, 146080741, 146069885,
    145710265, 145668492, 145595360, 145570865, 145137479, 145017458, 144973412, 144973220, 144876317, 144673818,
    144666271, 144529017, 144368912, 144325205, 144258527, 144079277, 144073937, 144003131, 143961859, 143817030,
    143729379, 143372674, 143357380, 143308229, 143177962, 143044733, 142986284, 142930813, 142898798, 142878894,
    142825606, 142818994, 142742797, 142654991, 142572485, 142522827, 142506071, 142431120, 142384949, 142384322,
    142346229, 142270511, 142218693, 142112986, 142098177, 142093027, 142077550, 142002976, 141925869, 141925067,
    141849764, 141810418, 141786630, 141672971, 141664071, 141637198, 141628276, 141541750, 141454727, 141441042,
    141391324, 141309035, 141305292, 141289481, 141274135, 141221337, 141217260, 141191473, 141124274, 141101669,
    140941522, 140902705, 140831897,
]

# Keep reviews with more than ten tokens, then remove the listed ids by index label.
sentiment_df = df[df['token count'] > 10].drop(labels=UNSCORABLE_IDS, axis=0)
print('we have {} reviews for which we can determine sentiment'.format(len(sentiment_df)))
sentences = sentiment_df['review'].tolist()

# this takes about a minute per 1000-1200 sentences
# or in our case about twelve minutes
time_start = now()
pipe = pipeline(task='sentiment-analysis', model=EMOTION)
# One row per sentence from the pipeline; sum the scores within each label.
per_sentence = pd.DataFrame(pipe(sentences))
result_df = per_sentence.groupby(by='label').sum().reset_index()
print('done in {}'.format(now() - time_start))

2024-02-19 15:27:24.829116: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 15:27:24.829342: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 15:27:25.014390: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


we have 10517 reviews for which we can determine sentiment


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]


TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()



tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

done in 0:10:59.020839


In [7]:
from plotly.express import line_polar

# Closed polar line chart of the summed score per emotion label,
# drawn on a logarithmic radial axis.
line_polar(
    data_frame=result_df,
    r='score',
    theta='label',
    line_close=True,
    log_r=True,
    title='GTA V reviews',
)

We know from past experience that this sentiment model tends to find joy in everything; the fact that we're seeing lots of anger and sadness is not encouraging.

Let's see if we can use a keyword model to find out what the reviewers are talking about.

In [8]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 2
MINILM = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
DOCS = sentiment_df['review'].values.tolist()

model_start = now()
model = KeyBERT(model=MINILM)

# Single-word candidates only; the same vectorizer instance is handed to
# both the embedding step and the keyword step.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
    max_df=MAX_DF,
)

# Compute document and word embeddings once so they can be passed into
# extract_keywords below.
document_embeddings, word_embeddings = model.extract_embeddings(
    docs=DOCS,
    vectorizer=vectorizer,
)
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))

# Ask for the single best keyword per document.
keywords = model.extract_keywords(
    docs=DOCS,
    top_n=1,
    stop_words=STOP_WORDS,
    vectorizer=vectorizer,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
    min_df=MIN_DF,
)
print('model time: {}'.format(now() - model_start))

# Each entry is indexed as [0][0] to pull out the keyword text; documents
# with an empty result get the '-none-' placeholder.
sentiment_df['keyword'] = [
    found[0][0] if found else '-none-'
    for found in keywords
]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding time: 0:05:04.295515
we have 10517 documents and 7206 words.
model time: 0:05:17.048432


In [9]:
import pandas as pd
from umap import UMAP

IGNORE = {'-none-', }


def _first_twenty_words(text):
    """Return text unchanged if it has at most 20 whitespace-separated
    tokens; otherwise return its first 20 tokens joined by single spaces
    with '...' appended."""
    tokens = text.split()
    if len(tokens) > 20:
        return ' '.join(tokens[:20]) + '...'
    return text


umap_start = now()
# Abbreviated review text, used as the hover label in the scatter plot.
sentiment_df['short text'] = sentiment_df['review'].apply(func=_first_twenty_words)

# Project the document embeddings to two dimensions with a fixed seed.
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
sentiment_df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:38.105046


In [10]:
# Number of distinct keywords, followed by the share of reviews held by
# each of the ten most frequent keywords.
(
    sentiment_df['keyword'].nunique(),
    sentiment_df['keyword'].value_counts(normalize=True).head(n=10),
)

(1687,
 keyword
 gta         0.187221
 game        0.103832
 rockstar    0.055719
 modders     0.013692
 mode        0.013502
 fun         0.012266
 play        0.011695
 hackers     0.011220
 steam       0.010554
 gta6        0.010269
 Name: proportion, dtype: float64)

In [11]:
# Bar chart of how many reviews carry each of the 40 most frequent keywords.
histogram(
    data_frame=sentiment_df['keyword'].value_counts().head(n=40).reset_index(),
    x='keyword',
    y='count',
)

Now we can look at the reviews tagged with the ten most common keywords in a scatter plot of their UMAP coordinates.

In [12]:
from plotly.express import scatter

# Restrict the plot to reviews whose keyword is one of the ten most frequent.
top_keywords = sentiment_df['keyword'].value_counts().head(n=10).index.tolist()
top_keyword_rows = sentiment_df[sentiment_df['keyword'].isin(top_keywords)]

# Scatter of the UMAP coordinates coloured by keyword; the axes carry no
# meaningful units, so tick labels and the axes themselves are hidden.
(
    scatter(
        data_frame=top_keyword_rows,
        x='u0',
        y='u1',
        hover_name='short text',
        color='keyword',
        height=900,
    )
    .update_xaxes(showticklabels=False)
    .update_yaxes(showticklabels=False)
    .update_xaxes(visible=False)
    .update_yaxes(visible=False)
)
