In [1]:
!pip install --quiet keybert
print('pip installed keybert')

pip installed keybert


In [2]:
import pandas as pd
from datetime import datetime
from time import mktime

def short_review(arg: str) -> str:
    pieces = arg.split()
    return ' '.join(pieces[:10]) + '...'

# https://stackoverflow.com/a/6451892
def since(date):
    return mktime(date.timetuple())

def year_fraction(date) -> float:
    this_year = datetime(year=date.year, month=1, day=1)
    next_year = datetime(year=1 + date.year, month=1, day=1)
    return date.year + (since(date) - since(this_year))/(since(next_year) - since(this_year))

filename = '/kaggle/input/cyberpunk-2077-steam-reviews/cyberpunk_2077_filtered.csv'
usecols = ['language', 'review', 'updated']
df = pd.read_csv(filepath_or_buffer=filename, parse_dates=['updated'], usecols=usecols)
# let's only look at reviews in English 
df = df[df['language'] == 'english']
# we need to be mindful of review length
df['token count'] = df['review'].str.split().str.len()
df = df[df['token count'] > 3]
# we want to be able to get time slices
df['year'] = df['updated'].apply(func=year_fraction) 
df = df.drop(columns=['language'])
df['short review'] = df['review'].apply(short_review)
df.head()

Unnamed: 0,review,updated,token count,year,short review
0,It's very fun. I don't usually like open world...,2023-12-13,39,2023.947945,It's very fun. I don't usually like open world...
6,Coming back to try the game after 2.0 came out...,2023-12-13,77,2023.947945,Coming back to try the game after 2.0 came out...
10,i dont even own this fucking game why can i wr...,2023-12-13,13,2023.947945,i dont even own this fucking game why can i...
11,Todo valio la pena al final con el mejor endin...,2023-12-13,49,2023.947945,Todo valio la pena al final con el mejor endin...
12,I am a sneaky boi and I stab people with arm s...,2023-12-13,13,2023.947945,I am a sneaky boi and I stab people with...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 227338 entries, 0 to 612379
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   review        227338 non-null  object        
 1   updated       227338 non-null  datetime64[ns]
 2   token count   227338 non-null  int64         
 3   year          227338 non-null  float64       
 4   short review  227338 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 10.4+ MB


In [4]:
from plotly.express import histogram
histogram(data_frame=df, x='updated', log_y=True).show()

Those spikes are really something; we might expect them to correlate with events like updates or minor releases, maybe?

In [5]:
histogram(data_frame=df, x='token count', log_y=True).show()

Our BERT models expect sequences of 128 subwords by default; review length measured in tokens may be an issue we need to deal with.

In [6]:
sample_df = df.sort_values(ascending=False, by='year').head(n=10000)
sample_df.shape

(10000, 5)

In [7]:
# use KeyBERT to tag using the top keyword
from arrow import now
from keybert import KeyBERT

MODEL = 'all-MiniLM-L12-v2'
model = KeyBERT(model=MODEL)

# model.max_seq_length = 256 + 128

model_start = now()
document_embeddings, word_embeddings = model.extract_embeddings(docs=sample_df['review'].values,)
print('embedding time: {}'.format(now() - model_start))
keywords = model.extract_keywords(docs=sample_df['review'].values, top_n=1)
print('model time: {}'.format(now() - model_start))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:05:31.942136
model time: 0:11:14.061065


In [8]:
TOP_N = 26 # we have 26 colors so let's use 26 tags
# we are folding together some similar tags somewhat arbitrarily
RESOLVE = {'buggy' : 'bug', 'crashes': 'crash', 'games': 'game'}
sample_df['top keyword'] = [item[0][0] if len(item) else 'unknown' for item in keywords]
sample_df['tag'] = sample_df['top keyword'].apply(func=lambda x: x if x not in RESOLVE.keys() else RESOLVE[x])
# all of the keywords out of the top N get retagged as unknown
sample_df['tag'] = sample_df['tag'].apply(func=lambda x: 'unknown' if x not in sample_df['tag'].value_counts()[:TOP_N].index.tolist() else x)
# how much of the corpus did we tag?
print(sample_df['tag'].value_counts(normalize=True))
print(sorted(sample_df['tag'].unique().tolist()))

tag
unknown      0.5486
game         0.0913
cyberpunk    0.0845
dlc          0.0300
phantom      0.0272
cdpr         0.0263
gameplay     0.0180
bugs         0.0163
launch       0.0155
witcher      0.0135
crash        0.0123
rpg          0.0119
gta          0.0095
bug          0.0088
night        0.0088
release      0.0076
patch        0.0074
glitches     0.0071
immersive    0.0067
review       0.0066
keanu        0.0065
hours        0.0064
update       0.0059
panam        0.0059
starfield    0.0059
samurai      0.0059
graphics     0.0056
Name: proportion, dtype: float64
['bug', 'bugs', 'cdpr', 'crash', 'cyberpunk', 'dlc', 'game', 'gameplay', 'glitches', 'graphics', 'gta', 'hours', 'immersive', 'keanu', 'launch', 'night', 'panam', 'patch', 'phantom', 'release', 'review', 'rpg', 'samurai', 'starfield', 'unknown', 'update', 'witcher']


In [9]:
from plotly.colors import qualitative
from plotly.express import scatter
from umap import UMAP

umap_model = UMAP(n_components=2, random_state=2023, verbose=True, n_jobs=1)
sample_df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
scatter(data_frame=sample_df, x='u0', y='u1', hover_name='short review',
        hover_data = ['top keyword'],
        height=900, color='tag', color_discrete_sequence=qualitative.Alphabet).show()

UMAP(n_jobs=1, random_state=2023, verbose=True)
Thu Jan  4 16:37:52 2024 Construct fuzzy simplicial set
Thu Jan  4 16:37:52 2024 Finding Nearest Neighbors
Thu Jan  4 16:37:52 2024 Building RP forest with 10 trees
Thu Jan  4 16:37:58 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	 6  /  13
	Stopping threshold met -- exiting after 6 iterations
Thu Jan  4 16:38:16 2024 Finished Nearest Neighbor Search
Thu Jan  4 16:38:19 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Jan  4 16:38:39 2024 Finished embedding
