In [1]:
# pip install nltk transformers sentencepiece torch


In [2]:
# import pandas as pd
# from google_play_scraper import Sort, reviews

# result, continuation_token = reviews(
#     'com.nianticlabs.pokemongo',
#     lang='en', # defaults to 'en'
#     country='us', # defaults to 'us'
#     sort=Sort.NEWEST, # defaults to Sort.NEWEST
#     count=100, # defaults to 100
#     filter_score_with=None # defaults to None(means all score)
# )

# # If you pass `continuation_token` as an argument to the reviews function at this point,
# # it will continue crawling from where the previous call stopped (after the first 100 reviews here).

# result, _ = reviews(
#     'com.nianticlabs.pokemongo',
#     continuation_token=continuation_token # defaults to None(load from the beginning)
# )

# df = pd.DataFrame(result)

# df.to_csv("app_reviews.csv", index=False)

In [3]:
import warnings

import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the scraped reviews; the commented-out scraping cell above writes this CSV.
df = pd.read_csv('app_reviews.csv')

In [5]:
# Number of reviews per app version, least frequent first (sort_values is ascending by default).
df['appVersion'].value_counts().sort_values()

appVersion
0.289.0     1
0.215.1     1
0.311.0     1
0.199.0     1
0.163.2     1
0.235.0     1
0.367.2     1
0.313.1     1
0.261.1     1
0.377.1     1
0.293.1     1
0.359.0     1
0.219.1     1
0.385.1     3
0.385.2    52
Name: count, dtype: int64

In [6]:
# Number of reviews per `score` value, most frequent first.
df['score'].value_counts().sort_values(ascending=False)

score
1    41
5    29
4    10
3     8
2     6
Name: count, dtype: int64

In [11]:
# Keep only the columns needed for the analysis; the explicit copy makes
# df_reviews independent of the raw frame before columns are added to it.
columns = ["reviewId", "userName", "content", "score", "thumbsUpCount", "appVersion", "at"]
df_reviews = df.loc[:, columns].copy()

# Normalise review text: missing reviews become "" rather than the literal
# string "nan" that a bare astype(str) would produce.
df_reviews["content"] = df_reviews["content"].fillna("").astype(str)

# Drop empty/very short reviews (MIN_REVIEW_CHARS characters or fewer).
MIN_REVIEW_CHARS = 10
df_reviews = df_reviews[df_reviews["content"].str.len() > MIN_REVIEW_CHARS].copy()

# Basic features
df_reviews["review_len"] = df_reviews["content"].str.len()
# Parse the timestamp so review_date is a real datetime instead of a string
# copy of `at`; unparseable values become NaT rather than aborting the cell.
df_reviews["review_date"] = pd.to_datetime(df_reviews["at"], errors="coerce")

# VADER needs the `vader_lexicon` resource. Fetch it only when it is missing so
# the cell also runs on a fresh environment instead of raising LookupError.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon", quiet=True)
    sia = SentimentIntensityAnalyzer()

# VADER compound polarity score for each review.
df_reviews["sentiment"] = df_reviews["content"].apply(lambda x: sia.polarity_scores(x)["compound"])

df_reviews.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,appVersion,at,review_len,review_date,sentiment
0,8d4a0b00-b564-433d-9777-6ed1ac2e8688,RAMYA RS,Great motivation for some physical movement.,5,0,0.385.2,2025-11-15 09:52:40,44,2025-11-15 09:52:40,0.7579
1,45b12a15-40bf-42e6-ab6d-4aedcdbc163e,Aasim Ahmed,bekar hai mat khelo ise mera bhaiya kha ra ise...,1,1,,2025-11-15 09:51:37,81,2025-11-15 09:51:37,0.0
2,1a5898a4-ca5d-4333-8dc0-e44fab008fca,BhaviK,I am playing it since 2020 and now it is not i...,5,0,0.289.0,2025-11-15 09:49:11,105,2025-11-15 09:49:11,0.2732
3,5d80f96d-f88f-4afa-9e10-189c0336eb8c,Peter Frydenlund Reenberg,Terrible design choices all around. Just wants...,1,3,0.385.2,2025-11-15 09:35:10,70,2025-11-15 09:35:10,-0.4767
4,a0d00d8a-48d1-4289-91c3-0eebc3c16333,sunil kumar,very poor game online i download the game but ...,1,3,,2025-11-15 09:32:00,66,2025-11-15 09:32:00,-0.2952


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Sentiment classifier checkpoint; the tokenizer and the model are both loaded
# from the same `model_name`.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"  # stars 1–5
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_star(text):
    """Return the predicted star rating (1-5) for one review text.

    The text is tokenized (truncated to at most 256 tokens), passed through
    the module-level `model` without gradient tracking, and the index of the
    most probable class is shifted by one to give a 1-based rating.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    )
    with torch.no_grad():
        logits = model(**encoded).logits
        star_probs = torch.softmax(logits, dim=-1).flatten()
    # Class indices are 0-based while the labels are 1-5 stars.
    best_class = torch.argmax(star_probs)
    return int(best_class + 1)

# Demo run: score a reproducible 10-review sample rather than the full frame.
df_sample = (
    df_reviews
    .sample(n=10, random_state=42)
    .assign(score_bert=lambda sample: sample["content"].apply(predict_star))
)


In [14]:
# Show the sample so `score` and `score_bert` can be compared side by side.
df_sample

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,appVersion,at,review_len,review_date,sentiment,score_bert
40,e651b6c1-e63f-4a14-9712-ea52ba583608,MsTruedemon Night,Only thing that could make it better is if the...,2,0,0.385.2,2025-11-14 14:49:50,330,2025-11-14 14:49:50,0.2732,1
22,998bb733-83e3-4dc8-922e-798c084e04df,KJ T,"good, just lags,wish I could play without wifi...",4,2,0.385.2,2025-11-14 22:00:24,81,2025-11-14 22:00:24,0.513,4
55,f16fe544-1d1a-4a98-898c-9ceed97296d8,erik reinbolt,I've been playing since the first month as a m...,5,0,0.385.2,2025-11-14 10:51:31,331,2025-11-14 10:51:31,0.5052,2
72,383d3f65-6b5f-459c-88ea-90af86fba3df,BB,too addicting... saw a person almost get run o...,3,0,0.261.1,2025-11-13 23:03:56,86,2025-11-13 23:03:56,0.5229,2
0,8d4a0b00-b564-433d-9777-6ed1ac2e8688,RAMYA RS,Great motivation for some physical movement.,5,0,0.385.2,2025-11-15 09:52:40,44,2025-11-15 09:52:40,0.7579,5
26,2d6cc780-b621-43ab-811b-c1714cd292c9,Carlos Ochoa,"Keeps freezing when new events starts, wastes ...",1,0,0.385.2,2025-11-14 20:55:36,107,2025-11-14 20:55:36,-0.4545,1
39,216452bf-dea7-4b44-806b-326cdc10d408,Mary Fosu Baidoo,I Like the app but it takes a long time to dow...,4,0,,2025-11-14 16:07:35,55,2025-11-14 16:07:35,0.1901,4
67,a019a6bf-f2cb-4d2d-88ef-84eabb40ce03,Dariusz Nowak,Well it's incompatible with my device now,1,0,,2025-11-14 03:11:22,41,2025-11-14 03:11:22,0.2732,1
10,050f4628-79ef-440e-9800-3143c420f9c6,Sofia,"I love Pokémon, a lot, but this game keeps gli...",1,1,0.385.2,2025-11-15 05:39:08,369,2025-11-15 05:39:08,0.3111,1
44,cefaa10f-7db0-4115-97de-76dbc9df4577,Tudorescu George Bogdan,"worst game ever , bought 3 remote pass raid , ...",1,0,,2025-11-14 13:32:46,204,2025-11-14 13:32:46,-0.8316,1


In [None]:
# Display the reviews frame built above.
df_reviews

In [None]:
# Disable column-width truncation so the complete review text is displayed.
pd.set_option("display.max_colwidth", None)

# Text of every review whose score is below 5.
df_reviews.loc[df_reviews["score"] < 5, "content"]

In [15]:
from transformers import pipeline
import torch

# Summarization pipeline using the google/pegasus-xsum checkpoint.
summarizer = pipeline("summarization", model="google/pegasus-xsum")

def summarize(text):
    """Summarize one review with the module-level `summarizer` pipeline.

    Args:
        text: Review text to summarize.

    Returns:
        The generated summary string, or None when `text` is not a non-blank
        string or the summarizer fails on it.
    """
    # Nothing to summarize for missing/blank input; skip the model call.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        result = summarizer(text, max_length=30, min_length=10, do_sample=False)
        return result[0]['summary_text']
    except Exception as exc:
        # Best-effort: one bad review must not abort the whole column, but the
        # failure is reported instead of vanishing. Catching Exception (not a
        # bare except) keeps Ctrl-C / kernel interrupts working.
        warnings.warn(f"summarize() failed ({type(exc).__name__}: {exc}); returning None")
        return None

# Summarize every review; `summarize` is called once per row of `content`.
df_reviews["summary"] = df_reviews["content"].apply(summarize)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 30, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 30, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 30, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_

In [None]:
from openai import OpenAI
# NOTE(review): no API key is passed here, so the client presumably picks up
# credentials from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

reviews = df_reviews['content'].dropna().tolist()

# Group the reviews into batches for summarization. Slicing (rather than
# flushing only completely full batches) keeps the final, shorter batch, so no
# trailing reviews are dropped when the total is not a multiple of CHUNK_SIZE.
CHUNK_SIZE = 20  # summarize 20 reviews at a time
chunks = [reviews[start:start + CHUNK_SIZE] for start in range(0, len(reviews), CHUNK_SIZE)]

# One bullet-point summary per batch of reviews.
summaries = []

for batch in chunks:
    # Each review goes on its own line of the user message.
    text = "\n".join(batch)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Summarize user complaints in a few bullet points."},
            {"role": "user", "content": text}
        ]
    )
    # With the `OpenAI()` client the returned message is an object, not a dict:
    # `message["content"]` raises TypeError, so use attribute access.
    summaries.append(response.choices[0].message.content)
