In [3]:
import pandas as pd
from collections import Counter

def map_chunk(df_chunk, min_helpful=3):
    """Count helpful reviews per game within one chunk (the map step).

    Args:
        df_chunk: DataFrame containing at least the columns ``app_id``,
            ``app_name`` and ``votes_helpful``.
        min_helpful: Minimum ``votes_helpful`` value a review needs in order
            to be counted. Defaults to 3.

    Returns:
        A ``Counter`` mapping ``(app_id, app_name)`` to the number of rows
        of the chunk whose ``votes_helpful`` is at least ``min_helpful``.
    """
    # With nullable dtypes a missing vote count compares as NA; such reviews
    # are explicitly treated as "not helpful" instead of relying on how
    # boolean indexing handles NA in the mask.
    is_helpful = (df_chunk["votes_helpful"] >= min_helpful).fillna(False)
    df_filtered = df_chunk.loc[is_helpful, ["app_id", "app_name"]]

    # Zipping the two key columns avoids iterrows(), which materialises a
    # Series object for every single row. Keys are still inserted in
    # first-seen order, so tie ordering in later ranking is unchanged.
    return Counter(zip(df_filtered["app_id"], df_filtered["app_name"]))

def shuffle(mapped_counters):
    """Merge the per-chunk counters into one combined counter.

    Args:
        mapped_counters: Iterable of ``Counter`` objects, one per chunk.

    Returns:
        A new ``Counter`` in which the counts of equal keys are summed.
    """
    combined = Counter()
    # Counter.update adds counts to existing keys rather than replacing them.
    for partial in mapped_counters:
        combined.update(partial)
    return combined

def reduce(shuffeled_counter):
    """Rank the merged counts from most to least frequent.

    Args:
        shuffeled_counter: ``Counter`` holding the merged counts.

    Returns:
        A list of ``(key, count)`` pairs sorted by count in descending
        order; keys with equal counts keep their insertion order.
    """
    # A stable descending sort on the count keeps ties in insertion order.
    ranked = sorted(
        shuffeled_counter.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked

def map_reduce(file, chunk_size=200_000):
    """Count helpful reviews per game by streaming a CSV file in chunks.

    Each chunk is reduced to rows with a ``review_id`` that has not been
    seen in an earlier chunk (and is unique within the chunk), mapped with
    ``map_chunk``, and the partial results are merged with ``shuffle`` and
    ranked with ``reduce``.

    Args:
        file: Path (or file-like object) of the CSV file. It must provide
            the columns ``app_id``, ``app_name``, ``review_id`` and
            ``votes_helpful``.
        chunk_size: Number of CSV rows read per chunk.

    Returns:
        The value returned by ``reduce`` for the merged counters.
    """
    print(file)

    mapped = []
    # Every review_id processed so far; used to skip reviews that are
    # repeated across chunks. Grows with the number of distinct reviews.
    handled_reviews = set()

    # Using the chunked reader as a context manager guarantees the underlying
    # file handle is closed even if processing a chunk raises.
    with pd.read_csv(
        file,
        usecols=["app_id", "app_name", "review_id", "votes_helpful"],
        dtype={
            "app_id": "string",
            "app_name": "string",
            "review_id": "string",
            "votes_helpful": "Int64",
        },
        chunksize=chunk_size,
        engine="c",
    ) as chunks:
        for chunk in chunks:
            # Rows without a review_id cannot be de-duplicated; drop them.
            chunk = chunk.dropna(subset=["review_id"])

            is_new = ~chunk["review_id"].isin(handled_reviews)
            chunk_new = chunk.loc[is_new].drop_duplicates(subset=["review_id"])

            if chunk_new.empty:
                continue

            mapped.append(map_chunk(chunk_new))
            # review_id is already unique within chunk_new at this point.
            handled_reviews.update(chunk_new["review_id"])

    shuffled = shuffle(mapped)
    return reduce(shuffled)


# Driver: run the map-reduce pipeline over the Steam reviews dump and print
# one line per game in the order returned by map_reduce.
file = r"./data/steam_reviews.csv"
result = map_reduce(file)

print("Статистика полезных отзывов у игр")
for entry in result:
    # Each entry is ((app_id, app_name), helpful_count).
    (app_id, app_name), helpful_count = entry
    print(f"{app_name} ({app_id}) - {helpful_count}")

./data/steam_reviews.csv
Статистика полезных отзывов у игр
PLAYERUNKNOWN'S BATTLEGROUNDS (578080) - 75241
Grand Theft Auto V (271590) - 55543
No Man's Sky (275850) - 42819
PAYDAY 2 (218620) - 39084
Fallout 4 (377160) - 37719
ARK: Survival Evolved (346110) - 37594
Tom Clancy's Rainbow Six Siege (359550) - 35486
Rust (252490) - 28336
Dead by Daylight (381210) - 27428
The Witcher 3: Wild Hunt (292030) - 25383
Garry's Mod (4000) - 23572
Rocket League (252950) - 20632
Terraria (105600) - 20367
The Elder Scrolls V: Skyrim (72850) - 17589
DARK SOULS™ III (374320) - 15544
Euro Truck Simulator 2 (227300) - 14938
The Forest (242760) - 14567
NieR:Automata™ (524220) - 13947
Sid Meier's Civilization VI (289070) - 13728
The Elder Scrolls Online (306130) - 13465
Arma 3 (107410) - 13272
Monster Hunter: World (582010) - 13224
Stardew Valley (413150) - 13158
The Elder Scrolls V: Skyrim Special Edition (489830) - 13069
For Honor (304390) - 12632
Dying Light (239140) - 12379
Stellaris (281990) - 12339
Dar