In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [38]:
game_reviews = pd.read_csv("all_Steam_Reviews_cleaned.csv")
pos_reviews=pd.read_csv("positive_steam_reviews.csv")
neg_reviews=pd.read_csv("negative_steam_reviews.csv")

In [39]:
pos_reviews

Unnamed: 0,Game,Review,Recommended,Genre,cleaned_review
0,Company of Heroes 3,Here is my actual review. COH3 has the best Qo...,True,"Action, Strategy",actual review COH good qol franchise stop play...
1,Company of Heroes 3,I resisted for quite a while to buy this game ...,True,"Action, Strategy",resist buy game not expect game different CoH ...
2,Company of Heroes 3,Final Thoughts\nCompany of Heroes 3 is a stunn...,True,"Action, Strategy",final Thoughts Company Heroes stunning evoluti...
3,Company of Heroes 3,game has come a long way since release but can...,True,"Action, Strategy",game come long way release feel like cash grab...
4,Company of Heroes 3,A great RTS that is going to get better with e...,True,"Action, Strategy",great RTS go well major update time start play...
...,...,...,...,...,...
34040,Baldur's Gate 3,It's not just a game! It's a whole story that ...,True,"Adventure, RPG, Strategy",game story completely immerse primarily thank ...
34041,Baldur's Gate 3,It's the closest you're going to get to playin...,True,"Adventure, RPG, Strategy",close go play DnD group friend willing sit pla...
34042,Baldur's Gate 3,verry gooood gammeeee. I slept on it for to lo...,True,"Adventure, RPG, Strategy",verry gooood gammeeee sleep long finish ton co...
34043,Baldur's Gate 3,Funny wild magic sorceror Durge run made every...,True,"Adventure, RPG, Strategy",funny wild magic sorceror Durge run turn cat t...


In [40]:
# Define TF-IDF vectorizers
vectorizer_pos = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=5000)  # Unigrams & bigrams
vectorizer_neg = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=5000)  # Unigrams & bigrams

# Remove NaN values from the cleaned_review columns
pos_reviews_cleaned = pos_reviews["cleaned_review"].dropna()
neg_reviews_cleaned = neg_reviews["cleaned_review"].dropna()

# Fit and transform the positive reviews
tfidf_matrix_pos = vectorizer_pos.fit_transform(pos_reviews_cleaned)

# Fit and transform the negative reviews
tfidf_matrix_neg = vectorizer_neg.fit_transform(neg_reviews_cleaned)

# Get feature names
terms_pos = vectorizer_pos.get_feature_names_out()
terms_neg = vectorizer_neg.get_feature_names_out()

# Compute the mean TF-IDF scores per term
tfidf_sum_pos = tfidf_matrix_pos.mean(axis=0).A1
tfidf_sum_neg = tfidf_matrix_neg.mean(axis=0).A1

# Create DataFrames with terms and scores
tfidf_df_pos = pd.DataFrame({'term': terms_pos, 'tfidf_score': tfidf_sum_pos})
tfidf_df_neg = pd.DataFrame({'term': terms_neg, 'tfidf_score': tfidf_sum_neg})

# Sort by importance (highest TF-IDF scores)
tfidf_df_pos = tfidf_df_pos.sort_values(by="tfidf_score", ascending=False)
tfidf_df_neg = tfidf_df_neg.sort_values(by="tfidf_score", ascending=False)


In [41]:
# Define function to get top words per game
def get_top_words_per_game(reviews_df, vectorizer, top_n=10):
    top_words_per_game = {}

    for game in reviews_df["Game"].unique():
        # Filter reviews for the specific game
        game_reviews = reviews_df[reviews_df["Game"] == game]["cleaned_review"]

        if game_reviews.empty:
            continue  # Skip if no reviews

        # Apply TF-IDF for the specific game
        tfidf_matrix = vectorizer.fit_transform(game_reviews)
        terms = vectorizer.get_feature_names_out()

        # Compute mean TF-IDF scores for words
        row = tfidf_matrix.mean(axis=0).A1
        sorted_terms = [terms[idx] for idx in row.argsort()[::-1][:top_n]]

        # Store in dictionary
        top_words_per_game[game] = sorted_terms

    return top_words_per_game





In [43]:
# Remove NaN values from the cleaned_review columns
pos_reviews_cleaned = pos_reviews.dropna(subset=['cleaned_review'])
neg_reviews_cleaned = neg_reviews.dropna(subset=['cleaned_review'])

# Compute top words per game for positive & negative reviews
top_words_pos = get_top_words_per_game(pos_reviews_cleaned, vectorizer_pos)
top_words_neg = get_top_words_per_game(neg_reviews_cleaned, vectorizer_neg)

In [45]:
# Ensure both dictionaries have the same keys
games = set(top_words_pos.keys()).union(set(top_words_neg.keys()))

# Create lists for DataFrame
games_list = []
top_pos_words_list = []
top_neg_words_list = []

for game in games:
    games_list.append(game)
    top_pos_words_list.append(top_words_pos.get(game, []))
    top_neg_words_list.append(top_words_neg.get(game, []))

# Convert to DataFrame
df_top_words = pd.DataFrame({
    "Game": games_list,
    "Top_Positive_Words": top_pos_words_list,
    "Top_Negative_Words": top_neg_words_list
})

# Save to CSV
df_top_words.to_csv("game_top_tfidf_words.csv", index=False)


In [46]:
df_top_words

Unnamed: 0,Game,Top_Positive_Words,Top_Negative_Words
0,No Man's Sky,"[game, play, good, space, like, great, fun, lo...","[game, bug, like, feel, time, thing, new, base..."
1,The Binding of Isaac: Rebirth,"[game, good, play, fun, like, isaac, good game...","[game, like, time, item, good, play, shit, fee..."
2,Pathfinder: Kingmaker — Enhanced Plus Edition,"[game, good, enjoy, great, pathfinder, good ga...","[game, nah, play, like, pathfinder, fun, encou..."
3,MONSTER HUNTER RISE,"[good, game, monster, hunter, fun, yes, world,...","[play, game, bad, monster, fun, remove, mid, d..."
4,Artifact,"[valve murder, valve, murder great, murder, gr...",[]
...,...,...,...
92,DiRT Rally,"[game, rally, good, nice, rally game, old, car...",[]
93,The Sims™ 4,"[good, game, fun, love, sim, play, like, good ...","[game, dlc, sims, sim, play, buy, ea, pack, wa..."
94,Battlefield 4™,"[good, game, battlefield, good game, yes, grea...","[game, server, punkbuster, play, kick, bug, mu..."
95,Box Cat Bash,"[fun, play, love, love bough, game, game love,...",[]


As we can see the words have some like "game", "good", "play", "great", "bad" that doesnt really say nothing to us, therefore we have to add them to a stopword list to remove them.