# Data load, clean and join

In [54]:
!pip install langid



In [55]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import langid
import time

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

In [56]:
games_df = pd.read_csv('games-reducido.csv', header=None, skiprows=1)
reviews_df = pd.read_csv('reviews-reducido.csv')

In [57]:
# Agrego columna "Unknown" ya que no viene nomenclada y genera un desfasaje en los indices de columnas
games_df_column_names = ['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU', 
                    'Required age', 'Price', 'Unknown', 'DiscountDLC count', 'About the game', 
                    'Supported languages', 'Full audio languages', 'Reviews', 'Header image', 
                    'Website', 'Support url', 'Support email', 'Windows', 'Mac', 
                    'Linux', 'Metacritic score', 'Metacritic url', 'User score', 
                    'Positive', 'Negative', 'Score rank', 'Achievements', 
                    'Recommendations', 'Notes', 'Average playtime forever', 
                    'Average playtime two weeks', 'Median playtime forever', 
                    'Median playtime two weeks', 'Developers', 'Publishers', 
                    'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies']
games_df.columns = games_df_column_names

In [58]:
games_df.shape

(4870, 40)

In [59]:
reviews_df.shape

(320855, 5)

In [60]:
games_df.dtypes

AppID                           int64
Name                           object
Release date                   object
Estimated owners               object
Peak CCU                        int64
Required age                    int64
Price                         float64
Unknown                         int64
DiscountDLC count               int64
About the game                 object
Supported languages            object
Full audio languages           object
Reviews                        object
Header image                   object
Website                        object
Support url                    object
Support email                  object
Windows                          bool
Mac                              bool
Linux                            bool
Metacritic score                int64
Metacritic url                 object
User score                      int64
Positive                        int64
Negative                        int64
Score rank                    float64
Achievements

In [61]:
reviews_df.dtypes

app_id           int64
app_name        object
review_text     object
review_score     int64
review_votes     int64
dtype: object

In [62]:
games_df.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,Unknown,DiscountDLC count,About the game,Supported languages,Full audio languages,Reviews,Header image,Website,Support url,Support email,Windows,Mac,Linux,Metacritic score,Metacritic url,User score,Positive,Negative,Score rank,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,1535090,VR Kyoto: Beauty of Japan,"Feb 16, 2021",0 - 20000,0,0,2.99,0,0,Immerse yourself in the world of Japanese beautiful places / towns for about 20 minutes. You wil...,"['English', 'French', 'Italian', 'German', 'Spanish - Spain']",[],,https://cdn.akamai.steamstatic.com/steam/apps/1535090/header.jpg?t=1658712885,,vr.oxbros.com,,True,False,False,0,,0,16,8,,0,0,,0,0,0,0,William at Oxford,William at Oxford,Single-player,Casual,"Casual,Walking Simulator,First-Person,VR,Singleplayer",https://cdn.akamai.steamstatic.com/steam/apps/1535090/ss_2ec0c7535563eede58ed4569a444dee750129bd...,http://cdn.akamai.steamstatic.com/steam/apps/256820968/movie_max.mp4?t=1658712884
1,745970,Terminal Conflict,"Sep 24, 2020",0 - 20000,1,0,29.99,0,4,Command history in this most definitive Cold War strategy game . Gain access to your retro termi...,['English'],['English'],"“It's engaging, well designed and made with a passion for the period. It delivers as one of the ...",https://cdn.akamai.steamstatic.com/steam/apps/745970/header.jpg?t=1648395677,http://terminalconflict.com,http://strategy-mill.com/contact.html,support@strategy-mill.com,True,True,True,0,,0,42,14,,42,0,,0,0,0,0,"Strategy Mill,BL-Logic,Scribble Pad Studios,Polywickstudio",Strategy Mill,"Single-player,Multi-player,PvP,Online PvP,Cross-Platform Multiplayer,Steam Achievements,Steam Wo...","Simulation,Strategy","Simulation,Strategy,Cold War,Political,Retro,Grand Strategy,Turn-Based Strategy,Board Game,Polit...",https://cdn.akamai.steamstatic.com/steam/apps/745970/ss_9cd69b483866f3f76aaf1f6893112e9b9b9ddae4...,http://cdn.akamai.steamstatic.com/steam/apps/256737282/movie_max.mp4?t=1543852819
2,1548270,Darkilson,"Feb 23, 2021",0 - 20000,0,0,0.99,0,0,"Due to a terrible curse, the BA Bear was locked down in an asylum for meek and hairy bears. Help...",['Portuguese - Brazil'],['Portuguese - Brazil'],,https://cdn.akamai.steamstatic.com/steam/apps/1548270/header.jpg?t=1614095231,https://www.facebook.com/KomachoGames,https://www.facebook.com/KomachoGames,komachogames@gmail.com,True,False,False,0,,0,10,1,,9,0,This game contains partial nudity and some curse words.,0,0,0,0,Komacho,Komacho,"Single-player,Steam Achievements,Full controller support","Casual,Indie","Memes,Singleplayer,Stealth,Dark,Horror,Surreal,2D,Cute,Platformer,Perma Death,Third Person,Psych...",https://cdn.akamai.steamstatic.com/steam/apps/1548270/ss_9a5b6333d8866661d9c07e05b440b86c330fe3d...,http://cdn.akamai.steamstatic.com/steam/apps/256821572/movie_max.mp4?t=1612900463
3,2944670,Rapta,"Jul 12, 2024",0 - 0,0,0,4.49,0,0,Introduction: Explore the Silicon Sneak system and use your efforts to rescue your kidnapped sis...,"['English', 'Portuguese - Brazil']",[],,https://shared.akamai.steamstatic.com/store_item_assets/steam/apps/2944670/header.jpg?t=1720795987,,,RaptaSteamGame@gmail.com,True,False,False,0,,0,0,0,,15,0,"The theme and situations present in the game involve kidnappings, system invasions, theft and mu...",0,0,0,0,"Felipai,Kamuii","Felipai,Kamuii","Single-player,Steam Achievements,Steam Cloud,Family Sharing","Casual,Indie,Simulation,Strategy,Early Access",,https://shared.akamai.steamstatic.com/store_item_assets/steam/apps/2944670/ss_8270639424b75b0903...,http://cdn.akamai.steamstatic.com/steam/apps/257033007/movie_max.mp4?t=1719600752
4,2260980,University Life Visual Novel,"Feb 23, 2023",0 - 0,0,0,13.99,0,0,Entertaining &amp; Engaging Story Welcome to University Life! Set in the vibrant town of Falls V...,['English'],['English'],,https://cdn.akamai.steamstatic.com/steam/apps/2260980/header.jpg?t=1677186294,,,universitylifevn@gmail.com,True,False,False,0,,0,0,0,,0,0,"This Visual novel contains: Sexual Content, Violence, Alcohol, Smoking, and Mature Language. If ...",0,0,0,0,"Travis Sullivan,Alex Draco,Elisha Bauder",Sarah Nyx,Single-player,"Action,Adventure,Casual,Indie,Simulation",,https://cdn.akamai.steamstatic.com/steam/apps/2260980/ss_eac112731f081ba1874d6b1950df7e78b917044...,http://cdn.akamai.steamstatic.com/steam/apps/256930714/movie_max.mp4?t=1676480737


In [63]:
reviews_df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,391540,Undertale,Never before have I played something that made me feel like my actions mattered so much... even ...,1,0
1,239140,Dying Light,"Great game, kept me entertained for more than a whole playthrough. Visuals are great and control...",1,0
2,238960,Path of Exile,"Honestly, the devs of this game have such integrity and dedication to their game its unbelievabl...",1,0
3,92000,Hydrophobia: Prophecy,"It's not that bad game. Sometimes it feels more like tech demo, but after all the gameplay is qu...",1,0
4,7520,Two Worlds II,I will give this game a thumbs up becaus it costs less than $2.50 and is at least as entertainin...,1,0


In [64]:
games_df['AppID'].nunique()

4870

In [65]:
reviews_df["app_id"].nunique()

477

In [66]:
reviews_df["review_score"].value_counts()

review_score
 1    284087
-1     36768
Name: count, dtype: int64

In [67]:
# Dataframes cleaning

In [68]:
games_df_columns = ['AppID', 'Name', 'Windows', 'Mac', 'Linux', 'Genres', 'Release date', 'Average playtime forever', 'Positive', 'Negative']
reviews_df_columns = ['app_id', 'review_text', 'review_score']

In [69]:
games_df_cleaned = games_df.dropna(subset=games_df_columns)[games_df_columns].copy()
reviews_df_cleaned = reviews_df.dropna(subset=reviews_df_columns)[reviews_df_columns].copy()

In [70]:
games_df_cleaned["Genres"] = games_df_cleaned["Genres"].str.lower()

In [71]:
reviews_df_cleaned['review_text'] = reviews_df_cleaned['review_text'].astype(str)

# Queries resolution

## Q1: Cantidad de juegos soportados en cada plataforma (Windows, Linux, MAC)

In [72]:
windows_supported_games = games_df_cleaned[games_df_cleaned["Windows"] == True]
linux_supported_games = games_df_cleaned[games_df_cleaned["Linux"] == True]
mac_supported_games = games_df_cleaned[games_df_cleaned["Mac"] == True]

In [73]:
print("Total de juegos: " + str(games_df_cleaned.shape[0]))
print("Total de juegos soportados en Windows: " + str(windows_supported_games.shape[0]))
print("Total de juegos soportados en Linux: " + str(linux_supported_games.shape[0]))
print("Total de juegos soportados en Mac: " + str(mac_supported_games.shape[0]))

Total de juegos: 4634
Total de juegos soportados en Windows: 4632
Total de juegos soportados en Linux: 603
Total de juegos soportados en Mac: 914


## Q2: Nombre del top 10 de juegos del género "Indie" publicados en la década del 2010 con más tiempo promedio histórico de juego

In [74]:
games_indie = games_df_cleaned[games_df_cleaned["Genres"].str.contains("indie")]

In [75]:
games_indie_2010_decade = games_indie[games_indie["Release date"].str.contains("201")]

In [76]:
games_indie.shape

(3246, 10)

In [77]:
games_indie_2010_decade.shape

(1178, 10)

In [78]:
q2_result = games_indie_2010_decade.sort_values(by='Average playtime forever', ascending=False).head(10)

In [79]:
q2_result[['Name', 'Average playtime forever']]

Unnamed: 0,Name,Average playtime forever
864,Aberoth,8334
2277,Path of Exile,8194
890,Workers & Resources: Soviet Republic,6536
72,Dawn of Man,4014
4369,Wargame: Red Dragon,3484
3822,Hero Zero,2851
3720,Fell Seal: Arbiter's Mark,2644
2595,Automation Empire,2458
3428,Ninja Stealth,2179
1101,City Car Driving,2070


## Q3: Nombre de top 5 juegos del género "Indie" con más reseñas positivas

In [80]:
games_indie_reduced = games_indie[["AppID", "Name"]]
games_indie_reduced.head()

Unnamed: 0,AppID,Name
2,1548270,Darkilson
3,2944670,Rapta
4,2260980,University Life Visual Novel
8,2258890,黑白之地 Xanadu Land
9,1110490,LAB Defence


In [81]:
reviews_reduced_q3 = reviews_df_cleaned[["app_id", "review_score"]]

In [82]:
games_indie_reviews = pd.merge(games_indie_reduced, reviews_reduced_q3, left_on='AppID', right_on='app_id', how='inner')

In [83]:
def positive_score(score):
    return 1 if score > 0 else 0

games_indie_reviews['positive_score'] = games_indie_reviews['review_score'].apply(positive_score)

In [84]:
q3_result = games_indie_reviews.groupby('Name')['positive_score'].sum().sort_values(ascending=False).head(5)

In [85]:
q3_result.head(10)

Name
Undertale                 42284
FTL: Faster Than Light    21302
Insurgency                20666
Path of Exile             17058
Super Meat Boy             7926
Name: positive_score, dtype: int64

## Q4: Nombre de juegos del género "action" con más de 5.000 reseñas negativas en idioma inglés

In [86]:
# Juegos de acción
games_action = games_df_cleaned[games_df_cleaned["Genres"].str.contains("action")]
games_action_reduced = games_action[["AppID", "Name"]]
games_action_reduced.shape

(1896, 2)

In [87]:
reviews_q4 = reviews_df_cleaned.copy()

In [102]:
# Reviews con mas de 5000 comentarios negativos

def negative_score(score):
    return 1 if score < 0 else 0

reviews_q4["negative_score"] = reviews_q4["review_score"].apply(negative_score)
reviews_q4_negatives = reviews_q4[reviews_q4["negative_score"] == 1].copy()
reviews_count = reviews_q4_negatives.groupby('app_id').size().reset_index(name='count')
# reviews_count_more_than_5000 = reviews_count[reviews_count["count"] > 5000]
reviews_count_more_than_5000 = reviews_count[reviews_count["count"] > 1200]
reviews_count_more_than_5000.shape

(4, 2)

In [103]:
# De las reviews con mas de 5000 comentarios negativos, nos quedamos con aquellas que sean sobre juegos de acción
games_action_with_5000_negative_reviews = pd.merge(games_action_reduced, reviews_count_more_than_5000, left_on='AppID', right_on="app_id", how='inner')
games_action_with_5000_negative_reviews = games_action_with_5000_negative_reviews[["AppID", "Name"]]
games_action_with_5000_negative_reviews.shape

(3, 2)

In [104]:
# Enriquecemos con el texto de la review
reviews_count_more_than_5000_with_text = pd.merge(reviews_q4, games_action_with_5000_negative_reviews, left_on='app_id', right_on="AppID", how='inner')
reviews_count_more_than_5000_with_text = reviews_count_more_than_5000_with_text[["app_id", "review_text"]]
reviews_count_more_than_5000_with_text.shape

(43816, 2)

In [105]:
# CPU INTENSIVE #############################
def detect_language(texto):
    language, _ = langid.classify(texto)
    return language
#############################################

# Calculo del idioma sobre las reviews
start_time = time.time()
reviews_count_more_than_5000_with_text["review_language"] = reviews_count_more_than_5000_with_text['review_text'].apply(detect_language)
elapsed_time = time.time() - start_time
print(f"Execution time on {reviews_count_more_than_5000_with_text.shape[0]} rows: {elapsed_time:.2f} seconds")

# Con el dataset original y más de 5000 reviews tarda 722.20 segundos
# Con el dataset reducido y más de 250 reviews tarda x segundos

Execution time on 43816 rows: 55.14 seconds


In [106]:
reviews_count_more_than_5000_with_text.shape

(43816, 3)

In [107]:
# Nos quedamos con aquellas reviews que estan en idioma inglés
reviews_count_more_than_5000_with_text_english = reviews_count_more_than_5000_with_text[reviews_count_more_than_5000_with_text["review_language"] == "en"]
reviews_count_more_than_5000_with_text_english.shape

(42629, 3)

In [109]:
# Nos quedamos con aquellos juegos que tengan mas de 5000 reseñas negativas en inglés
q4_results_app_ids = reviews_count_more_than_5000_with_text_english.groupby('app_id').size().reset_index(name='count')
# q4_results_app_ids = q4_results_app_ids[q4_results_app_ids["count"] > 5000]
q4_results_app_ids = q4_results_app_ids[q4_results_app_ids["count"] > 1200]
q4_results_app_ids.head(25)

Unnamed: 0,app_id,count
0,222880,21817
1,238960,17817
2,292730,2995


In [110]:
# Enriquecemos con el nombre de esos juegos
q4_results_games_names = pd.merge(q4_results_app_ids, games_action_with_5000_negative_reviews, left_on='app_id', right_on="AppID", how='inner')["Name"]
q4_results_games_names.head(25)

0                         Insurgency
1                      Path of Exile
2    Call of Duty®: Infinite Warfare
Name: Name, dtype: object

## Q5: Nombre de juegos del género "action" dentro del percentil 90 en cantidad de reseñas negativas

In [111]:
games_action_reduced.head()

Unnamed: 0,AppID,Name
4,2260980,University Life Visual Novel
7,1687430,Z Mutants
9,1110490,LAB Defence
10,1270670,Touhou Double Focus
12,1957780,Ghosts of Tabor


In [112]:
games_action_reduced.shape

(1896, 2)

In [113]:
reviews_q5 = reviews_df_cleaned.copy()
reviews_q5 = reviews_q5[["app_id", "review_score"]]
reviews_q5["negative_score"] = reviews_q5["review_score"].apply(negative_score)
reviews_q5_negative_score = reviews_q5[reviews_q5["negative_score"] == 1]
reviews_q5_negative_score.shape

(36738, 3)

In [114]:
reviews_q5_negative_score_action = pd.merge(reviews_q5_negative_score, games_action_reduced, left_on='app_id', right_on="AppID", how='inner')
reviews_q5_negative_score_action.shape

(21167, 5)

In [115]:
reviews_q5_negative_score_action_by_app_id = reviews_q5_negative_score_action.groupby('app_id').size().reset_index(name='count')
reviews_q5_negative_score_action_by_app_id.shape

(210, 2)

In [116]:
percentil_90 = reviews_q5_negative_score_action_by_app_id['count'].quantile(0.90)
percentil_90

np.float64(218.0)

In [117]:
q5_result = reviews_q5_negative_score_action_by_app_id[reviews_q5_negative_score_action_by_app_id['count'] >= percentil_90]
q5_result.shape

(22, 2)

In [118]:
q5_result_with_game_names = pd.merge(q5_result, games_action_reduced, left_on='app_id', right_on="AppID", how='inner')
q5_result_with_game_names[["app_id", "Name"]].head(10)

Unnamed: 0,app_id,Name
0,500,Left 4 Dead
1,620,Portal 2
2,8080,Kane and Lynch: Dead Men™
3,8500,EVE Online
4,17410,Mirror's Edge™
5,21100,F.E.A.R. 3
6,92000,Hydrophobia: Prophecy
7,111900,Guardians of Middle-earth
8,204300,Awesomenauts - the 2D moba
9,205100,Dishonored
