# Feature Engineering

## Import the libraries

In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob
%load_ext autoreload
%autoreload 2

## Create the Dataframe

In [3]:
df_reviews = pd.read_csv(r'Datasets\user_reviews_clean.csv')

In [4]:
df_items = pd.read_csv(r'Datasets\user_items_clean.csv')

In [5]:
df_games = pd.read_csv(r'Datasets\steam_games_clean.csv')

## Sentiment Analysis

We make a function to analyze the sentiments in df_reviews, we will apply it in the reviews column to make a new column and delete the other, to simplify the work of the machine learning

In [6]:
def analyze_sentiment(text):
    if text is None:
        return 1    
    
    analysis = TextBlob(text)
    sentiment = analysis.sentiment.polarity
    
    if sentiment < -0.1:  
        return 0
    elif sentiment > 0.1:  
        return 2
    else:  
        return 1

In [7]:
df_reviews['review'] = df_reviews['review'].astype(str)

In [8]:
df_reviews['sentiment_analysis'] = df_reviews.review.apply(analyze_sentiment)

In [9]:
df_reviews = df_reviews.drop(columns=['review'])
df_reviews.columns 

Index(['user_id', 'user_url', 'funny', 'posted', 'last_edited', 'item_id',
       'helpful', 'recommend', 'sentiment_analysis'],
      dtype='object')

## Dataframe preparation to load them in the API

Now, we are going to preparate and make the Datasets to be in the API. We make join the dataframes that we need and ignore unnecesary columns 

In the first query we are asked to show the quantity of Items and Free to play games by year

In [151]:
df_games.head()

Unnamed: 0,publisher,release_date,price,early_access,id,developer,genre,app_title
0,Kotoshiro,2018-01-04,4.99,0.0,761140.0,Kotoshiro,Action,Lost Summoner Kitty
1,Kotoshiro,2018-01-04,4.99,0.0,761140.0,Kotoshiro,Casual,Lost Summoner Kitty
2,Kotoshiro,2018-01-04,4.99,0.0,761140.0,Kotoshiro,Indie,Lost Summoner Kitty
3,Kotoshiro,2018-01-04,4.99,0.0,761140.0,Kotoshiro,Simulation,Lost Summoner Kitty
4,Kotoshiro,2018-01-04,4.99,0.0,761140.0,Kotoshiro,Strategy,Lost Summoner Kitty


In [16]:
price = df_games[["price","id","release_date","developer"]]
price = price.drop_duplicates()
price.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32131 entries, 0 to 85619
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         32131 non-null  object 
 1   id            32131 non-null  float64
 2   release_date  32131 non-null  object 
 3   developer     32131 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


We see that we have a lot of columns with the name Free or Free To Play, we are going to reeplace them with 0

In [17]:
price ["price"] = price["price"].replace(["Free","Free To Play",'Free to Play',       #all this values appears when
                                          'Play for Free!',"No data","Install Now",
                                          'Free Mod','Free HITMAN™ Holiday Pack','Free Movie'],0) # we try to convert the price
                                                                                    #values to float it dont work                                                                               

Now we extract the year and make a new column with it, dropping the original date column

In [19]:
mask = (price['release_date'] != 'No data') #We drop the rows that have no year, since we are not gonna use them
price = price[mask]

price['year'] = price['release_date'].str.extract(r'(\d{4})')
price.drop(columns=['release_date'], inplace=True)
price

Unnamed: 0,price,id,developer,year
0,4.99,761140.0,Kotoshiro,2018
5,0,643980.0,Secret Level SRL,2018
9,0,670290.0,Poolians.com,2017
14,0.99,767400.0,彼岸领域,2017
21,3.99,772540.0,Trickjump Games Ltd,2018
...,...,...,...,...
85603,1.99,745400.0,Bidoniera Games,2018
85607,1.99,773640.0,"Nikita ""Ghost_RUS""",2018
85611,4.99,733530.0,Sacada,2018
85614,1.99,610660.0,Laush Dmitriy Sergeevich,2018


In [20]:
df_item_new_df = df_items[["item_id","items_count"]]
df_item_new_df = df_item_new_df.rename(columns={'item_id':'id'}) #rename the column to join them later
df_item_new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5094105 entries, 0 to 5094104
Data columns (total 2 columns):
 #   Column       Dtype
---  ------       -----
 0   id           int64
 1   items_count  int64
dtypes: int64(2)
memory usage: 77.7 MB


In [21]:
df_first_query = df_item_new_df.merge(price, on='id', how='left')   #first dataframe to the first query
df_first_query['price'].fillna('0', inplace=True)  #replace the missing values with 0, since we assume that are free to play
df_first_query["price"] = df_first_query['price'].replace("No data",0)

In [22]:
df_first_query = df_first_query.dropna(subset="year") #we drop the null year items since they are useless for this query

In [23]:
df_first_query.head()

Unnamed: 0,id,items_count,price,developer,year
0,10,277,9.99,Valve,2000
1,20,277,4.99,Valve,1999
2,30,277,4.99,Valve,2003
3,40,277,4.99,Valve,2001
4,50,277,4.99,Gearbox Software,1999


Now we have to do the Query, we have to group by the itemcount by content

In [26]:
grouped = df_first_query.groupby(['year',"developer"])  #first, we group

grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015D80228F50>

In [27]:
# Then, count the total elements by year
total_count_by_year = grouped['items_count'].sum().reset_index()

total_count_by_year

Unnamed: 0,year,developer,items_count
0,1983,Digital Leisure Inc.,47345
1,1984,Digital Leisure Inc.,13835
2,1987,"MicroProse Software, Inc",127167
3,1987,Sierra,126966
4,1987,Spectrum Holobyte,54136
...,...,...,...
7411,2017,oddonegames,26136
7412,2018,Bundle Of Sticks Studios,68634
7413,2018,MP Game Studios,64512
7414,2018,Team Monkey,70712


In [28]:
# Calculate the number of 0 (free) elements by year
zero_price_count_by_year = grouped.apply(lambda group: (group['price'] == 0).sum()).reset_index()
zero_price_count_by_year

Unnamed: 0,year,developer,0
0,1983,Digital Leisure Inc.,0
1,1984,Digital Leisure Inc.,0
2,1987,"MicroProse Software, Inc",0
3,1987,Sierra,0
4,1987,Spectrum Holobyte,0
...,...,...,...
7411,2017,oddonegames,0
7412,2018,Bundle Of Sticks Studios,0
7413,2018,MP Game Studios,0
7414,2018,Team Monkey,0


In [29]:
# We calculate the percentange
percentage_zero_price_by_year = (zero_price_count_by_year[0] / total_count_by_year['items_count']) * 100
percentage_zero_price_by_year

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
7411    0.0
7412    0.0
7413    0.0
7414    0.0
7415    0.0
Length: 7416, dtype: float64

In [31]:
#We create the definitive data frame
result_df = pd.DataFrame({
    'Year': total_count_by_year['year'],
    'Total Items': total_count_by_year['items_count'],
    'Zero Price Items': zero_price_count_by_year[0],
    'Percentage of Zero Price': percentage_zero_price_by_year,
    "Developer": total_count_by_year.developer
})
result_df

Unnamed: 0,Year,Total Items,Zero Price Items,Percentage of Zero Price,Developer
0,1983,47345,0,0.0,Digital Leisure Inc.
1,1984,13835,0,0.0,Digital Leisure Inc.
2,1987,127167,0,0.0,"MicroProse Software, Inc"
3,1987,126966,0,0.0,Sierra
4,1987,54136,0,0.0,Spectrum Holobyte
...,...,...,...,...,...
7411,2017,26136,0,0.0,oddonegames
7412,2018,68634,0,0.0,Bundle Of Sticks Studios
7413,2018,64512,0,0.0,MP Game Studios
7414,2018,70712,0,0.0,Team Monkey


We export it to a csv to use it later in the API

In [32]:
result_df.to_csv('Api_DataFrame/developer.csv', index=False, encoding='utf-8')
print('developer.csv was saved')

developer.csv was saved


Now we got to show the amount of money spent by the user, the amount of items and the reconmendation percentage

We make the auxiliar dataframes to unite them later

In [165]:
price = df_games[["price","id"]]
price = price.drop_duplicates()
price = price.rename(columns={'id':'item_id'}) 

In [166]:
price ["price"] = price["price"].replace(["Free","Free To Play",'Free to Play',       #all this values appears when
                                          'Play for Free!',"No data","Install Now",
                                          'Free Mod','Free HITMAN™ Holiday Pack','Free Movie'],0) # we try to convert the price
                                                                                    #values to float it dont work

In [167]:
items_recomendation = df_reviews[["user_id","item_id","recommend"]]
items_recomendation = items_recomendation.drop_duplicates()

In [168]:
item_count_users = df_items[["item_id","items_count"]]
item_count_users

Unnamed: 0,item_id,items_count
0,10,277
1,20,277
2,30,277
3,40,277
4,50,277
...,...,...
5094100,346330,7
5094101,373330,7
5094102,388490,7
5094103,521570,7


In [169]:
df_auxiliar_query = items_recomendation.merge(item_count_users, on='item_id', how='right')
df_auxiliar_query = df_auxiliar_query.drop_duplicates()
df_auxiliar_query

Unnamed: 0,user_id,item_id,recommend,items_count
0,peetsasucks,10,False,277
1,mixadance,10,True,277
2,76561198134580826,10,True,277
3,Tokiwadai,10,True,277
4,76561198039441595,20,False,277
...,...,...,...,...
86611352,Xx-Woods,346330,False,7
86611353,Zejus,346330,False,7
86611355,ssbatt,388490,False,7
86611356,TfhuAWGscvg,521570,True,7


In [170]:
df_second_query = df_auxiliar_query.merge(price, on='item_id', how='right')
df_second_query

Unnamed: 0,user_id,item_id,recommend,items_count,price
0,,761140.0,,,4.99
1,,643980.0,,,0
2,,670290.0,,,0
3,,767400.0,,,0.99
4,,773570.0,,,2.99
...,...,...,...,...,...
4852573,,773640.0,,,1.99
4852574,,733530.0,,,4.99
4852575,,610660.0,,,1.99
4852576,,658870.0,,,4.99


We drop the columns with NaN in user_id and sentiment_analysis since they are useless for this this querys

In [171]:
mask = (df_second_query["user_id"].notna()) & (df_second_query["recommend"].notna())
df_second_query  = df_second_query[mask]
df_second_query.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4055977 entries, 187 to 4851819
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      object 
 1   item_id      float64
 2   recommend    object 
 3   items_count  float64
 4   price        object 
dtypes: float64(2), object(3)
memory usage: 185.7+ MB


In [172]:
df_second_query.head(1)

Unnamed: 0,user_id,item_id,recommend,items_count,price
187,EizanAratoFujimaki,70.0,True,277.0,9.99


Now we make the groupped dataframe

In [173]:
df_second_query['price'] = df_second_query['price'].astype(float)     #first of all, we have to convert price to float type

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_second_query['price'] = df_second_query['price'].astype(float)     #first of all, we have to convert price to float type


Now we do the query that is asked, namely, to show the user id, the money they spent, items count and the percentage of reviews

In [277]:
#two auxiliar columns to group the two parts of the query
money_spent = df_second_query.groupby(['user_id'])['price'].sum()
money_spent

user_id
-Beave-        18190.90
-PRoSlayeR-     3418.15
-SEVEN-        10102.72
-Ultrix         8031.96
-Zovix-        13326.11
                 ...   
zoozles        28098.72
zucchin1       27595.40
zukuta          1478.52
zuzuga2003      4291.40
zv_odd         13808.23
Name: price, Length: 4816, dtype: float64

In [281]:
items = df_second_query.groupby('user_id')['items_count'].sum()
items

user_id
-Beave-        575695.0
-PRoSlayeR-    415276.0
-SEVEN-        261937.0
-Ultrix        483346.0
-Zovix-        552941.0
                 ...   
zoozles        942857.0
zucchin1       160057.0
zukuta         108441.0
zuzuga2003     526865.0
zv_odd         687959.0
Name: items_count, Length: 4816, dtype: float64

In [282]:
recommended_games = df_second_query.groupby('user_id')['recommend'].mean() * 100
recommended_games

user_id
-Beave-            100.0
-PRoSlayeR-        100.0
-SEVEN-        99.431818
-Ultrix            100.0
-Zovix-            100.0
                 ...    
zoozles            100.0
zucchin1           100.0
zukuta             100.0
zuzuga2003         100.0
zv_odd             100.0
Name: recommend, Length: 4816, dtype: object

In [283]:
total_count_by_user = pd.DataFrame({'user_id':money_spent.index, 'money_spent': money_spent, 
                                    'percentage_reviews': recommended_games, 'items_count': items})

total_count_by_user = total_count_by_user.reset_index(drop= True)
total_count_by_user

Unnamed: 0,user_id,money_spent,percentage_reviews,items_count
0,-Beave-,18190.90,100.0,575695.0
1,-PRoSlayeR-,3418.15,100.0,415276.0
2,-SEVEN-,10102.72,99.431818,261937.0
3,-Ultrix,8031.96,100.0,483346.0
4,-Zovix-,13326.11,100.0,552941.0
...,...,...,...,...
4811,zoozles,28098.72,100.0,942857.0
4812,zucchin1,27595.40,100.0,160057.0
4813,zukuta,1478.52,100.0,108441.0
4814,zuzuga2003,4291.40,100.0,526865.0


Save it in a csv

In [284]:
total_count_by_user.to_csv('Api_DataFrame/userdata.csv', index=False, encoding='utf-8')
print('userdata.csv was saved')

userdata.csv was saved


In the third query, we are asked to give the user who played more hours by genre, and a list of hour played by year

First of all we prepare the auxilary dataframe, like we did in the previous parts

In [260]:
hours_played = df_items[["playtime_forever","user_id","item_id"]]
hours_played

Unnamed: 0,playtime_forever,user_id,item_id
0,6.0,76561197970982479,10
1,0.0,76561197970982479,20
2,7.0,76561197970982479,30
3,0.0,76561197970982479,40
4,0.0,76561197970982479,50
...,...,...,...
5094100,0.0,76561198329548331,346330
5094101,0.0,76561198329548331,373330
5094102,3.0,76561198329548331,388490
5094103,4.0,76561198329548331,521570


In [261]:
years = df_games[["release_date","id","genre"]]
years = years.rename(columns={'id':'item_id'})
years['year'] = years['release_date'].str.extract(r'(\d{4})')
years.drop(columns=['release_date'], inplace=True) 

In [262]:
years.info()      #there are very little nulls, stadistically is not a big number, so we are deleting them

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85625 entries, 0 to 85624
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   item_id  85625 non-null  float64
 1   genre    85625 non-null  object 
 2   year     74642 non-null  object 
dtypes: float64(1), object(2)
memory usage: 2.0+ MB


In [263]:
years = years.dropna(subset="year")

In [264]:
df_third_query = hours_played.merge(years, on='item_id', how='left')

In [265]:
df_third_query = df_third_query.drop_duplicates()

In [266]:
mask = df_third_query["genre"].isna()
df_third_query[mask]

Unnamed: 0,playtime_forever,user_id,item_id,genre,year
13,0.0,76561197970982479,34440,,
29,692.0,76561197970982479,9340,,
30,0.0,76561197970982479,228200,,
41,0.0,76561197970982479,17340,,
62,53.0,76561197970982479,23120,,
...,...,...,...,...,...
10962592,0.0,76561198326700687,508380,,
10962595,0.0,76561198326700687,510660,,
10962613,0.0,76561198326700687,519170,,
10962633,0.0,76561198329548331,373330,,


In [267]:
df_third_query.dropna(inplace=True) 

In [268]:
df_third_query

Unnamed: 0,playtime_forever,user_id,item_id,genre,year
0,6.0,76561197970982479,10,Action,2000
1,0.0,76561197970982479,20,Action,1999
2,7.0,76561197970982479,30,Action,2003
3,0.0,76561197970982479,40,Action,2001
4,0.0,76561197970982479,50,Action,1999
...,...,...,...,...,...
10962634,3.0,76561198329548331,388490,Adventure,2015
10962635,3.0,76561198329548331,388490,Free to Play,2015
10962636,4.0,76561198329548331,521570,Casual,2016
10962637,4.0,76561198329548331,521570,Free to Play,2016


We are going to use two dataframes in this querys, so that we dont waste memory (They are less heavy separeted)

In [285]:
genre = df_third_query.groupby(['genre',"user_id"])['playtime_forever'].sum().reset_index()
genre.rename(columns={'playtime_forever': 'playtime_genre'}, inplace=True)
genre

Unnamed: 0,genre,user_id,playtime_genre
0,1980s,-Mad-,0.0
1,1980s,00690069006900,0.0
2,1980s,007james_bond,30.0
3,1980s,08alross,0.0
4,1980s,091263,0.0
...,...,...,...
791030,e-sports,zuilde,7.0
791031,e-sports,zwanzigdrei,512.0
791032,e-sports,zyxwvutsrqponm,0.0
791033,e-sports,zzeee,1.0


In [286]:
genre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 791035 entries, 0 to 791034
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   genre           791035 non-null  object 
 1   user_id         791035 non-null  object 
 2   playtime_genre  791035 non-null  float64
dtypes: float64(1), object(2)
memory usage: 18.1+ MB


In [287]:
playtime_year_group = df_third_query.groupby(['user_id', 'year'])['playtime_forever'].sum().reset_index()
playtime_year_group.rename(columns={'playtime_forever': 'playtime_year'}, inplace=True)
playtime_year_group.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772338 entries, 0 to 772337
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        772338 non-null  object 
 1   year           772338 non-null  object 
 2   playtime_year  772338 non-null  float64
dtypes: float64(1), object(2)
memory usage: 17.7+ MB


Export the csv

In [288]:
playtime_year_group.to_csv("Api_DataFrame/UserForGenre_year.csv",index=False, encoding='utf-8')
print("UserForGenre_year.csv was saved")

UserForGenre_year.csv was saved


In [289]:
genre.to_csv("Api_DataFrame/UserForGenre_genre.csv",index=False, encoding='utf-8')
print("UserForGenre_genre.csv was saved")

UserForGenre_genre.csv was saved


In the fourth query, we are asked to show the top 3 most recommended developers by year

In [187]:
developer = df_games[["developer","id","release_date"]]
developer =developer.rename(columns={'id':'item_id'})
developer['year'] = developer['release_date'].str.extract(r'(\d{4})')
developer.drop(columns=['release_date'], inplace=True)

We are deleting all this rows where developer is without data, since they are going to be useless for was is asked in the query

In [188]:
mask = developer.developer == "No data"
developer[mask]

Unnamed: 0,developer,item_id,year
17,No data,773570.0,
18,No data,773570.0,
19,No data,773570.0,
20,No data,773570.0,
44,No data,724910.0,
...,...,...,...
85620,No data,681550.0,
85621,No data,681550.0,
85622,No data,681550.0,
85623,No data,681550.0,


In [189]:
mask = developer.developer != "No data"
developer = developer[mask]

In [190]:
review = df_reviews[["item_id","recommend","sentiment_analysis"]]

In [191]:
review.head()

Unnamed: 0,item_id,recommend,sentiment_analysis
0,211420,True,1
1,211820,True,1
2,730,True,2
3,550,True,1
4,730,True,1


In [192]:
df_fourth_query = developer.merge(review, on = "item_id", how = "left")

In [193]:
df_fourth_query

Unnamed: 0,developer,item_id,year,recommend,sentiment_analysis
0,Kotoshiro,761140.0,2018,,
1,Kotoshiro,761140.0,2018,,
2,Kotoshiro,761140.0,2018,,
3,Kotoshiro,761140.0,2018,,
4,Kotoshiro,761140.0,2018,,
...,...,...,...,...,...
84815,Laush Dmitriy Sergeevich,610660.0,2018,,
84816,Laush Dmitriy Sergeevich,610660.0,2018,,
84817,Laush Dmitriy Sergeevich,610660.0,2018,,
84818,"xropi,stev3ns",658870.0,2017,,


Same here, useless values with no data. Same process, deleting them

In [194]:
mask = df_fourth_query["recommend"].isnull() & df_fourth_query["sentiment_analysis"].isnull()
df_fourth_query[mask]

Unnamed: 0,developer,item_id,year,recommend,sentiment_analysis
0,Kotoshiro,761140.0,2018,,
1,Kotoshiro,761140.0,2018,,
2,Kotoshiro,761140.0,2018,,
3,Kotoshiro,761140.0,2018,,
4,Kotoshiro,761140.0,2018,,
...,...,...,...,...,...
84815,Laush Dmitriy Sergeevich,610660.0,2018,,
84816,Laush Dmitriy Sergeevich,610660.0,2018,,
84817,Laush Dmitriy Sergeevich,610660.0,2018,,
84818,"xropi,stev3ns",658870.0,2017,,


In [195]:
mask = df_fourth_query["recommend"].notnull() & df_fourth_query["sentiment_analysis"].notnull()
df_fourth_query = df_fourth_query[mask]

In [196]:
df_fourth_query.head(1)

Unnamed: 0,developer,item_id,year,recommend,sentiment_analysis
82,Valve,70.0,1998,True,1.0


We are doing the query now

In [197]:
recomendation = df_fourth_query.groupby(['developer','year'])['recommend'].sum().reset_index()
recomendation

Unnamed: 0,developer,year,recommend
0,07th Expansion,2015,True
1,07th Expansion,2016,True
2,11 bit studios,2014,12
3,1C: Maddox Games,2006,2
4,1C:InoCo,2009,0
...,...,...,...
1359,the whale husband,2015,12
1360,tobyfox,2015,80
1361,△○□× (Miwashiba),2016,4
1362,"インレ,Inre",2016,2


In [198]:
recommended_games = df_fourth_query.groupby(['developer'
                                             ,'year'])[
                                               'sentiment_analysis'
                                               ].sum().reset_index() 
                                                                                                                               
#i do a sum() because, since 0 means bad, 
#1 neutral, 2 good, the most recommended is, by logic, 
# the one with highest score      
recommended_games

Unnamed: 0,developer,year,sentiment_analysis
0,07th Expansion,2015,0.0
1,07th Expansion,2016,1.0
2,11 bit studios,2014,9.0
3,1C: Maddox Games,2006,2.0
4,1C:InoCo,2009,2.0
...,...,...,...
1359,the whale husband,2015,28.0
1360,tobyfox,2015,102.0
1361,△○□× (Miwashiba),2016,8.0
1362,"インレ,Inre",2016,2.0


In [199]:
final_fourth_query = pd.DataFrame({'developer':recomendation.developer, 'year': recomendation.year, 
                                    'recommend': recomendation.recommend, 
                                    "sentiment_analysis": recommended_games.sentiment_analysis })
final_fourth_query

Unnamed: 0,developer,year,recommend,sentiment_analysis
0,07th Expansion,2015,True,0.0
1,07th Expansion,2016,True,1.0
2,11 bit studios,2014,12,9.0
3,1C: Maddox Games,2006,2,2.0
4,1C:InoCo,2009,0,2.0
...,...,...,...,...
1359,the whale husband,2015,12,28.0
1360,tobyfox,2015,80,102.0
1361,△○□× (Miwashiba),2016,4,8.0
1362,"インレ,Inre",2016,2,2.0


In [200]:
final_fourth_query.to_csv("Api_DataFrame/best_developer_year.csv",index=False, encoding='utf-8')
print("best_developer_year.csv was saved")

best_developer_year.csv was saved


Finally, the last query

We have to give the number of positive and negaive reviews by developer

In [201]:
developer = df_games[["developer","id"]] #we make an aux columns similar to the one in the former query
developer =developer.rename(columns={'id':'item_id'})
mask = developer.developer != "No data"
developer = developer[mask]
developer

Unnamed: 0,developer,item_id
0,Kotoshiro,761140.0
1,Kotoshiro,761140.0
2,Kotoshiro,761140.0
3,Kotoshiro,761140.0
4,Kotoshiro,761140.0
...,...,...
85614,Laush Dmitriy Sergeevich,610660.0
85615,Laush Dmitriy Sergeevich,610660.0
85616,Laush Dmitriy Sergeevich,610660.0
85617,"xropi,stev3ns",658870.0


In [202]:
review = df_reviews[["item_id","sentiment_analysis"]]
review

Unnamed: 0,item_id,sentiment_analysis
0,211420,1
1,211820,1
2,730,2
3,550,1
4,730,1
...,...,...
8005,265630,0
8006,306130,0
8007,427730,1
8008,570,1


In [203]:
df_fifth_query = developer.merge(review, on = "item_id", how = "left")
df_fifth_query.dropna(subset="sentiment_analysis", inplace= True) #We are asked give a  
                                                                  #number of good or bad analysis, 
                                                                  # so NaN in that category values are useless
df_fifth_query.info()                                                                 

<class 'pandas.core.frame.DataFrame'>
Index: 17003 entries, 82 to 84785
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   developer           17003 non-null  object 
 1   item_id             17003 non-null  float64
 2   sentiment_analysis  17003 non-null  float64
dtypes: float64(2), object(1)
memory usage: 531.3+ KB


We make now two auxiliares dataframes to count the positive and negative values

In [204]:
mask = df_fifth_query["sentiment_analysis"] == 2
positive = df_fifth_query[mask]
positive

Unnamed: 0,developer,item_id,sentiment_analysis
83,Valve,70.0,2.0
163,Introversion Software,1520.0,2.0
164,Introversion Software,1520.0,2.0
168,Facepunch Studios,4000.0,2.0
169,Facepunch Studios,4000.0,2.0
...,...,...,...
84765,Valve,10.0,2.0
84766,Valve,10.0,2.0
84767,Valve,10.0,2.0
84770,Valve,40.0,2.0


In [205]:
mask = df_fifth_query["sentiment_analysis"] == 0
negative = df_fifth_query[mask]
negative

Unnamed: 0,developer,item_id,sentiment_analysis
119,Outerlight Ltd.,2400.0,0.0
123,Outerlight Ltd.,2400.0,0.0
127,Outerlight Ltd.,2400.0,0.0
172,Facepunch Studios,4000.0,0.0
174,Facepunch Studios,4000.0,0.0
...,...,...,...
84585,Firaxis Games,3920.0,0.0
84602,Strategy First,1670.0,0.0
84758,Valve,20.0,0.0
84762,Valve,60.0,0.0


Now we use the created dataframes to group them in two different df, one for positive and other for negative reviews

In [206]:
grouped_p = positive.groupby(['developer'])['sentiment_analysis'].count().reset_index()
grouped_p = grouped_p.rename(columns={'sentiment_analysis':'negative reviews'})
grouped_p

Unnamed: 0,developer,negative reviews
0,11 bit studios,3
1,1C:InoCo,1
2,22cans,10
3,2D BOY,1
4,"2K Australia,Gearbox Software,Aspyr (Mac &amp;...",2
...,...,...
637,sparsevector,3
638,stage-nana,2
639,the whale husband,12
640,tobyfox,30


In [207]:
grouped_n = negative.groupby(['developer'])['sentiment_analysis'].count().reset_index()
grouped_n = grouped_n.rename(columns={'sentiment_analysis':'positive reviews'})
grouped_n

Unnamed: 0,developer,positive reviews
0,07th Expansion,1
1,11 bit studios,6
2,1C: Maddox Games,1
3,3909,2
4,5Wolf,3
...,...,...
427,practicing01,2
428,sparsevector,3
429,stage-nana,1
430,the whale husband,4


Now we merge them

In [208]:
final_fifth_query = grouped_p.merge(grouped_n, on = "developer", how = "left")
final_fifth_query

Unnamed: 0,developer,negative reviews,positive reviews
0,11 bit studios,3,6.0
1,1C:InoCo,1,
2,22cans,10,
3,2D BOY,1,
4,"2K Australia,Gearbox Software,Aspyr (Mac &amp;...",2,
...,...,...,...
637,sparsevector,3,3.0
638,stage-nana,2,1.0
639,the whale husband,12,4.0
640,tobyfox,30,14.0


We see missing values, since a missing values means that no one made a positive or negative review, we replace it with a 0, meaning that the there are no positive or negative reviews (depending on the case)

In [209]:
final_fifth_query.fillna(0,inplace=True)
final_fifth_query

Unnamed: 0,developer,negative reviews,positive reviews
0,11 bit studios,3,6.0
1,1C:InoCo,1,0.0
2,22cans,10,0.0
3,2D BOY,1,0.0
4,"2K Australia,Gearbox Software,Aspyr (Mac &amp;...",2,0.0
...,...,...,...
637,sparsevector,3,3.0
638,stage-nana,2,1.0
639,the whale husband,12,4.0
640,tobyfox,30,14.0


Lastly, we save it on a csv

In [210]:
final_fifth_query.to_csv("Api_DataFrame/developer_reviews_analysis.csv",index=False, encoding='utf-8')
print("developer_reviews_analysis.csv was saved")

developer_reviews_analysis.csv was saved
