## imports

In [1]:
import os
import re
import string
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Folder holding the Steam CSV exports. Set the STEAM_DATA_DIR environment
# variable to point somewhere else; the default is the original location, so
# existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get("STEAM_DATA_DIR") or r"C:\Users\elaaf\Desktop\SDS\project_4_data")

steam = pd.read_csv(DATA_DIR / "steam.csv")
description = pd.read_csv(DATA_DIR / "steam_description_data.csv")

In [6]:
# Preview the first rows of the games table to see its columns and value formats.
steam.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [5]:
# List the columns available in the descriptions table.
description.columns

Index(['steam_appid', 'detailed_description', 'about_the_game',
       'short_description'],
      dtype='object')

In [8]:
# How many distinct values the `genres` column takes.
len(steam["genres"].unique())

1552

In [13]:
# Count games per `genres` value, most common first, as a two-column frame.
df_geners = (
    steam.groupby(["genres"])["appid"]
    .count()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)

In [15]:
# Most common `genres` values and how many games carry each.
df_geners.head()

Unnamed: 0,genres,appid
0,Action;Indie,1852
1,Casual;Indie,1482
2,Action;Adventure;Indie,1229
3,Adventure;Indie,1170
4,Action;Casual;Indie,1004


## 1. Remove less frequent tags

In [41]:
# True for `genres` values that hold a single tag (no ';' separator).
mask = [";" not in genre for genre in df_geners["genres"]]
df_geners[mask]

Unnamed: 0,genres,appid
5,Action,843
7,Indie,759
8,Casual,560
9,Adventure,535
11,Strategy,485
19,Simulation,328
21,RPG,270
49,Racing,86
59,Sports,63
89,Utilities,44


In [43]:
# All single-tag genres, ordered from most to least frequent.
df_geners[mask]["genres"].tolist()

['Action',
 'Indie',
 'Casual',
 'Adventure',
 'Strategy',
 'Simulation',
 'RPG',
 'Racing',
 'Sports',
 'Utilities',
 'Free to Play',
 'Early Access',
 'Education',
 'Design & Illustration',
 'Video Production',
 'Audio Production',
 'Animation & Modeling',
 'Violent',
 'Gore',
 'Web Publishing']

In [44]:
# Single-tag genres to discard; games whose `genres` value equals one of these
# are dropped from the working data.
tags_to_remove = [
    "Utilities",
    "Free to Play",
    "Early Access",
    "Education",
    "Design & Illustration",
    "Video Production",
    "Audio Production",
    "Animation & Modeling",
    "Violent",
    "Gore",
    "Web Publishing",
]

In [315]:
# Work on an independent copy so `steam` stays untouched (copy() is deep by default).
data = steam.copy()

In [316]:
# (rows, columns) of the working copy.
data.shape

(27075, 18)

In [317]:
# Drop, in place, every game whose whole `genres` value is one of the tags to remove.
data.drop(
    index=data.index[data["genres"].isin(tags_to_remove)],
    inplace=True,
)

In [318]:
# (rows, columns) of the working copy; compare with the shape shown earlier.
data.shape

(26961, 18)

## 2. Remove any tag that is not one of our main tags

In [326]:
# Count the games of each single-tag genre in the working data.
mask = [";" not in genre for genre in data["genres"]]
df = (
    data[mask]
    .groupby(["genres"])["appid"]
    .count()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
df.head(10)

Unnamed: 0,genres,appid
0,Action,843
1,Indie,759
2,Casual,560
3,Adventure,535
4,Strategy,485
5,Simulation,328
6,RPG,270
7,Racing,86
8,Sports,63


In [354]:
# The single-tag genres found above are the topics we keep.
tags_set = set(df["genres"].tolist())
tags_set

{'Action',
 'Adventure',
 'Casual',
 'Indie',
 'RPG',
 'Racing',
 'Simulation',
 'Sports',
 'Strategy'}

### Some games carry tags that are not in our tag list (irrelevant topics)
For example: Animation & Modeling and Video Production

In [77]:
# One list of tags per game, in the row order of `data`.
all_tags = [genre_string.split(";") for genre_string in data["genres"]]
all_tags[40:45]

[['RPG'],
 ['Animation & Modeling', 'Video Production'],
 ['Strategy'],
 ['RPG'],
 ['Action', 'RPG']]

In [68]:
def remove_unimportant_tags(df, tags):
    """Keep only the wanted genre tags for every game in ``df``.

    Each value of ``df.genres`` is split on ``";"`` and every tag that is not
    in ``tags`` is discarded. The order of the surviving tags is preserved.

    Parameters
    ----------
    df : pandas.DataFrame (or any object with a ``genres`` attribute)
        ``genres`` must iterate over ``";"``-separated tag strings, one per game.
    tags : collection of str
        The tags to keep.

    Returns
    -------
    list of list of str
        One list per row of ``df``, in row order. A game whose tags are all
        unwanted yields an empty list.
    """
    # Every row is processed: the result must line up one-to-one with the rows
    # of `df` so it can be assigned back to them.
    return [
        [tag for tag in genre_string.split(";") if tag in tags]
        for genre_string in df.genres
    ]


In [78]:
# True for games whose tag list is not fully covered by `tags_set`,
# i.e. games that carry tags outside the main tags.
mask = [len(tags_set & set(game_tags)) < len(game_tags) for game_tags in all_tags]

In [81]:
# Preview the games selected by `mask`.
data[mask].head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
19,440,Team Fortress 2,2007-10-10,1,Valve,Valve,windows;mac;linux,0,Multi-player;Cross-Platform Multiplayer;Steam ...,Action;Free to Play,Free to Play;Multiplayer;FPS,520,515879,34036,8495,623,20000000-50000000,0.0
22,570,Dota 2,2013-07-09,1,Valve,Valve,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,863507,142079,23944,801,100000000-200000000,0.0
25,730,Counter-Strike: Global Offensive,2012-08-21,1,Valve;Hidden Path Entertainment,Valve,windows;mac;linux,0,Multi-player;Steam Achievements;Full controlle...,Action;Free to Play,FPS;Multiplayer;Shooter,167,2644404,402313,22494,6502,50000000-100000000,0.0
41,1840,Source Filmmaker,2012-07-10,1,Valve,Valve,windows,0,Steam Workshop,Animation & Modeling;Video Production,Animation & Modeling;Video Production;Free to ...,0,15083,1111,239,77,1000000-2000000,0.0
132,4560,Company of Heroes - Legacy Edition,2007-07-17,1,Relic Entertainment,SEGA,windows,18,Single-player;Multi-player;Steam Trading Cards...,Violent;Action;Strategy,Strategy;RTS;World War II,0,4772,320,175,186,2000000-5000000,0.0


In [70]:
# Strip the unwanted tags from the games selected by `mask` and preview the result.
# NOTE(review): `mask` is reassigned by several cells. It must have one entry per
# row of `data` when this cell runs, otherwise the boolean indexing `data[mask]`
# fails with a length-mismatch error.
new_tags = remove_unimportant_tags(data[mask], tags_set)
new_tags[:10]

ValueError: Item wrong length 9 instead of 26961.

In [466]:
# Number of tag lists returned by remove_unimportant_tags.
len(new_tags)

NameError: name 'new_tags' is not defined

In [452]:
# Frequency of the first tag in each list of `new_tags` (top 50).
(
    pd.DataFrame(new_tags)
    .groupby([0])[0]
    .count()
    .sort_values(ascending=False)
    .to_frame()
    .head(50)
)

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
Action,2242
Violent,706
Casual,613
Adventure,576
Indie,414
Free to Play,363
Sexual Content,245
Nudity,112
Gore,80
Animation & Modeling,62


In [301]:
# Write the cleaned tags back in the column's existing format: one ';'-separated
# string per game. Other cells split `genres` on ';' and test for ';' in it, so
# storing raw Python lists here would break them (and ragged lists of lists are
# not a valid value for a single-column assignment).
# Games left with no tag at all end up with an empty string.
data.loc[mask, "genres"] = [";".join(tags) for tags in new_tags]

In [302]:
# Inspect the `genres` column. DataFrame has no `group` method, so the previous
# `data.group()` call could only raise AttributeError.
data["genres"]

0                        Action
1                        Action
2                        Action
3                        Action
4                        Action
                  ...          
27070    Adventure;Casual;Indie
27071    Action;Adventure;Indie
27072       Action;Casual;Indie
27073    Adventure;Casual;Indie
27074    Adventure;Casual;Indie
Name: genres, Length: 26961, dtype: object