## imports

In [1]:
import nltk
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [270]:
import matplotlib.pyplot as plt

In [2]:
# Folder holding the input CSV files. Kept in one constant so the notebook can
# be pointed at another machine's copy by editing a single line.
RAW_DATA_DIR = r"C:\Users\elaaf\Desktop\SDS\project_4_data"

steam       = pd.read_csv(rf"{RAW_DATA_DIR}\steam.csv")
description = pd.read_csv(rf"{RAW_DATA_DIR}\steam_description_data.csv")

In [6]:
steam.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [5]:
description.columns

Index(['steam_appid', 'detailed_description', 'about_the_game',
       'short_description'],
      dtype='object')

In [8]:
len(list(steam.genres.unique()))

1552

In [13]:
# Number of games per exact genre string, most common combination first.
df_geners = (
    steam.groupby(["genres"])["appid"]
    .count()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)

In [15]:
df_geners.head()

Unnamed: 0,genres,appid
0,Action;Indie,1852
1,Casual;Indie,1482
2,Action;Adventure;Indie,1229
3,Adventure;Indie,1170
4,Action;Casual;Indie,1004


## 1. Remove less frequent tags

In [41]:
# True for rows whose genre string holds a single tag (no ';' separator).
mask = [";" not in genre for genre in df_geners["genres"]]
df_geners[mask]

Unnamed: 0,genres,appid
5,Action,843
7,Indie,759
8,Casual,560
9,Adventure,535
11,Strategy,485
19,Simulation,328
21,RPG,270
49,Racing,86
59,Sports,63
89,Utilities,44


In [43]:
list(df_geners[mask].genres)

['Action',
 'Indie',
 'Casual',
 'Adventure',
 'Strategy',
 'Simulation',
 'RPG',
 'Racing',
 'Sports',
 'Utilities',
 'Free to Play',
 'Early Access',
 'Education',
 'Design & Illustration',
 'Video Production',
 'Audio Production',
 'Animation & Modeling',
 'Violent',
 'Gore',
 'Web Publishing']

In [44]:
# Less frequent single-tag genres; games whose whole genre string equals one
# of these are dropped from the data.
tags_to_remove = [
    "Utilities",
    "Free to Play",
    "Early Access",
    "Education",
    "Design & Illustration",
    "Video Production",
    "Audio Production",
    "Animation & Modeling",
    "Violent",
    "Gore",
    "Web Publishing",
]

In [127]:
data = steam.copy(deep=True)

In [128]:
data.shape

(27075, 18)

In [129]:
# Drop every game whose genre string is exactly one of the unwanted tags.
# One vectorized isin() lookup and a single drop replace the previous
# per-tag loop, which rescanned the whole frame and dropped in place each time.
data = data.drop(index=data.index[data["genres"].isin(tags_to_remove)])

In [130]:
data.shape

(26961, 18)

## 2. Remove any tag that isn't one of our main tags

In [131]:
# Count the games that carry exactly one genre tag, most common tag first.
mask = [";" not in genre for genre in data["genres"]]
df = (
    data[mask]
    .groupby(["genres"])["appid"]
    .count()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
df.head(10)

Unnamed: 0,genres,appid
0,Action,843
1,Indie,759
2,Casual,560
3,Adventure,535
4,Strategy,485
5,Simulation,328
6,RPG,270
7,Racing,86
8,Sports,63


In [132]:
tags_set = set(df["genres"])
tags_set #topics

{'Action',
 'Adventure',
 'Casual',
 'Indie',
 'RPG',
 'Racing',
 'Simulation',
 'Sports',
 'Strategy'}

### There are some tags that aren't in our tag list (irrelevant topics),
such as the following tags: Animation & Modeling and Video Production

In [133]:
all_tags = [i.split(";") for i in list(data.genres)]
all_tags[40:45]

[['RPG'],
 ['Animation & Modeling', 'Video Production'],
 ['Strategy'],
 ['RPG'],
 ['Action', 'RPG']]

In [134]:
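#input:   dataframe with a `genres` column of ';'-separated tags, plus the collection of tags to keep
#output:  list with one list of kept tags per row (a row with no kept tag gives [])
#purpose: filter each game's genre tags down to the tags we want to keep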

def remove_unimportant_tags(df, tags):
    """Filter every row's genre tags down to the ones contained in ``tags``.

    Parameters
    ----------
    df : object with a ``genres`` attribute
        ``df.genres`` must iterate over ';'-separated tag strings.
    tags : container supporting ``in``
        The tags to keep.

    Returns
    -------
    list of list of str
        One list per row, in row order, holding the kept tags in their
        original order. A row with no kept tag yields an empty list.
    """
    return [
        [tag for tag in genre_string.split(";") if tag in tags]
        for genre_string in list(df.genres)
    ]


In [135]:
# True for every game whose tag list is not fully covered by the main tags
mask = [
    len(tags_set.intersection(game_tags)) != len(game_tags)
    for game_tags in all_tags
]

In [136]:
data[mask].head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
19,440,Team Fortress 2,2007-10-10,1,Valve,Valve,windows;mac;linux,0,Multi-player;Cross-Platform Multiplayer;Steam ...,Action;Free to Play,Free to Play;Multiplayer;FPS,520,515879,34036,8495,623,20000000-50000000,0.0
22,570,Dota 2,2013-07-09,1,Valve,Valve,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,863507,142079,23944,801,100000000-200000000,0.0
25,730,Counter-Strike: Global Offensive,2012-08-21,1,Valve;Hidden Path Entertainment,Valve,windows;mac;linux,0,Multi-player;Steam Achievements;Full controlle...,Action;Free to Play,FPS;Multiplayer;Shooter,167,2644404,402313,22494,6502,50000000-100000000,0.0
41,1840,Source Filmmaker,2012-07-10,1,Valve,Valve,windows,0,Steam Workshop,Animation & Modeling;Video Production,Animation & Modeling;Video Production;Free to ...,0,15083,1111,239,77,1000000-2000000,0.0
132,4560,Company of Heroes - Legacy Edition,2007-07-17,1,Relic Entertainment,SEGA,windows,18,Single-player;Multi-player;Steam Trading Cards...,Violent;Action;Strategy,Strategy;RTS;World War II,0,4772,320,175,186,2000000-5000000,0.0


In [151]:
new_tags = remove_unimportant_tags(data[mask], tags_set)
new_tags[:10]

[['Action'],
 ['Action', 'Strategy'],
 ['Action'],
 [],
 ['Action', 'Strategy'],
 ['Action'],
 ['Action', 'Adventure', 'RPG'],
 ['RPG'],
 ['Action', 'RPG', 'Strategy'],
 ['RPG']]

In [152]:
# Turn each list of kept tags back into a ';'-separated string, in place.
# str.join covers every case on its own: no tags -> "", one tag -> the tag
# itself, several tags -> "a;b;c".
new_tags[:] = [";".join(tag) for tag in new_tags]

In [154]:
new_tags[:10]

['Action',
 'Action;Strategy',
 'Action',
 '',
 'Action;Strategy',
 'Action',
 'Action;Adventure;RPG',
 'RPG',
 'Action;RPG;Strategy',
 'RPG']

In [159]:
data.loc[mask,"genres"] = new_tags

### Drop games with empty tags

In [201]:
to_delete = data.loc[(data['genres'] == ""), 'genres'].index

In [202]:
to_delete

Int64Index([   41,  1438,  1498,  1697,  1904,  1988,  2041,  2226,  2268,
             2350,
            ...
            25622, 25640, 25733, 25735, 25841, 26190, 26348, 26464, 26481,
            26793],
           dtype='int64', length=139)

In [205]:
data.drop(index=to_delete, inplace=True)

In [214]:
data.to_csv("../Data/cleaned_steam.csv")

### Merge tags with description

In [215]:
steam = pd.read_csv("../Data/cleaned_steam.csv", usecols=['appid','genres'])

In [225]:
steam.head()

Unnamed: 0,appid,genres
0,10,Action
1,20,Action
2,30,Action
3,40,Action
4,50,Action


In [216]:
# Attach each game's cleaned genres to its description and keep only the
# detailed description text.
desc_gen = (
    description
    .merge(steam, left_on="steam_appid", right_on="appid")
    .drop(columns=["appid", "about_the_game", "short_description"])
)

In [217]:
desc_gen.to_csv("../Data/cleaned_desc.csv")

In [218]:
desc_gen

Unnamed: 0,steam_appid,detailed_description,genres
0,10,Play the world's number 1 online action game. ...,Action
1,20,One of the most popular online action games of...,Action
2,30,Enlist in an intense brand of Axis vs. Allied ...,Action
3,40,Enjoy fast-paced multiplayer gaming with Death...,Action
4,50,Return to the Black Mesa Research Facility as ...,Action
...,...,...,...
26817,1065230,"<img src=""https://steamcdn-a.akamaihd.net/stea...",Adventure;Casual;Indie
26818,1065570,Have you ever been so lonely that no one but y...,Action;Adventure;Indie
26819,1065650,<strong>Super Star Blast </strong>is a space b...,Action;Casual;Indie
26820,1066700,Pursue a snow-white deer through an enchanted ...,Adventure;Casual;Indie


## Clean description 
1. remove html tags
2. remove digits
3. remove punctuation
4. lower case 

In [219]:
# Create a custom function to remove the html tags from the descriptions
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span (shortest match) deleted.

    The pattern does not cross line breaks, so a tag split over several
    lines is left untouched.
    """
    return re.sub(r'<.*?>', '', text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    return text.translate(str.maketrans('', '', string.punctuation))

def lower_case(text):
    """Return the result of calling ``text.lower()``."""
    lowered = text.lower()
    return lowered

def remove_num(text):
    """Delete every run of word characters that contains at least one digit.

    Pure numbers ("123") and mixed tokens ("a1b") are both removed; the
    surrounding whitespace is left in place.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without the digit-bearing tokens.
    """
    # Raw string: in a plain literal '\w' and '\d' are invalid escape
    # sequences (DeprecationWarning, SyntaxWarning on Python 3.12+).
    return re.sub(r'\w*\d\w*', '', text)

In [220]:
stemmer = nltk.stem.PorterStemmer()
nltk.download('stopwords')
# Use the fully qualified corpus reader rather than the imported `stopwords`
# name: this cell rebinds `stopwords` to a plain list, so running
# `stopwords.words(...)` a second time would raise AttributeError on that list.
stopwords = nltk.corpus.stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elaaf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [221]:

# Custom tokenizer to remove html tags, punctuation, set to lowercase, and remove stopwords
def my_tokenizer(sentence):
    """Turn one raw description into a list of stemmed, lower-case tokens.

    Steps: strip HTML tags, delete punctuation, lower-case, split on
    whitespace, drop stop words, stem what is left.

    Relies on the notebook-level names ``remove_html_tags``, ``stopwords``
    (the collection of stop words) and ``stemmer`` (object with a ``stem``
    method).

    Parameters
    ----------
    sentence : str
        Raw description text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Stemmed tokens in their original order, without stop words.
    """
    # Remove HTML tags with custom function
    sentence = remove_html_tags(sentence)

    # Delete all punctuation in a single pass and lower-case once; the
    # previous loop re-lowered the whole string for every punctuation mark.
    sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()

    # split() with no argument breaks on any run of whitespace, so words
    # separated by newlines or tabs are no longer glued into one token, and
    # no empty tokens are produced.
    return [
        stemmer.stem(word)
        for word in sentence.split()
        if word not in stopwords
    ]

In [222]:
# Initial thresholds
minimum_descr_count = 5    # ignore words that occur in fewer than this many descriptions
maximum_descr_perc = 0.90  # ignore words that occur in more than 90% of the descriptions

tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    stop_words=stopwords,
    min_df=minimum_descr_count,
    max_df=maximum_descr_perc,
)

In [224]:
tfidf_matrix = tfidf.fit_transform(desc_gen.detailed_description)

In [241]:
desc_gen[(desc_gen.genres == "Action")]

Unnamed: 0,steam_appid,detailed_description,genres
0,10,Play the world's number 1 online action game. ...,Action
1,20,One of the most popular online action games of...,Action
2,30,Enlist in an intense brand of Axis vs. Allied ...,Action
3,40,Enjoy fast-paced multiplayer gaming with Death...,Action
4,50,Return to the Black Mesa Research Facility as ...,Action
...,...,...,...
26656,1046490,This is a simple multiplayer shooter in which ...,Action
26670,1047780,The planet Tserberus is inhabited by reasonabl...,Action
26738,1053060,"<img src=""https://steamcdn-a.akamaihd.net/stea...",Action
26741,1053190,Lover Bands is a 2-player cooperative platform...,Action


In [256]:
desc_gen.loc[desc_gen[desc_gen.genres == "Action"].index, "genres"][0]

'Action'

In [None]:
desc_gen.iloc[5].steam_appid

60

In [280]:
steam       = pd.read_csv(r"C:\Users\elaaf\Desktop\SDS\project_4_data\steam.csv")
steam.columns

Index(['appid', 'name', 'release_date', 'english', 'developer', 'publisher',
       'platforms', 'required_age', 'categories', 'genres', 'steamspy_tags',
       'achievements', 'positive_ratings', 'negative_ratings',
       'average_playtime', 'median_playtime', 'owners', 'price'],
      dtype='object')

In [284]:
steam.groupby(["name"])["appid"].count().reset_index().to_csv("data.csv")