# Imports

In [1]:
# Install nltk if it is not already installed
# !pip install nltk

In [2]:
import pickle
import re
import string
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel, paired_cosine_distances, cosine_similarity, linear_kernel

# read datasets

In [4]:
# Directory holding the Steam CSV exports. Edit this single line to run the
# notebook on another machine instead of touching every read_csv call.
DATA_DIR = Path(r"C:\Users\elaaf\Desktop\SDS\project_4_data")

# Per-game metadata (ratings, genres, price, ...), one row per appid.
game_steam_df = pd.read_csv(DATA_DIR / "steam.csv")
# Free-text game descriptions; its key column is named "steam_appid".
steam_description_df = pd.read_csv(DATA_DIR / "steam_description_data.csv")

# Preprocessing 

### 1)  Data cleaning

In [7]:
# Collapse the two raw vote counters into a total and a positive share.
# percent_positive_ratings is a fraction in [0, 1], not a 0-100 percentage.
# NOTE: the raw counters are removed in place, so this cell can only be run
# once per load of game_steam_df.
_positive_votes = game_steam_df.pop('positive_ratings')
_negative_votes = game_steam_df.pop('negative_ratings')
game_steam_df['total_ratings'] = _positive_votes + _negative_votes
game_steam_df['percent_positive_ratings'] = _positive_votes / game_steam_df['total_ratings']
game_steam_df.head(3)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,average_playtime,median_playtime,owners,price,total_ratings,percent_positive_ratings
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,17612,317,10000000-20000000,7.19,127873,0.973888
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,277,62,5000000-10000000,3.99,3951,0.839787
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,187,34,5000000-10000000,3.99,3814,0.895648


In [8]:
# The description table names its key "steam_appid"; align it with the
# "appid" column of the metadata table so the two can be joined.
# (index=str additionally converts the row labels to strings.)
new_data = steam_description_df.rename(columns={"steam_appid": "appid"}, index=str)
# Inner join (the pandas default): only appids present in both tables survive.
merge_dataset = pd.merge(game_steam_df, new_data, on="appid")

In [9]:
# Drop the columns that are not used anywhere further down in this notebook.
# DataFrame.drop returns a NEW frame, so the result has to be assigned back;
# before, it was only displayed and merge_dataset silently kept every column.
# errors="ignore" keeps the cell safe to re-run once the columns are gone.
unused_columns = ["release_date", "english", "developer", "publisher", "platforms", "required_age",
                  "achievements", "owners", "about_the_game", "short_description"]
merge_dataset = merge_dataset.drop(columns=unused_columns, errors="ignore")
merge_dataset

Unnamed: 0,appid,name,categories,genres,steamspy_tags,average_playtime,median_playtime,price,total_ratings,percent_positive_ratings,detailed_description
0,10,Counter-Strike,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,17612,317,7.19,127873,0.973888,Play the world's number 1 online action game. ...
1,20,Team Fortress Classic,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,277,62,3.99,3951,0.839787,One of the most popular online action games of...
2,30,Day of Defeat,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,187,34,3.99,3814,0.895648,Enlist in an intense brand of Axis vs. Allied ...
3,40,Deathmatch Classic,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,258,184,3.99,1540,0.826623,Enjoy fast-paced multiplayer gaming with Death...
4,50,Half-Life: Opposing Force,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,624,415,3.99,5538,0.947996,Return to the Black Mesa Research Facility as ...
...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,Room of Pandora,Single-player;Steam Achievements,Adventure;Casual;Indie,Adventure;Indie;Casual,0,0,2.09,3,1.000000,"<img src=""https://steamcdn-a.akamaihd.net/stea..."
27071,1065570,Cyber Gun,Single-player,Action;Adventure;Indie,Action;Indie;Adventure,0,0,1.69,9,0.888889,Have you ever been so lonely that no one but y...
27072,1065650,Super Star Blast,Single-player;Multi-player;Co-op;Shared/Split ...,Action;Casual;Indie,Action;Indie;Casual,0,0,3.99,1,0.000000,<strong>Super Star Blast </strong>is a space b...
27073,1066700,New Yankee 7: Deer Hunters,Single-player;Steam Cloud,Adventure;Casual;Indie,Indie;Casual;Adventure,0,0,5.19,2,1.000000,Pursue a snow-white deer through an enchanted ...


In [12]:
# Create a custom function to remove the html tags from the descriptions
def remove_html_tags(text):
    """Strip HTML markup from ``text``.

    Every shortest ``<...>`` span is deleted outright (nothing is inserted in
    its place); the text between tags is returned untouched. A tag that
    contains a newline is not matched, because ``.`` does not cross lines.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The character class is built from ``string.punctuation`` and escaped so
    that regex metacharacters are matched literally.
    """
    escaped_chars = re.escape(string.punctuation)
    punctuation_class = '[%s]' % escaped_chars
    return re.sub(punctuation_class, '', text)

def lower_case(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_num(text):
    """Delete every word-like token that contains at least one digit.

    A token is a maximal run of ``\\w`` characters; if it includes a digit the
    whole run is removed (so ``"d4f"`` disappears, not just the ``4``).
    Surrounding whitespace is left in place.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without the digit-bearing tokens.
    """
    # Raw string literal: in a normal literal '\w' and '\d' are invalid escape
    # sequences, which newer Python versions warn about and plan to reject.
    return re.sub(r'\w*\d\w*', '', text)

In [13]:
stemmer = nltk.stem.PorterStemmer()
nltk.download('stopwords')
# Reach the corpus through the package rather than the imported name: the
# assignment below rebinds `stopwords` from the corpus reader to a plain list,
# so `stopwords.words(...)` would fail the second time this cell is run.
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for constant-time membership tests inside the tokenizer. The list
# itself is kept because it is also passed to TfidfVectorizer(stop_words=...).
_stopword_set = frozenset(stopwords)
# Translation table that deletes every ASCII punctuation character.
_punctuation_table = str.maketrans('', '', string.punctuation)


def my_tokenizer(sentence):
    """Turn one raw description into a list of stemmed tokens.

    Steps, in order: strip HTML tags, delete ASCII punctuation, lowercase,
    split on single spaces, drop empty strings and English stop words, then
    Porter-stem what is left.

    Parameters
    ----------
    sentence : str
        Raw description text (may contain HTML).

    Returns
    -------
    list of str
        Stemmed tokens in their original order.

    Notes
    -----
    The split is on the space character only, so words separated by a newline
    or tab stay glued together as one token.
    NOTE(review): stop words that contain an apostrophe (e.g. "don't") can
    never match here because punctuation is stripped before the lookup.
    """
    # Remove HTML tags with the custom helper, then punctuation and case in a
    # single pass (previously .lower() was re-run once per punctuation char).
    text = remove_html_tags(sentence).translate(_punctuation_table).lower()

    # Keep non-empty, non-stop-word tokens and stem them.
    return [stemmer.stem(word)
            for word in text.split(' ')
            if word != '' and word not in _stopword_set]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elaaf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
#merge_dataset["detailed_description"] = merge_dataset["detailed_description"].apply(my_tokenizer)
# merge_dataset["detailed_description"] = merge_dataset["detailed_description"].apply(lower_case)
# merge_dataset["detailed_description"] = merge_dataset["detailed_description"].apply(remove_num)

# Apply TFIDF

In [19]:
# Vocabulary pruning thresholds for the TF-IDF model.
# min_df: a term must occur in at least this many descriptions to be kept.
minimum_descr_count = 5
# max_df: a term occurring in more than this fraction of descriptions is dropped.
maximum_descr_perc = 0.90

tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    stop_words=stopwords,
    min_df=minimum_descr_count,
    max_df=maximum_descr_perc,
)

In [20]:
# Fit the vectorizer on every detailed description and encode each one as a TF-IDF row.
tfidf_matrix = tfidf.fit_transform(merge_dataset.detailed_description)



In [22]:
# (number of descriptions, number of vocabulary terms kept by the vectorizer)
tfidf_matrix.shape

(27075, 18715)

# Applying similarity matrix

### Recommender system functions

In [23]:
# Map each game name to its row label in merge_dataset so that a game can be
# looked up by entering its name.
indexs = pd.Series(merge_dataset.index, index=merge_dataset.name)
# Series.drop_duplicates() compares VALUES (the row labels, which are already
# unique), so it never removed repeated names. De-duplicate on the index
# instead, keeping the first row of each name, so that indexs[name] is always
# a single label rather than a Series.
indexs = indexs[~indexs.index.duplicated(keep='first')]

In [24]:
# Name-based recommender that ranks purely by similarity score (no vote/rating filter).
def recomend_game(name, model, num):
    """Return the names of the ``num`` games most similar to ``name``.

    Parameters
    ----------
    name : str
        Game title, looked up in the module-level ``indexs`` Series.
    model : numpy.ndarray or scipy sparse matrix
        Pairwise similarity scores. Row ``indexs[name]`` is read and its
        entries are matched positionally against ``merge_dataset`` rows.
        # assumes a square (n_games, n_games) matrix aligned with
        # merge_dataset's row order -- TODO confirm for new models
    num : int
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of ``merge_dataset.name`` for the recommended games, most
        similar first. The queried game itself is never included.

    Raises
    ------
    KeyError
        If ``name`` is not present in ``indexs``.
    """
    indx = indexs[name]
    # A repeated title makes the lookup return several rows; use the first.
    if isinstance(indx, pd.Series):
        indx = indx.iloc[0]

    row = model[indx]
    if hasattr(row, "toarray"):  # scipy sparse row -> dense
        row = row.toarray()
    scores = np.asarray(row, dtype=float).ravel()  # similarity of every game to the query

    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Remove the query game by position. Simply skipping rank 0 assumed the
    # query always sorts first; on a tie that returned the query itself and
    # discarded a genuine match.
    ranked = ranked[ranked != indx][:num]

    return merge_dataset.name.iloc[ranked]  # names of the selected rows
    
    

In [25]:
def content_recommender(name, games, similarities, vote_threshold=1000, rating_threshold=0.7, top_n=10):
    """Rank well-rated games by similarity to ``name``.

    Parameters
    ----------
    name : str
        Title of the reference game (matched exactly against ``games['name']``).
        If the title occurs more than once, the first matching row is used.
    games : pandas.DataFrame
        Must provide the columns ``appid``, ``name``, ``total_ratings`` and
        ``percent_positive_ratings``.
        # assumes the row labels of `games` are also valid row positions in
        # `similarities` (true for a default RangeIndex) -- TODO confirm
    similarities : scipy sparse matrix or numpy.ndarray
        Pairwise similarity scores, one row per game.
    vote_threshold : int, default 1000
        Keep only games with strictly more ratings than this.
    rating_threshold : float, default 0.7
        Keep only games whose positive-rating share is strictly above this.
    top_n : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``appid``, ``game``, ``similarity``, ``diversity``
        (``1 - similarity``), ``vote_count`` and ``percent_positive_ratings``,
        sorted by descending similarity. The reference game is not excluded,
        so it appears in the result when it passes both thresholds.

    Raises
    ------
    ValueError
        If no row of ``games`` has the given ``name``.
    """
    # Get the game by the title
    matches = games[games['name'] == name].index
    if len(matches) == 0:
        raise ValueError("No game named {!r} found in `games`".format(name))
    game_index = matches[0]

    # Extract the similarity row once; densify it when it is a sparse row.
    row = similarities[game_index]
    if hasattr(row, "toarray"):
        row = row.toarray()
    similarity = np.asarray(row).ravel()

    # Create a dataframe with the game id, name, and rating information with similarity
    sim_df = pd.DataFrame(
        {'appid': games['appid'],
         'game': games['name'],
         'similarity': similarity,
         'diversity': 1 - similarity,
         'vote_count': games['total_ratings'],
         'percent_positive_ratings': games['percent_positive_ratings']
        })

    # Keep the games that satisfy both thresholds, most similar first
    passes_thresholds = ((sim_df['vote_count'] > vote_threshold) &
                         (sim_df['percent_positive_ratings'] > rating_threshold))
    top_games = sim_df[passes_thresholds].sort_values(by='similarity', ascending=False).head(top_n)

    return top_games

In [26]:
def content_recommender2(name, games, similarities, vote_threshold=1000, rating_threshold=0.7, top_n=10):
    """Rank well-rated games by similarity to ``name``.

    Parameters
    ----------
    name : str
        Title of the reference game (matched exactly against ``games['name']``).
        If the title occurs more than once, the first matching row is used.
    games : pandas.DataFrame
        Must provide the columns ``appid``, ``name``, ``total_ratings`` and
        ``percent_positive_ratings``.
        # assumes the row labels of `games` are also valid row positions in
        # `similarities` (true for a default RangeIndex) -- TODO confirm
    similarities : numpy.ndarray or scipy sparse matrix
        Pairwise similarity scores, one row per game.
    vote_threshold : int, default 1000
        Keep only games with strictly more ratings than this.
    rating_threshold : float, default 0.7
        Keep only games whose positive-rating share is strictly above this.
    top_n : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``appid``, ``game``, ``similarity``, ``diversity``
        (``1 - similarity``), ``vote_count`` and ``percent_positive_ratings``,
        sorted by descending similarity. The reference game is not excluded,
        so it appears in the result when it passes both thresholds.

    Raises
    ------
    ValueError
        If no row of ``games`` has the given ``name``.
    """
    # Get the game by the title
    matches = games[games['name'] == name].index
    if len(matches) == 0:
        raise ValueError("No game named {!r} found in `games`".format(name))
    game_index = matches[0]

    # Extract the similarity row once; densify it when it is a sparse row.
    row = similarities[game_index]
    if hasattr(row, "toarray"):
        row = row.toarray()
    similarity = np.asarray(row).ravel()

    # Create a dataframe with the game id, name, and rating information with similarity
    sim_df = pd.DataFrame(
        {'appid': games['appid'],
         'game': games['name'],
         'similarity': similarity,
         'diversity': 1 - similarity,
         'vote_count': games['total_ratings'],
         'percent_positive_ratings': games['percent_positive_ratings']
        })

    # Keep the games that satisfy both thresholds, most similar first
    passes_thresholds = ((sim_df['vote_count'] > vote_threshold) &
                         (sim_df['percent_positive_ratings'] > rating_threshold))
    top_games = sim_df[passes_thresholds].sort_values(by='similarity', ascending=False).head(top_n)

    return top_games

### 1) sigmoid_kernel

In [27]:
# Content-based similarity, attempt 1: score every pair of descriptions with a sigmoid kernel on their TF-IDF rows.
sig = sigmoid_kernel(tfidf_matrix,tfidf_matrix)
# Slow on the full dataset, but it completes.


In [28]:
# Ten games ranked closest to PAYDAY 2 by sigmoid-kernel score (no vote or rating filter).
recomend_game("PAYDAY 2",sig,10)

6655             Let's Sing 2016
22256                 Idle Heist
21486                Dreadnought
3575              The Masterplan
4438                     rFactor
26886               Grand Battle
1982               7 Days to Die
1357                   Sanctum 2
12648    DRAGON QUEST HEROES™ II
8928                     Colosse
Name: name, dtype: object

In [29]:
# Up to 10 games most similar to PAYDAY 2 among those with >1000 ratings and >80% positive.
similar_games = content_recommender2('PAYDAY 2', merge_dataset, sig, 
                                    rating_threshold=0.80)
# Show the 5 most similar of those, re-ordered by positive-rating share for display.
similar_games.head(5).sort_values(by='percent_positive_ratings',ascending=False)
# diversity = 1 - similarity, so lower diversity means a closer match.
# NOTE: the sigmoid scores in the output are almost identical (~0.7616), so this ranking rests on tiny differences.

Unnamed: 0,appid,game,similarity,diversity,vote_count,percent_positive_ratings
1357,210770,Sanctum 2,0.761599,0.238401,12902,0.901333
8762,460810,Vanquish,0.761598,0.238402,2687,0.878303
4382,338170,Ratz Instagib,0.761598,0.238402,2505,0.854291
1467,218620,PAYDAY 2,0.761617,0.238383,365180,0.845219
10844,524220,NieR:Automata™,0.761598,0.238402,48801,0.807094


### 2) cosine_similarity

In [129]:
# Attempt 2: pairwise cosine similarity, returned as a sparse matrix (dense_output=False).
similarities = cosine_similarity(tfidf_matrix, dense_output=False)
# Takes a long time on the full dataset.

In [130]:
# Inspect the result: its repr reports the shape, dtype and number of stored entries.
similarities

<27075x27075 sparse matrix of type '<class 'numpy.float64'>'
	with 696023852 stored elements in Compressed Sparse Row format>

In [141]:
# Same query as for the sigmoid kernel, using content_recommender because `similarities` is sparse.
similar_games = content_recommender('PAYDAY 2', merge_dataset, similarities, 
                                    rating_threshold=0.80)
# Show the 5 most similar matches, re-ordered by positive-rating share for display.
similar_games.head(5).sort_values(by='percent_positive_ratings',ascending=False)

Unnamed: 0,appid,game,similarity,diversity,vote_count,percent_positive_ratings
1357,210770,Sanctum 2,0.197968,0.8020316,12902,0.901333
8762,460810,Vanquish,0.177617,0.822383,2687,0.878303
4382,338170,Ratz Instagib,0.179991,0.8200088,2505,0.854291
1467,218620,PAYDAY 2,1.0,-2.220446e-16,365180,0.845219
10844,524220,NieR:Automata™,0.18683,0.8131701,48801,0.807094


### 3) cosine_similarity with norm

In [123]:
# Attempt 3: the TF-IDF matrix is float64 by default; cast it to float32 to save memory at the cost of some precision.
# NOTE: despite its name, normalized_df is only a dtype cast -- no normalisation is applied in this cell.
normalized_df = tfidf_matrix.astype(np.float32)
cosine_sim = cosine_similarity(normalized_df, normalized_df)

In [124]:
# Same query against the float32 cosine scores (cosine_sim is indexed as a dense array, hence content_recommender2).
similar_games = content_recommender2('PAYDAY 2', merge_dataset, cosine_sim, 
                                    rating_threshold=0.80)
# Show the 5 most similar matches, re-ordered by positive-rating share for display.
similar_games.head(5).sort_values(by='percent_positive_ratings',ascending=False)

Unnamed: 0,appid,game,similarity,diversity,vote_count,percent_positive_ratings
1357,210770,Sanctum 2,0.197968,0.802032,12902,0.901333
8762,460810,Vanquish,0.177617,0.822383,2687,0.878303
4382,338170,Ratz Instagib,0.179991,0.820009,2505,0.854291
1467,218620,PAYDAY 2,1.0,0.0,365180,0.845219
10844,524220,NieR:Automata™,0.18683,0.81317,48801,0.807094


### 4) linear_kernel

In [61]:
# Attempt 4: plain dot product between TF-IDF rows.
# TfidfVectorizer L2-normalises rows by default (norm was not overridden when tfidf was built),
# so these scores coincide with cosine similarity -- the result tables for attempts 2-4 show the same values.
lk = linear_kernel(tfidf_matrix, tfidf_matrix)

In [120]:
# Ten games ranked closest to PAYDAY 2 by linear-kernel score (no vote or rating filter).
recomend_game("PAYDAY 2",lk,10)

6655             Let's Sing 2016
22256                 Idle Heist
21486                Dreadnought
3575              The Masterplan
4438                     rFactor
26886               Grand Battle
1982               7 Days to Die
1357                   Sanctum 2
12648    DRAGON QUEST HEROES™ II
8928                     Colosse
Name: name, dtype: object

In [122]:
# Same thresholded query as before, now scored with the linear kernel.
similar_games = content_recommender2('PAYDAY 2', merge_dataset, lk, 
                                    rating_threshold=0.80)
# Show the 5 most similar matches, re-ordered by positive-rating share for display.
similar_games.head(5).sort_values(by='percent_positive_ratings',ascending=False)

Unnamed: 0,appid,game,similarity,diversity,vote_count,percent_positive_ratings
1357,210770,Sanctum 2,0.197968,0.802032,12902,0.901333
8762,460810,Vanquish,0.177617,0.822383,2687,0.878303
4382,338170,Ratz Instagib,0.179991,0.820009,2505,0.854291
1467,218620,PAYDAY 2,1.0,0.0,365180,0.845219
10844,524220,NieR:Automata™,0.18683,0.81317,48801,0.807094


# Pickle

In [56]:
# Persist the TF-IDF matrix. The context manager guarantees the file is
# flushed and closed; a bare open() passed straight to pickle.dump is never
# closed explicitly.
filename = 'tfidf_matrix.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(tfidf_matrix, pickle_file)

In [143]:
# Persist the sparse cosine-similarity matrix. The context manager guarantees
# the file is flushed and closed; a bare open() passed straight to pickle.dump
# is never closed explicitly.
filename = 'cosine_similarity_model.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(similarities, pickle_file)

In [30]:
# Persist the sigmoid-kernel matrix. The context manager guarantees the file
# is flushed and closed; a bare open() passed straight to pickle.dump is never
# closed explicitly.
filename = 'sigmoid.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(sig, pickle_file)