Exercise where we obtain cosine of similarity (user), classification (game) from the hours played:

In [1]:
# Import libraries
import csv
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Open file IT WAS DONE WITH 200,000 DATA, NOT WITH THE STEAM ORIGINALS
df = pd.read_csv('steam-200k.csv', header=None).drop(4,axis=1)

In [3]:
# Order columns
df.columns = ['user', 'game', 'action', 'hours']

In [4]:
# Statistical description
df.describe()

Unnamed: 0,user,hours
count,200000.0,200000.0
mean,103655900.0,17.874384
std,72080740.0,138.056952
min,5250.0,0.1
25%,47384200.0,1.0
50%,86912010.0,1.0
75%,154230900.0,1.3
max,309903100.0,11754.0


We have 200.000 users (counted), with an average playing time of 17,87 hours (138 standard deviation).
The minimum being played is 0,1 hour and a maximum of 11,754 hours. 75% play up to 1,3 hours.

In [5]:
# Configuration of our matrix without NaN, but 0
matrix = df.pivot_table(columns="game", index="user", values="hours", fill_value=0)

In [6]:
# Game containing string 3
df[df['game'].str.contains('3')].head(5)

Unnamed: 0,user,game,action,hours
30,151603712,Fallout 3 - Game of the Year Edition,purchase,1.0
31,151603712,Fallout 3 - Game of the Year Edition,play,0.8
130,59945701,Serious Sam 3 BFE,purchase,1.0
147,53875128,Far Cry 3,purchase,1.0
148,53875128,Far Cry 3,play,35.0


In [7]:
# Average hours of play per game in descending order
df.groupby("game")["hours"].mean().sort_values(ascending=False).head()

game
Eastside Hockey Manager    648.000000
FIFA Manager 09            206.000000
Perpetuum                  200.987500
Football Manager 2012      194.501887
Football Manager 2014      194.017722
Name: hours, dtype: float64

In [8]:
# Game with user and hours played
df.loc[df["game"] == "Eastside Hockey Manager"]

Unnamed: 0,user,game,action,hours
194334,213854339,Eastside Hockey Manager,purchase,1.0
194335,213854339,Eastside Hockey Manager,play,1295.0


In [9]:
# Sum in descending order of hours played with game
df.groupby("game")["hours"].sum().sort_values(ascending=False).head()

game
Dota 2                             986525.6
Counter-Strike Global Offensive    324183.6
Team Fortress 2                    175996.3
Counter-Strike                     135117.1
Sid Meier's Civilization V         100417.3
Name: hours, dtype: float64

In [10]:
# Hours played (mean)
mean_hours_played = df['hours'].mean()
print(mean_hours_played)

17.874384000000003


In [11]:
# Show matrix
matrix

game,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,samurai_jazz,the static speaks my name,theHunter,theHunter Primal
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
76767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
86540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
103360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
144736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309554670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
309626088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
309812026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
309824202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0


In [12]:
# Array without 0
matrix_without_0 = matrix[(matrix != 0).sum(axis=1) >= 5]

In [13]:
# Dataframe shape
matrix_without_0.shape

(3757, 5155)

Here the number of rows drops, from 12393 to 3757 (more than 5 rows without 0 are discriminated against).

In [14]:
# Assuming `matrix_without_0` is a matrix where each row represents a game
games = matrix_without_0

# Calculate the cosine similarity between all games
similarities = cosine_similarity(games, games)

# Define the minimum cosine similarity to consider a pair of games similar
min_similarity = 0.5

# Find the pairs of games with similar cosine similarity
similar_game_pairs = []
for i in range(games.shape[0]):
    for j in range(i+1, games.shape[0]):
        if similarities[i, j] >= min_similarity:
            similar_game_pairs.append((i, j)) 

In [32]:
# Convert to dataframe
df = pd.DataFrame(similarities)

In [33]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3747,3748,3749,3750,3751,3752,3753,3754,3755,3756
0,1.000000,0.012417,0.009056,0.031806,0.035126,0.043114,0.004779,0.022992,0.040321,0.031382,...,0.018279,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.006171,0.0,0.002435
1,0.012417,1.000000,0.011290,0.224758,0.149078,0.292772,0.342257,0.005359,0.202945,0.386890,...,0.041622,0.008814,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,0.009056,0.011290,1.000000,0.036889,0.040740,0.038847,0.004570,0.107112,0.035074,0.036398,...,0.019891,0.006791,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,0.031806,0.224758,0.036889,1.000000,0.883509,0.752826,0.340736,0.004043,0.845154,0.964764,...,0.222937,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.035126,0.149078,0.040740,0.883509,1.000000,0.814893,0.067367,0.004525,0.746702,0.831338,...,0.205795,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3752,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000818,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.060021,1.000000,0.000000,0.171254,0.0,0.207396
3753,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.119666
3754,0.006171,0.000000,0.000000,0.000000,0.000000,0.000000,0.001353,0.171300,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.035950,0.171254,0.000000,1.000000,0.0,0.043655
3755,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031713,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.0,0.000000


In [34]:
# Recommend the next 10 games, similar, if user number 3 is chosen
input_user = 3
recommendations = pd.DataFrame(df.nlargest(11,input_user)[3])
recommendations = recommendations[recommendations[3]!=input_user]
print(recommendations)

            3
3    1.000000
38   1.000000
45   1.000000
48   1.000000
61   1.000000
125  0.997054
86   0.996204
9    0.964764
14   0.894427
20   0.894427
29   0.894427


In [18]:
# Like an array, Cosine similarity
similarities = cosine_similarity(matrix_without_0, matrix_without_0)
similarities

array([[1.        , 0.01241667, 0.00905632, ..., 0.00617114, 0.        ,
        0.00243529],
       [0.01241667, 1.        , 0.01129041, ..., 0.        , 0.        ,
        0.        ],
       [0.00905632, 0.01129041, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00617114, 0.        , 0.        , ..., 1.        , 0.        ,
        0.04365482],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.00243529, 0.        , 0.        , ..., 0.04365482, 0.        ,
        1.        ]])

In [19]:
# Convert to DF
df = pd.DataFrame(similarities, columns=matrix_without_0.index, index=matrix_without_0.index)

In [20]:
df

user,5250,76767,86540,103360,144736,181212,229911,298950,299153,381543,...,303007171,303129589,303442756,303467308,303525289,304081461,304971849,306547522,306971738,309404240
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,1.000000,0.012417,0.009056,0.031806,0.035126,0.043114,0.004779,0.022992,0.040321,0.031382,...,0.018279,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.006171,0.0,0.002435
76767,0.012417,1.000000,0.011290,0.224758,0.149078,0.292772,0.342257,0.005359,0.202945,0.386890,...,0.041622,0.008814,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
86540,0.009056,0.011290,1.000000,0.036889,0.040740,0.038847,0.004570,0.107112,0.035074,0.036398,...,0.019891,0.006791,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
103360,0.031806,0.224758,0.036889,1.000000,0.883509,0.752826,0.340736,0.004043,0.845154,0.964764,...,0.222937,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
144736,0.035126,0.149078,0.040740,0.883509,1.000000,0.814893,0.067367,0.004525,0.746702,0.831338,...,0.205795,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304081461,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000818,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.060021,1.000000,0.000000,0.171254,0.0,0.207396
304971849,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.119666
306547522,0.006171,0.000000,0.000000,0.000000,0.000000,0.000000,0.001353,0.171300,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.035950,0.171254,0.000000,1.000000,0.0,0.043655
306971738,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031713,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.0,0.000000


In [29]:
# Recomendar los próximos 10 juegos, similares, por el usuario 309404240
input_user = 309404240
recommendations = pd.DataFrame(df.nlargest(11,input_user)[309404240])
recommendations = recommendations[recommendations[309404240]!=input_user]
print(recommendations)

           309404240
user                
309404240   1.000000
271073348   0.953213
238773402   0.946235
247443969   0.946202
232047954   0.941307
255007865   0.940020
156619742   0.935648
228649481   0.935461
201816765   0.935104
190016182   0.923654
182726567   0.919459


In [22]:
# Next we centered our values. 
# This can be useful for certain machine learning algorithms that require data to be normalized before it can be processed.
def center(row):
    new_row = (row - row.mean()) / (row.max() - row.min())
    return new_row
matrix_std = matrix.apply(center)

In [23]:
matrix_std

game,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,samurai_jazz,the static speaks my name,theHunter,theHunter Primal
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
76767,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
86540,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
103360,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
144736,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309554670,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
309626088,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
309812026,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101
309824202,-0.000081,-0.000212,-0.000192,-0.0002,-0.000081,-0.0002,-0.000125,-0.000356,-0.000259,-0.000179,...,-0.000081,-0.000385,-0.00013,-0.000477,-0.000081,-0.002124,-0.000081,-0.000807,-0.000573,-0.000101


In [24]:
# Create correlation matrix
correlation_matrix = matrix_without_0.corr()

In [25]:
correlation_matrix

game,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,samurai_jazz,the static speaks my name,theHunter,theHunter Primal
game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,1.000000,-0.000459,-0.000532,-0.000550,-0.000266,-0.000487,-0.000362,-0.000712,-0.000637,-0.000507,...,-0.000266,-0.000586,-0.000366,-0.000873,-0.000266,-0.002358,-0.000266,-0.000934,-0.001527,-0.000328
0RBITALIS,-0.000459,1.000000,-0.000917,0.265920,-0.000459,-0.000840,-0.000623,-0.001227,-0.001099,-0.000873,...,-0.000459,-0.001009,-0.000631,-0.001504,-0.000459,-0.004064,-0.000459,-0.001609,0.007158,-0.000565
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),-0.000532,-0.000917,1.000000,-0.001100,-0.000532,-0.000974,-0.000723,-0.001423,-0.001274,-0.001013,...,-0.000532,-0.001171,-0.000731,0.071208,-0.000532,0.085859,-0.000532,0.293103,0.016676,-0.000656
10 Second Ninja,-0.000550,0.265920,-0.001100,1.000000,-0.000550,-0.001007,-0.000748,-0.001472,-0.001318,-0.001047,...,-0.000550,-0.001211,-0.000756,0.011302,-0.000550,-0.004874,-0.000550,-0.001930,0.001615,-0.000678
10000000,-0.000266,-0.000459,-0.000532,-0.000550,1.000000,-0.000487,-0.000362,-0.000712,-0.000637,-0.000507,...,-0.000266,0.467558,-0.000366,-0.000873,-0.000266,-0.002358,-0.000266,-0.000934,-0.001527,-0.000328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sZone-Online,-0.002358,-0.004064,0.085859,-0.004874,-0.002358,-0.004316,-0.003203,-0.000022,-0.005646,-0.004487,...,-0.002358,-0.005187,-0.003240,-0.003023,-0.002358,1.000000,-0.002358,0.052982,0.066803,-0.002905
samurai_jazz,-0.000266,-0.000459,-0.000532,-0.000550,-0.000266,-0.000487,-0.000362,-0.000712,-0.000637,-0.000507,...,-0.000266,-0.000586,-0.000366,-0.000873,-0.000266,-0.002358,1.000000,-0.000934,-0.001527,-0.000328
the static speaks my name,-0.000934,-0.001609,0.293103,-0.001930,-0.000934,-0.001709,-0.001268,-0.002497,-0.002236,-0.001777,...,-0.000934,-0.002054,-0.001283,-0.003061,-0.000934,0.052982,-0.000934,1.000000,0.012357,-0.001150
theHunter,-0.001527,0.007158,0.016676,0.001615,-0.001527,-0.002795,-0.002074,-0.000792,-0.001647,0.002100,...,-0.001527,-0.003359,-0.002098,-0.004374,-0.001527,0.066803,-0.001527,0.012357,1.000000,0.037530


In [26]:
# Convert to DF
df_game = pd.DataFrame(correlation_matrix)

In [28]:
# Recommend next 10 games, similar like "theHunter" game
input_game = "theHunter"
recommendations = pd.DataFrame(df_game.nlargest(11,input_game)["theHunter"])
recommendations = recommendations[recommendations["theHunter"]!=input_game]
print(recommendations)

                                theHunter
game                                     
theHunter                        1.000000
Angry Birds Space                0.879203
Pulut Adventure                  0.879203
WARSHIFT                         0.879203
Legends of Aethereus             0.828203
Angels Fall First                0.790776
Trove Power Pack                 0.620693
Flame Over                       0.505980
Divinity Dragon Commander Beta   0.476204
Incredipede                      0.455587
Dead Rising 3                    0.374316
