In [157]:
import pandas as pd
import numpy as np

from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity

#### Note: purchase_or_hours always takes the value "1" when the game was purchased and the number of hours played otherwise.

In [158]:
df = pd. read_csv(
    "data/steam-200k.csv",
    header=None,
    names=["userid", "game", "action", "hours_played", "useless"]
)[["userid", "game", "action", "hours_played"]]
df.head()

Unnamed: 0,userid,game,action,hours_played
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0


### Missing Values

In [159]:
df.isnull().sum()

userid          0
game            0
action          0
hours_played    0
dtype: int64

#### We only keep values for which "action" == "play" because we score games proportionally to the number of hours played

Here, we need to put a new index because it gets weird after the slicing

In [160]:
df_play = df[df["action"] == "play"]
df_play["new_index"] = [*range(0, len(df_play))]
df_play = df_play.set_index("new_index").drop(columns="action")

df_play.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_play["new_index"] = [*range(0, len(df_play))]


Unnamed: 0_level_0,userid,game,hours_played
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,151603712,The Elder Scrolls V Skyrim,273.0
1,151603712,Fallout 4,87.0
2,151603712,Spore,14.9
3,151603712,Fallout New Vegas,12.1
4,151603712,Left 4 Dead 2,8.9


### Appreciation

In this dataset, we do not have an explicit feature translating the rate of appreciation of a game by a user.  Therefore, we consider that the user liked the game if he played at least 20 hours.  
God knows there are video games we hated but still played for more than 20 hours, which justifies a higher threshold. However, that would unvoluntarily discard small video games (which take few hours to complete).

Quoique je me dis ptet que c'est mieux :    
- 3 if plays more than 50 hours (even if it's a small game, if it's really good then it can be played multiple times)
- 2 if between 20 and 50 hours
- 1 if less than 20 hours
- 0 if less than 2 hours (it's shit)

on peut réduire le nombre de classes si problématique

In [161]:
df_play["score"] = [
    3 if df_play["hours_played"][i] >= 50 else
    2 if df_play["hours_played"][i] >= 20 and df_play["hours_played"][i] < 50 else
    1 if df_play["hours_played"][i] > 2 and df_play["hours_played"][i] < 20 else
    0 for i in range(len(df_play))
]

In [162]:
df_play.head(5)

Unnamed: 0_level_0,userid,game,hours_played,score
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,151603712,The Elder Scrolls V Skyrim,273.0,3
1,151603712,Fallout 4,87.0,3
2,151603712,Spore,14.9,1
3,151603712,Fallout New Vegas,12.1,1
4,151603712,Left 4 Dead 2,8.9,1


### Interaction Matrix

In [163]:
def create_interaction_matrix(df, user_col, item_col, rating_col, norm=False, threshold=None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

In [164]:
interaction_matrix = create_interaction_matrix(df_play, "userid", "game", "score")
interaction_matrix.head(5)

game,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,the static speaks my name,theHunter,theHunter Primal
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### User Dictionnary

In [165]:
def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing interaction_index as key and user_id as value
    '''
    user_id = list(interactions.index)
    user_dict = {}
    counter = 0 
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict

In [166]:
user_dict = create_user_dict(interaction_matrix)
list(user_dict.items())[:5]

[(5250, 0), (76767, 1), (86540, 2), (144736, 3), (181212, 4)]

### Item Dictionnary

In [170]:
def create_item_dict(df):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_name = list(df.columns)
    item_dict = {}
    counter = 0 
    for i in item_name:
        item_dict[i] = counter
        counter += 1
    return item_dict

In [217]:
item_dict = create_item_dict(interaction_matrix)
list(item_dict.items())[:5]

[('007 Legends', 0),
 ('0RBITALIS', 1),
 ('1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)', 2),
 ('10 Second Ninja', 3),
 ('10,000,000', 4)]

## Model

In [176]:
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
    Expected Output  -
        Model - Trained model
    '''
    x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k)
    model.fit(x,epochs=epoch,num_threads = n_jobs)
    return model

In [177]:
model = runMF(interaction_matrix)

### User Recommender

In [215]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
    Expected Output - 
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index) \
                       .sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    #known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    #scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("Known Likes:")
        counter = 1
        for i in known_items[:5]:
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores[:5]:
            print(str(counter) + '- ' + i)
            counter+=1
            
    #return return_score_list

In [216]:
test_id = df_play["userid"][0]

sample_recommendation_user(
    model,
    interaction_matrix,
    test_id,
    user_dict,
    item_dict,
    threshold=0,
    nrec_items=5,
    show=True
)

Known Likes:
1- Tomb Raider
2- The Elder Scrolls V Skyrim
3- Team Fortress 2
4- Spore
5- Poly Bridge

 Recommended Items:
1- Terraria
2- Borderlands 2
3- Counter-Strike Global Offensive
4- Sid Meier's Civilization V
5- Dota 2
