<a href="https://colab.research.google.com/github/plaban1981/HACKEREARTH/blob/master/av_lightfm_recommendation_hack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://towardsdatascience.com/solving-business-usecases-by-recommender-system-using-lightfm-4ba7b3ac8e62

In [2]:
## Importing required libraries
import pandas as pd ## For DataFrame operation
import numpy as np ## Numerical python for matrix operations
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler ## Preprocessing function
import pandas_profiling ## For easy profiling of pandas DataFrame
import missingno as msno ## Missing value co-occurance analysis

In [4]:
pip install recsys

Collecting recsys
[?25l  Downloading https://files.pythonhosted.org/packages/e1/d9/cb8bcab9ef4e1ad686b09297c35c685c654ab9f5e8db0295e077ef049db1/recsys-0.0.4.tar.gz (1.4MB)
[K     |████████████████████████████████| 1.4MB 2.8MB/s 
Building wheels for collected packages: recsys
  Building wheel for recsys (setup.py) ... [?25l[?25hdone
  Created wheel for recsys: filename=recsys-0.0.4-cp36-cp36m-linux_x86_64.whl size=773251 sha256=b024f7b4fea72146673803f576fd357f9d806e72096d7820134a7d660d0c38ad
  Stored in directory: /root/.cache/pip/wheels/43/ee/be/aacf038daea92cacaafd8fa4c70084b902cb157cfc45886779
Successfully built recsys
Installing collected packages: recsys
Successfully installed recsys-0.0.4


In [6]:
from recsys import *

In [3]:
####### Data Exploration ############

def print_dim(df):
    '''
    Report the row and column counts of a DataFrame on stdout
    Required Input -
        - df = Pandas DataFrame
    Expected Output -
        - One printed line of the form "Data size: Rows-<rows> Columns-<columns>"
    '''
    n_rows, n_cols = df.shape[0], df.shape[1]
    message = "Data size: Rows-{0} Columns-{1}".format(n_rows, n_cols)
    print(message)


def print_dataunique(df):
    '''
    Function to print unique information for each column in a python dataframe
    Required Input - 
        - df = Pandas DataFrame
    Expected Output -
        - Column position (0-based)
        - Column name
        - Python/NumPy type of the first value in that column (None for an empty frame)
        - Number of unique values in that column
        - Up to 5 unique values from that column
    '''
    for counter, col in enumerate(df.columns):
        series = df[col]
        uniques = series.unique()
        # Positional access: the index is not guaranteed to contain the label 0
        # (e.g. after filtering or with a custom index), so .loc[0, col] is unsafe.
        first_type = type(series.iloc[0]) if len(series) else None
        print(counter, col, first_type, len(uniques), uniques[0:5])
        
def do_data_profiling(df, filename):
    '''
    Build a pandas_profiling report for a DataFrame and write it to disk
    Required Input - 
        - df = Pandas DataFrame
        - filename = Path for output file with a .html extension
    Expected Output -
        - HTML file with data profiling summary written to `filename`
        - Confirmation message printed once the file has been written
    '''
    pandas_profiling.ProfileReport(df).to_file(output_file = filename)
    print("Data profiling done")

def missing_value_analysis(df):
    '''
    Draw the two missingno charts used for missing value analysis
    Required Input - 
        - df = Pandas DataFrame
    Expected Output -
        - Chart of Missing value co-occurance (msno.matrix)
        - Chart of Missing value heatmap (msno.heatmap)
    '''
    # Same order as always: matrix first, then heatmap.
    for draw_chart in (msno.matrix, msno.heatmap):
        draw_chart(df)

####### Basic helper function ############

def join_df(left, right, left_on, right_on=None, method='left'):
    '''
    Function to merge two pandas dataframes (left join unless `method` says otherwise)
    Required Input - 
        - left = Pandas DataFrame 1
        - right = Pandas DataFrame 2
        - left_on = Fields in DataFrame 1 to merge on
        - right_on = Fields in DataFrame 2 to merge with left_on fields of Dataframe 1
                     (defaults to left_on when not given)
        - method = Type of join, passed to DataFrame.merge as `how`
    Expected Output -
        - Merged Pandas dataframe; overlapping column names coming from `right`
          get a "_y" suffix while those from `left` keep their name
    '''
    merge_options = {
        'how': method,
        'left_on': left_on,
        'right_on': left_on if right_on is None else right_on,
        'suffixes': ("", "_y"),
    }
    return left.merge(right, **merge_options)
    
####### Pre-processing ############    

def drop_allsame(df):
    '''
    Remove every column that holds a single distinct value across all rows
    Required Input - 
        - df = Pandas DataFrame
    Expected Output -
        - New Pandas dataframe without the no-variation columns
    '''
    constant_columns = [
        col for col in df.columns
        if len(df.loc[:, col].unique()) == 1
    ]
    return df.drop(constant_columns, axis=1)

def treat_missing_numeric(df,columns,how = 'mean'):
    '''
    Function to treat missing values in numeric columns
    Required Input - 
        - df = Pandas DataFrame (the listed columns are overwritten on this object)
        - columns = List input of all the columns need to be imputed
        - how = valid values are 'mean', 'mode', 'median','ffill', numeric value
    Expected Output -
        - The same Pandas dataframe with imputed missing value in mentioned columns
        - If `how` is not recognised a message is printed and df is returned unchanged
    '''
    # bool is a subclass of int but is not a meaningful fill value here.
    is_number = isinstance(how, (int, float)) and not isinstance(how, bool)

    if how == 'mean':
        for i in columns:
            print("Filling missing values with mean for columns - {0}".format(i))
            df[i] = df[i].fillna(df[i].mean())

    elif how == 'mode':
        for i in columns:
            print("Filling missing values with mode for columns - {0}".format(i))
            modes = df[i].mode()
            # mode() returns a Series; passing it whole to fillna would align on
            # index labels instead of filling every gap, so use its first value.
            # An all-missing column has no mode and is left untouched.
            if len(modes):
                df[i] = df[i].fillna(modes.iloc[0])

    elif how == 'median':
        for i in columns:
            print("Filling missing values with median for columns - {0}".format(i))
            df[i] = df[i].fillna(df[i].median())

    elif how == 'ffill':
        for i in columns:
            print("Filling missing values with forward fill for columns - {0}".format(i))
            df[i] = df[i].ffill()

    elif is_number:
        for i in columns:
            print("Filling missing values with {0} for columns - {1}".format(how,i))
            df[i] = df[i].fillna(how)
    else:
        print("Missing value fill cannot be completed")
    return df

def treat_missing_categorical(df,columns,how = 'mode'):
    '''
    Function to treat missing values in categorical columns
    Required Input - 
        - df = Pandas DataFrame (the listed columns are overwritten on this object)
        - columns = List input of all the columns need to be imputed
        - how = valid values are 'mode', any string or numeric value
                (a numeric value is converted to its string form before filling)
    Expected Output -
        - The same Pandas dataframe with imputed missing value in mentioned columns
        - If `how` is not recognised a message is printed and df is returned unchanged
    '''
    if how == 'mode':
        for i in columns:
            print("Filling missing values with mode for columns - {0}".format(i))
            modes = df[i].mode()
            # An all-missing column has no mode; leave it untouched instead of raising.
            if len(modes):
                df[i] = df[i].fillna(modes.iloc[0])
    elif isinstance(how, str):
        for i in columns:
            print("Filling missing values with {0} for columns - {1}".format(how,i))
            df[i] = df[i].fillna(how)
    elif isinstance(how, (int, float)) and not isinstance(how, bool):
        for i in columns:
            print("Filling missing values with {0} for columns - {1}".format(how,i))
            df[i] = df[i].fillna(str(how))
    else:
        print("Missing value fill cannot be completed")
    return df
    
def min_max_scaler(df,columns):
    '''
    Fit a MinMaxScaler on the selected columns and return their scaled values
    Required Input - 
        - df = Pandas DataFrame
        - columns = List input of all the columns which needs to be min-max scaled
    Expected Output -
        - data = New Pandas DataFrame holding only the scaled columns,
                 with the same index as df
        - scaler = Fitted MinMaxScaler which contains the scaling rules
    '''
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df.loc[:,columns])
    data = pd.DataFrame(scaled_values, index=df.index, columns=columns)
    return data, scaler

def z_scaler(df,columns):
    '''
    Fit a StandardScaler on the selected columns (remove the mean, scale to unit
    variance) and return their standardized values
    Required Input - 
        - df = Pandas DataFrame
        - columns = List input of all the columns which needs to be standardized
    Expected Output -
        - data = New Pandas DataFrame holding only the standardized columns,
                 with the same index as df
        - scaler = Fitted StandardScaler which contains the scaling rules
    '''
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df.loc[:,columns])
    data = pd.DataFrame(standardized_values, index=df.index, columns=columns)
    return data, scaler
    
def label_encoder(df,columns):
    '''
    Label encode the given columns, one LabelEncoder per column
    Required Input - 
        - df = Pandas DataFrame (the listed columns are overwritten on this object)
        - columns = List input of all the columns which needs to be label encoded
    Expected Output -
        - df = Pandas DataFrame with label encoded columns
        - le_dict = Dictionary of all the column and their label encoders
    '''
    le_dict = {}
    for c in columns:
        print("Label encoding column - {0}".format(c))
        # Values are cast to str first, so missing values and numbers are encoded as text.
        text_values = list(df[c].values.astype('str'))
        encoder = LabelEncoder()
        encoder.fit(text_values)
        df[c] = encoder.transform(text_values)
        le_dict[c] = encoder
    return df, le_dict

def one_hot_encoder(df, columns):
    '''
    Replace the given columns by their one-hot (dummy) indicator columns
    Required Input - 
        - df = Pandas DataFrame
        - columns = List input of all the columns which needs to be one-hot encoded
    Expected Output -
        - New Pandas DataFrame where each listed column is dropped and its
          indicator columns (named "<column>_<value>") are appended at the end
    '''
    encoded = df
    for col_name in columns:
        print("One-Hot encoding column - {0}".format(col_name))
        indicator_cols = pd.get_dummies(encoded[col_name], prefix=col_name, drop_first=False)
        encoded = pd.concat([encoded, indicator_cols], axis=1)
    return encoded.drop(columns=columns)

####### Feature Engineering ############
def create_date_features(df,column, date_format = None, more_features = False, time_features = False):
    '''
    Function to extract date features
    Required Input - 
        - df = Pandas DataFrame (modified on this object: `column` is converted
               to datetime and the feature columns are added)
        - column = Name of the column containing the date field
        - date_format = Date parsing format (None lets pandas infer it)
        - more_features = To also extract quarter, day of week and day of year
        - time_features = To extract hour from datetime field
    Expected Output -
        - df = Pandas DataFrame with additional extracted date features:
               <column>_Year, _Month, _Week (ISO week), _Day and, optionally,
               _Quarter, _DayOfWeek, _DayOfYear, _Hour
    '''
    # Plain column assignment replaces the column together with its dtype.
    # `df.loc[:, column] = ...` writes into the existing (object) column on
    # recent pandas, which would leave `.dt` unusable below.
    df[column] = pd.to_datetime(df[column], format = date_format)
    dates = df[column].dt

    df[column+'_Year'] = dates.year
    df[column+'_Month'] = dates.month.astype('uint8')
    # `.dt.week` was removed in pandas 2.0; isocalendar().week is its replacement.
    # Fall back to the old accessor on pandas versions that predate isocalendar().
    if hasattr(dates, 'isocalendar'):
        week = dates.isocalendar().week
    else:
        week = dates.week
    df[column+'_Week'] = week.astype('uint8')
    df[column+'_Day'] = dates.day.astype('uint8')

    if more_features:
        df[column+'_Quarter'] = dates.quarter.astype('uint8')
        df[column+'_DayOfWeek'] = dates.dayofweek.astype('uint8')
        df[column+'_DayOfYear'] = dates.dayofyear

    if time_features:
        df[column+'_Hour'] = dates.hour.astype('uint8')
    return df

def target_encoder(train_df, col_name, target_name, test_df = None, how='mean'):
    '''
    Target-encode a column: aggregate the target per category on the training
    data and left-join that aggregate back as a new feature
    Required Input - 
        - train_df = Training Pandas Dataframe
        - test_df = Testing Pandas Dataframe (optional)
        - col_name = Name of the columns of the source variable
        - target_name = Name of the columns of target variable
        - how = 'mean' default but can also be 'count'
    Expected Output - 
        - train_df = Training dataframe with added encoded feature named
                     "<col_name>_<target_name>_<how>"
        - test_df = Testing dataframe with the same added feature
                    (only returned when test_df is given)
    '''
    encoded_name = col_name+'_'+target_name+'_'+how
    aggregate_data = (
        train_df.groupby(col_name)[target_name]
        .agg([how])
        .reset_index()
        .rename(columns={how: encoded_name})
    )
    # The aggregate always comes from the training data, also for the test frame.
    train_encoded = join_df(train_df,aggregate_data,left_on = col_name)
    if test_df is None:
        return train_encoded
    return train_encoded, join_df(test_df,aggregate_data,left_on = col_name)


In [7]:
# Load the two input tables from fixed Colab paths under /content.
# Per the previews shown below, `user` carries user_id/challenge rows and
# `challenge` carries per-challenge metadata (presumably one row per challenge — TODO confirm).
user = pd.read_csv('/content/train.csv')
challenge = pd.read_csv('/content/challenge_data.csv')

In [8]:
user.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [11]:
# For every row, attach how many rows of `user` (with a non-null user_id) reference the same challenge.
user['times_challenge_attempted'] = user['challenge'].map(user.groupby('challenge')['user_id'].count().to_dict())

In [12]:
user.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge,times_challenge_attempted
0,4576_1,4576,1,CI23714,8385
1,4576_2,4576,2,CI23855,10016
2,4576_3,4576,3,CI24917,8213
3,4576_4,4576,4,CI23663,8025
4,4576_5,4576,5,CI23933,7381


In [14]:
def rating(val):
  '''
  Bucket an attempt count into a 1-5 rating.
  Buckets are 2000 wide: below 2000 -> 1, below 4000 -> 2, below 6000 -> 3,
  below 8000 -> 4, anything else -> 5.
  '''
  upper_bounds = (2000, 4000, 6000, 8000)
  for score, bound in enumerate(upper_bounds, start=1):
    if val < bound:
      return score
  return 5

In [15]:
# Turn the per-challenge attempt count into a 1-5 score using rating().
user['challenge_rating'] = user['times_challenge_attempted'].map(rating)

In [16]:
user.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge,times_challenge_attempted,challenge_rating
0,4576_1,4576,1,CI23714,8385,5
1,4576_2,4576,2,CI23855,10016,5
2,4576_3,4576,3,CI24917,8213,5
3,4576_4,4576,4,CI23663,8025,5
4,4576_5,4576,5,CI23933,7381,4


In [19]:
import recsys

In [23]:
pip install lightfm

Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/e9/8e/5485ac5a8616abe1c673d1e033e2f232b4319ab95424b42499fabff2257f/lightfm-1.15.tar.gz (302kB)
[K     |█                               | 10kB 17.5MB/s eta 0:00:01[K     |██▏                             | 20kB 1.7MB/s eta 0:00:01[K     |███▎                            | 30kB 2.3MB/s eta 0:00:01[K     |████▍                           | 40kB 2.5MB/s eta 0:00:01[K     |█████▍                          | 51kB 2.0MB/s eta 0:00:01[K     |██████▌                         | 61kB 2.3MB/s eta 0:00:01[K     |███████▋                        | 71kB 2.5MB/s eta 0:00:01[K     |████████▊                       | 81kB 2.7MB/s eta 0:00:01[K     |█████████▊                      | 92kB 2.9MB/s eta 0:00:01[K     |██████████▉                     | 102kB 2.8MB/s eta 0:00:01[K     |████████████                    | 112kB 2.8MB/s eta 0:00:01[K     |█████████████                   | 122kB 2.8MB/s eta 0:00:01[K  

In [24]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity

def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe (rows = users, columns = items) holding the summed rating
          per user-item pair, 0 where there was no interaction. With norm = True the
          cells are 1 when the summed rating is strictly above threshold, else 0.
    Raises -
        - ValueError if norm = True but no threshold is given
    '''
    if norm and threshold is None:
        # Fail early with a clear message instead of a TypeError deep inside the comparison.
        raise ValueError("threshold must be provided when norm=True")

    # Repeated (user, item) rows are summed; missing pairs become 0.
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        # Vectorized comparison; equivalent to testing every cell one by one.
        interactions = (interactions > threshold).astype(int)
    return interactions

def create_user_dict(interactions):
    '''
    Map every user of the interaction matrix to its row position
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary with user_id (row label of interactions) as key
                    and the 0-based row position as value
    '''
    return {user_id: position for position, user_id in enumerate(interactions.index)}
    
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
                    (if an item_id is repeated, the last row wins)
    '''
    # Pair the two columns row by row. This does not depend on the index labels,
    # so it also works when df does not have the default 0..n-1 index.
    return dict(zip(df[id_col], df[name_col]))

def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4, random_state=None):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, bpr
        - k = forwarded unchanged to LightFM's `k` argument
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
        - random_state (optional) = seed forwarded to LightFM so runs can be repeated;
          None keeps LightFM's default (unseeded) behaviour
          NOTE(review): with n_jobs > 1 results may still vary between runs - confirm
    Expected Output  -
        Model - Trained model
    '''
    # LightFM is fitted on a sparse matrix built from the dense interaction values.
    x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k,
                    random_state=random_state)
    model.fit(x,epochs=epoch,num_threads = n_jobs)
    return model

def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing user_id as key and the user's
                      row position in interactions as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
        - show = True to print the known and recommended item names
    Expected Output - 
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
        - Returns the list of recommended item ids, best score first
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]

    # Score every item for this user and rank item ids from best to worst.
    scores = pd.Series(model.predict(user_x, np.arange(n_items)),
                       index=interactions.columns)
    ranked_items = list(scores.sort_values(ascending=False).index)

    # Items the user already interacted with (rating above threshold).
    user_row = interactions.loc[user_id, :]
    known_items = list(pd.Series(user_row[user_row > threshold].index)
                       .sort_values(ascending=False))

    # Set membership keeps the filtering linear in the number of items.
    known_lookup = set(known_items)
    return_score_list = [x for x in ranked_items if x not in known_lookup][0:nrec_items]

    if show:
        # Names are only looked up when they are printed; str() allows non-string names.
        print("Known Likes:")
        for counter, item in enumerate(known_items, start=1):
            print(str(counter) + '- ' + str(item_dict[item]))

        print("\n Recommended Items:")
        for counter, item in enumerate(return_score_list, start=1):
            print(str(counter) + '- ' + str(item_dict[item]))
    return return_score_list
    

def sample_recommendation_item(model,interactions,item_id,user_dict,item_dict,number_of_user):
    '''
    Function to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - item_id = item ID for which we need to generate recommended users
        - user_dict = Dictionary of users (accepted for interface compatibility, not used here)
        - item_dict = Dictionary of items (accepted for interface compatibility, not used here)
        - number_of_user = Number of users needed as an output
    Expected Output -
        - user_list = List of recommended users (row labels of interactions), best score first
    Raises -
        - KeyError if item_id is not a column of interactions
    '''
    n_users, n_items = interactions.shape
    # Exact column position of the item. A binary search would silently return a
    # wrong position for unsorted columns or for an item that is not present.
    item_x = interactions.columns.get_loc(item_id)
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(item_x, n_users)))
    top_positions = scores.sort_values(ascending=False).head(number_of_user).index
    user_list = list(interactions.index[top_positions])
    return user_list 


def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Build the item-item matrix of cosine similarities between item embeddings
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe of cosine similarity
          b/w items (higher = more similar), labelled on both axes with the
          columns of interactions
    '''
    item_vectors = sparse.csr_matrix(model.item_embeddings)
    item_labels = interactions.columns
    return pd.DataFrame(cosine_similarity(item_vectors),
                        index=item_labels,
                        columns=item_labels)

def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe containing the item-item
          similarity matrix (as built by create_item_emdedding_distance_matrix)
        - item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
        - show = True to print the item of interest and the recommended item names
    Expected Output -
        - recommended_items = List of recommended item ids, most similar first,
          never containing item_id itself
    '''
    similarities = item_emdedding_distance_matrix.loc[item_id, :]
    # Remove the query item by label rather than assuming it is ranked first:
    # another item can tie with (or exceed) its self-similarity.
    candidates = similarities.drop(item_id, errors='ignore')
    recommended_items = list(candidates.sort_values(ascending = False).head(n_items).index)
    if show:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        for counter, item in enumerate(recommended_items, start=1):
            # str() so that non-string item names can be printed too.
            print(str(counter) + '- ' + str(item_dict[item]))
    return recommended_items

In [122]:
# Creating interaction matrix using user data
# NOTE(review): despite the comment above, this call uses the `challenge` table and
# passes 'challenge' as both user_col and item_col. As the displayed output shows,
# the only non-zero entries then sit where row and column are the same challenge.
# Confirm whether the `user` table with 'user_id' as user_col was intended.
interactions = create_interaction_matrix(df = challenge,
                                         user_col = 'challenge',
                                         item_col = 'challenge',
                                         rating_col = 'total_submissions')
interactions.head()

challenge,CI23478,CI23479,CI23480,CI23481,CI23482,CI23483,CI23484,CI23485,CI23486,CI23487,CI23488,CI23489,CI23490,CI23491,CI23492,CI23493,CI23494,CI23495,CI23496,CI23497,CI23498,CI23499,CI23500,CI23501,CI23502,CI23503,CI23504,CI23505,CI23506,CI23507,CI23508,CI23509,CI23510,CI23511,CI23512,CI23513,CI23514,CI23515,CI23516,CI23517,...,CI29044,CI29045,CI29046,CI29047,CI29048,CI29049,CI29050,CI29051,CI29052,CI29053,CI29054,CI29055,CI29056,CI29057,CI29058,CI29059,CI29060,CI29061,CI29062,CI29063,CI29064,CI29065,CI29066,CI29067,CI29068,CI29069,CI29070,CI29071,CI29072,CI29073,CI29074,CI29075,CI29076,CI29077,CI29078,CI29079,CI29080,CI29081,CI29082,CI29083
challenge,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
CI23478,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23479,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23480,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23481,0.0,0.0,0.0,236.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23482,0.0,0.0,0.0,0.0,137.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
interactions.tail()

challenge,CI23478,CI23479,CI23480,CI23481,CI23482,CI23483,CI23484,CI23485,CI23486,CI23487,CI23488,CI23489,CI23490,CI23491,CI23492,CI23493,CI23494,CI23495,CI23496,CI23497,CI23498,CI23499,CI23500,CI23501,CI23502,CI23503,CI23504,CI23505,CI23506,CI23507,CI23508,CI23509,CI23510,CI23511,CI23512,CI23513,CI23514,CI23515,CI23516,CI23517,...,CI29044,CI29045,CI29046,CI29047,CI29048,CI29049,CI29050,CI29051,CI29052,CI29053,CI29054,CI29055,CI29056,CI29057,CI29058,CI29059,CI29060,CI29061,CI29062,CI29063,CI29064,CI29065,CI29066,CI29067,CI29068,CI29069,CI29070,CI29071,CI29072,CI29073,CI29074,CI29075,CI29076,CI29077,CI29078,CI29079,CI29080,CI29081,CI29082,CI29083
challenge,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
CI29079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
CI29080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
CI29081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
CI29082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0
CI29083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0


In [71]:
interactions.shape

(69532, 5348)

In [27]:
challenge.head()

Unnamed: 0,challenge,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,6/5/2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [90]:
# IMPUTATION
# Missing values in these columns are replaced by the sentinel -1.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a
# selected column is chained in-place mutation, which pandas does not guarantee to
# propagate to the parent frame (it is a no-op under Copy-on-Write).
for col in ['total_submissions', 'author_org_ID', 'category_id']:
    challenge[col] = challenge[col].fillna(-1)

# Free-text description built from the id, organisation, category and submission count.
challenge['description'] = challenge['challenge']+' '+challenge['author_org_ID'].astype(str) + ' ' + challenge['category_id'].astype(str) + ' ' + challenge['total_submissions'].astype(str)

In [91]:
challenge.head()

Unnamed: 0,challenge,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id,description
0,CI23478,2,SI2445,37.0,6/5/2006,AI563576,M,AOI100001,-1.0,CI23478 AOI100001 -1.0 37.0
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0,CI23479 AOI100002 32.0 48.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,-1.0,CI23480 AOI100003 -1.0 15.0
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0,CI23481 AOI100004 70.0 236.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,-1.0,CI23482 AOI100005 -1.0 137.0


In [115]:
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
                    (if an id occurs more than once, the last row wins)
    '''
    # Pair the two columns positionally instead of looking rows up by label with
    # df.loc[i, ...]; label lookup only works when the index is exactly 0..n-1 and
    # breaks (KeyError / wrong rows) on filtered or re-indexed frames.
    return dict(zip(df[id_col], df[name_col]))

In [116]:
# Create User Dict
user_dict = create_user_dict(interactions=interactions)
# Create Item dict
# id_col and name_col are both 'challenge', so challenge_dict maps every
# challenge ID to itself (the data has no separate human-readable name column).
challenge_dict = create_item_dict(df = challenge,
                               id_col = 'challenge',
                               name_col = 'challenge')

In [117]:
challenge_dict['CI28785']

'CI28785'

In [109]:
challenge[challenge['challenge']=='CI28785']

Unnamed: 0,challenge,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id,description
5068,CI28785,1,SI2819,15.0,4/9/2007,AI566783,M,AOI101589,36.0,CI28785 AOI101589 36.0 15.0


# Building Matrix Factorization model

* interaction matrix: Interaction matrix created in the previous section

* n_components: Dimensionality of the latent embedding learned for each user and each item

* loss: The loss function to optimise. Here we use WARP (Weighted Approximate-Rank Pairwise) loss because we mostly care about the ranking of items, i.e. which items should be shown first

* epoch: Number of training epochs, i.e. passes over the interaction data

* n_jobs: Number of cores to use in parallel processing

In [124]:
# Fit the matrix-factorization model on `interactions`, using the settings
# described in the parameter list above.
mf_model = runMF(interactions = interactions,
                 n_components = 30,
                 loss = 'warp',
                 epoch = 30,
                 n_jobs = 4)

# Item recommendation to a user

In [149]:
# Creating interaction matrix using user data
interactions_user = create_interaction_matrix(df = user,
                                         user_col = 'user_id',
                                         item_col = 'challenge',
                                         rating_col = 'challenge_rating')
interactions_user.head()

challenge,CI23478,CI23479,CI23480,CI23481,CI23482,CI23483,CI23484,CI23485,CI23486,CI23488,CI23489,CI23490,CI23491,CI23492,CI23493,CI23494,CI23495,CI23496,CI23497,CI23498,CI23499,CI23500,CI23501,CI23502,CI23503,CI23504,CI23505,CI23506,CI23507,CI23508,CI23509,CI23510,CI23511,CI23512,CI23513,CI23514,CI23515,CI23516,CI23517,CI23518,...,CI29029,CI29030,CI29031,CI29032,CI29033,CI29034,CI29035,CI29039,CI29041,CI29043,CI29044,CI29045,CI29046,CI29047,CI29048,CI29049,CI29050,CI29051,CI29052,CI29054,CI29055,CI29057,CI29058,CI29059,CI29061,CI29063,CI29064,CI29065,CI29066,CI29067,CI29073,CI29074,CI29075,CI29076,CI29078,CI29079,CI29080,CI29081,CI29082,CI29083
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
4576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [150]:
# Fit a separate model on `interactions_user` with the same hyper-parameters;
# this is the model queried for the user-level recommendations.
mf_user_item_model = runMF(interactions = interactions_user,
                 n_components = 30,
                 loss = 'warp',
                 epoch = 30,
                 n_jobs = 4)

In [151]:
## Request 13 challenge recommendations (nrec_items) for user id 4576
rec_list = sample_recommendation_user(model = mf_user_item_model, 
                                      interactions = interactions_user, 
                                      user_id = 4576	, 
                                      user_dict = user_dict,
                                      item_dict = challenge_dict, 
                                      threshold = 4,
                                      nrec_items = 13,
                                      show = True)

Known Likes:
1- CI24917
2- CI23855
3- CI23714
4- CI23663

 Recommended Items:
1- CI25135
2- CI23933
3- CI25123
4- CI24957
5- CI24530
6- CI24915
7- CI23691
8- CI25124
9- CI24958
10- CI23848
11- CI25125
12- CI25142
13- CI23975


In [152]:
sample_recommendation_item(model = mf_model,
                           interactions = interactions,
                           item_id = 'CI23975',
                           user_dict = user_dict,
                           item_dict = challenge_dict,
                           number_of_user = 15)

['CI23975',
 'CI27699',
 'CI24095',
 'CI27670',
 'CI26766',
 'CI24663',
 'CI28131',
 'CI25682',
 'CI27287',
 'CI27784',
 'CI25346',
 'CI26012',
 'CI24373',
 'CI25217',
 'CI24111']

In [48]:
test = pd.read_csv('/content/test.csv')

In [83]:
test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,CI23855
1,4577_2,4577,2,CI23933
2,4577_3,4577,3,CI24917
3,4577_4,4577,4,CI24915
4,4577_5,4577,5,CI23714


In [82]:
test_user_id = test.user_id.unique().tolist()

In [53]:
test_user_id[-1]

113838

In [61]:
train_user_id =  user.user_id.unique().tolist()

In [78]:
if 113838 in test_user_id:
  print(True)

True


In [69]:
len(train_user_id)

69532

In [70]:
len(test_user_id)

39732

In [62]:
# Test users that never appear in the training data.
# Membership is checked against a set: testing each of the test ids against the
# training *list* is a linear scan per id and needlessly quadratic overall.
train_user_id_set = set(train_user_id)
not_i_train = [x for x in test_user_id if x not in train_user_id_set]

In [72]:
len(not_i_train)

39732

In [119]:
challenge_dict

{'CI23478': 'CI23478',
 'CI23479': 'CI23479',
 'CI23480': 'CI23480',
 'CI23481': 'CI23481',
 'CI23482': 'CI23482',
 'CI23483': 'CI23483',
 'CI23484': 'CI23484',
 'CI23485': 'CI23485',
 'CI23486': 'CI23486',
 'CI23487': 'CI23487',
 'CI23488': 'CI23488',
 'CI23489': 'CI23489',
 'CI23490': 'CI23490',
 'CI23491': 'CI23491',
 'CI23492': 'CI23492',
 'CI23493': 'CI23493',
 'CI23494': 'CI23494',
 'CI23495': 'CI23495',
 'CI23496': 'CI23496',
 'CI23497': 'CI23497',
 'CI23498': 'CI23498',
 'CI23499': 'CI23499',
 'CI23500': 'CI23500',
 'CI23501': 'CI23501',
 'CI23502': 'CI23502',
 'CI23503': 'CI23503',
 'CI23504': 'CI23504',
 'CI23505': 'CI23505',
 'CI23506': 'CI23506',
 'CI23507': 'CI23507',
 'CI23508': 'CI23508',
 'CI23509': 'CI23509',
 'CI23510': 'CI23510',
 'CI23511': 'CI23511',
 'CI23512': 'CI23512',
 'CI23513': 'CI23513',
 'CI23514': 'CI23514',
 'CI23515': 'CI23515',
 'CI23516': 'CI23516',
 'CI23517': 'CI23517',
 'CI23518': 'CI23518',
 'CI23519': 'CI23519',
 'CI23520': 'CI23520',
 'CI23521':

In [125]:
## Creating item-item distance matrix
item_item_dist = create_item_emdedding_distance_matrix(model = mf_model,
                                                       interactions = interactions)
## Checking item embedding distance matrix
item_item_dist.head()

challenge,CI23478,CI23479,CI23480,CI23481,CI23482,CI23483,CI23484,CI23485,CI23486,CI23487,CI23488,CI23489,CI23490,CI23491,CI23492,CI23493,CI23494,CI23495,CI23496,CI23497,CI23498,CI23499,CI23500,CI23501,CI23502,CI23503,CI23504,CI23505,CI23506,CI23507,CI23508,CI23509,CI23510,CI23511,CI23512,CI23513,CI23514,CI23515,CI23516,CI23517,...,CI29044,CI29045,CI29046,CI29047,CI29048,CI29049,CI29050,CI29051,CI29052,CI29053,CI29054,CI29055,CI29056,CI29057,CI29058,CI29059,CI29060,CI29061,CI29062,CI29063,CI29064,CI29065,CI29066,CI29067,CI29068,CI29069,CI29070,CI29071,CI29072,CI29073,CI29074,CI29075,CI29076,CI29077,CI29078,CI29079,CI29080,CI29081,CI29082,CI29083
challenge,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
CI23478,1.0,-0.0874,0.188153,0.293452,0.47893,-0.055396,-0.226445,-0.158841,-0.028209,-0.084699,-0.118167,0.120935,-0.108772,-0.171524,0.143904,-0.101748,0.16572,-0.011303,0.048911,0.00371,-0.023196,-0.466069,0.062245,0.256679,-0.264804,0.011054,0.011011,0.220098,-0.209896,0.082311,-0.084546,-0.066713,0.264554,-0.251421,0.217024,0.082413,-0.293391,-0.285127,-0.049824,0.226978,...,-0.150082,0.173616,-0.067729,-0.073319,-0.111901,-0.076761,-0.149743,0.118249,-0.093059,0.26417,-0.187214,-0.246734,-0.162903,-0.151739,-0.036866,0.043185,0.121361,-0.165784,0.229551,0.280244,-0.106108,0.035833,-0.357136,-0.043439,0.269085,0.190265,-0.203473,-0.133345,-0.271118,0.000156,-0.257173,0.384738,-0.297854,0.168716,0.244536,-0.471365,0.027835,0.066079,0.055214,-0.231876
CI23479,-0.0874,1.0,-0.162474,-0.114317,-0.101492,-0.135502,-0.080904,-0.005313,-0.042074,0.030651,-0.005653,-0.306794,-0.180879,-0.310287,-0.215039,-0.241693,-0.127972,-0.075297,-0.128163,-0.106707,0.176137,-0.081455,-0.096474,-0.26373,0.107843,-0.109065,-0.076353,0.225211,0.125524,0.011668,-0.253581,0.096918,0.084585,-0.133438,-0.078009,-0.108244,-0.054434,0.240051,-0.184023,-0.026717,...,0.031506,0.018886,0.065481,0.037229,0.208909,0.142213,0.020421,0.053213,-0.201213,-0.064977,0.111778,0.129251,0.066075,-0.064831,-0.202743,0.019818,-0.185447,0.103041,0.158148,-0.100201,0.182461,0.062299,-0.294889,-0.180986,0.024781,0.03773,-0.284221,0.035969,-0.027043,0.025294,0.353506,-0.277047,-0.027848,-0.111792,-0.320978,0.265382,0.05596,-0.052037,-0.11248,-0.186781
CI23480,0.188153,-0.162474,1.0,0.059605,0.352096,-0.079133,0.116565,0.015055,-0.174798,0.409001,0.266319,-0.072933,-0.191784,0.071657,0.145139,-0.124656,0.194792,0.110309,-0.057951,0.075721,-0.168348,-0.29979,0.356878,-0.35447,-0.374871,0.070273,-0.015539,0.165249,-0.080974,0.001061,0.153103,0.080906,-0.069571,0.140154,0.279531,-0.14721,0.116307,0.223327,0.054176,-0.192878,...,-0.08869,-0.180831,-0.382839,-0.310895,-0.197486,0.184883,-0.175356,-0.161901,0.05868,-0.239052,-0.021022,0.184325,0.086908,0.220083,0.148618,0.088893,0.25812,-0.019514,0.035384,0.118932,0.102211,0.42207,0.048408,-0.279421,0.142735,-0.195734,-0.209487,-0.223985,0.194453,-0.111734,-0.059366,-0.298133,-0.324802,-0.007842,-0.155485,-0.061185,0.11705,-0.004838,-0.209751,-0.13508
CI23481,0.293452,-0.114317,0.059605,1.0,-0.061656,-0.25242,-0.230536,0.224641,-0.150586,0.247235,0.090958,-0.230103,0.215379,-0.144222,0.070827,0.072689,0.096699,0.081048,0.054354,0.113935,0.234511,0.060123,0.123961,0.0941,-0.04532,-0.361888,0.213353,0.031053,-0.116207,-0.090136,0.02952,0.243099,0.060658,-0.129161,0.175932,-0.009587,-0.275249,-0.224176,0.149482,0.139726,...,-0.409639,-0.037406,0.072212,0.255018,0.00225,-0.056034,0.05683,0.248672,0.047152,0.229663,-0.014163,-0.018692,0.18592,0.034166,-0.021339,-0.013994,0.108348,-0.216949,-0.409947,-0.080463,0.015326,-0.01615,0.178958,0.074348,-0.08325,0.093113,0.102589,-0.052028,-0.061203,0.211007,0.164816,0.138451,0.234179,0.27132,0.203255,0.058369,0.184862,0.044315,0.229374,-0.085937
CI23482,0.47893,-0.101492,0.352096,-0.061656,1.0,0.25014,0.135765,-0.096196,0.028602,0.05149,-0.447668,0.073734,-0.139926,0.109983,0.074857,0.094748,-0.024594,-0.144253,0.255183,-0.3007,-0.181387,-0.13111,-0.068372,0.120421,-0.254956,0.279502,0.044991,0.081081,-0.315531,0.190353,0.115098,-0.058749,-0.022637,-0.063768,-0.10671,-0.296313,0.178668,0.007509,-0.113516,-0.238816,...,0.322922,0.1257,-0.048343,-0.085009,0.036653,0.050854,-0.35118,-0.064481,0.015406,0.098366,-0.045396,-0.185928,-0.259274,-0.103461,-0.027414,-0.103889,0.13042,-0.206788,0.098213,0.388308,0.234522,0.114077,-0.41974,0.151052,0.339996,-0.182536,-0.207241,-0.034279,-0.002166,-0.120591,-0.068325,0.117642,-0.323034,-0.040004,0.04046,-0.276317,0.017743,0.102249,0.01653,-0.112378


In [97]:
test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,CI23855
1,4577_2,4577,2,CI23933
2,4577_3,4577,3,CI24917
3,4577_4,4577,4,CI24915
4,4577_5,4577,5,CI23714


In [129]:
## Request the 3 most similar items (n_items) for challenge 'CI24530'
rec_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = 'CI24530',
                                    item_dict = challenge_dict,
                                    n_items = 3)

Item of interest :CI24530
Item similar to the above item:
1- CI27141
2- CI25390
3- CI27413


In [127]:
rec_list

['CI27141', 'CI25390', 'CI27413']

In [98]:
test_10 = test[test['challenge_sequence'] == 10]

In [99]:
test_10.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
9,4577_10,4577,10,CI24530
19,4578_10,4578,10,CI23781
29,4579_10,4579,10,CI26954
39,4583_10,4583,10,CI23667
49,4584_10,4584,10,CI23913


In [101]:
test_10.shape

(39732, 4)

In [128]:
item_item_dist['CI28785']

challenge
CI23478   -0.114458
CI23479   -0.028284
CI23480   -0.008083
CI23481   -0.375516
CI23482    0.152336
             ...   
CI29079   -0.165381
CI29080    0.065800
CI29081    0.109621
CI29082   -0.203861
CI29083   -0.016572
Name: CI28785, Length: 5606, dtype: float32

In [130]:
challenges = test_10.challenge.values.tolist()

In [131]:
# Build one 3-item recommendation list per row of `challenges`.
# The same challenge appears for many test users, so each distinct challenge is
# looked up only once and the result reused; this avoids repeating identical
# similarity look-ups and the per-call printing that floods the cell output.
# NOTE(review): if item_item_recommendation accepts a `show` flag like
# sample_recommendation_user does, passing show=False would silence the
# remaining printing - confirm against its definition.
rec_by_challenge = {}
pred_challenge = []
for chlng in challenges:
  if chlng not in rec_by_challenge:
    rec_by_challenge[chlng] = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                                       item_id = chlng,
                                                       item_dict = challenge_dict,
                                                       n_items = 3)
  rec_list = rec_by_challenge[chlng]
  # Append a copy so every entry of pred_challenge stays an independent list.
  pred_challenge.append(list(rec_list))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Item similar to the above item:
1- CI29046
2- CI28236
3- CI28341
Item of interest :CI26930
Item similar to the above item:
1- CI23695
2- CI24578
3- CI28323
Item of interest :CI25151
Item similar to the above item:
1- CI27031
2- CI27343
3- CI24956
Item of interest :CI23697
Item similar to the above item:
1- CI28504
2- CI27238
3- CI28409
Item of interest :CI26819
Item similar to the above item:
1- CI24562
2- CI27105
3- CI27967
Item of interest :CI24525
Item similar to the above item:
1- CI26262
2- CI24656
3- CI26179
Item of interest :CI25631
Item similar to the above item:
1- CI24333
2- CI26788
3- CI25921
Item of interest :CI25419
Item similar to the above item:
1- CI25197
2- CI25713
3- CI27745
Item of interest :CI24233
Item similar to the above item:
1- CI25324
2- CI26488
3- CI26307
Item of interest :CI24157
Item similar to the above item:
1- CI25279
2- CI26309
3- CI24553
Item of interest :CI25072
Item similar to the above

In [132]:
pred_challenge


[['CI27141', 'CI25390', 'CI27413'],
 ['CI24031', 'CI24556', 'CI28193'],
 ['CI25807', 'CI26886', 'CI25787'],
 ['CI24174', 'CI24243', 'CI23622'],
 ['CI24505', 'CI28077', 'CI25812'],
 ['CI28374', 'CI25420', 'CI28184'],
 ['CI27141', 'CI25390', 'CI27413'],
 ['CI24106', 'CI25399', 'CI25530'],
 ['CI27980', 'CI28688', 'CI27472'],
 ['CI25748', 'CI28010', 'CI23504'],
 ['CI24106', 'CI25399', 'CI25530'],
 ['CI28945', 'CI25352', 'CI26744'],
 ['CI26225', 'CI24083', 'CI26400'],
 ['CI23781', 'CI28535', 'CI28740'],
 ['CI25748', 'CI28010', 'CI23504'],
 ['CI26756', 'CI28061', 'CI25100'],
 ['CI25807', 'CI26886', 'CI25787'],
 ['CI27399', 'CI28474', 'CI28142'],
 ['CI26650', 'CI28810', 'CI23987'],
 ['CI24384', 'CI27195', 'CI26352'],
 ['CI24579', 'CI25382', 'CI27698'],
 ['CI26225', 'CI24083', 'CI26400'],
 ['CI28945', 'CI25352', 'CI26744'],
 ['CI23778', 'CI23790', 'CI24296'],
 ['CI28069', 'CI23581', 'CI28055'],
 ['CI24174', 'CI24243', 'CI23622'],
 ['CI26756', 'CI28061', 'CI25100'],
 ['CI27399', 'CI28474', 'CI2

In [139]:
user_ids = test_10.user_id.values.tolist()

In [140]:
# Flatten the predictions into (user_sequence, challenge) pairs: for each test
# user, the three recommended challenges are labelled "<user_id>_11",
# "<user_id>_12" and "<user_id>_13", in that order.
result = [
  (str(uid) + suffix, pred_challenge[row][slot])
  for row, uid in enumerate(user_ids)
  for slot, suffix in enumerate(('_11', '_12', '_13'))
]



In [141]:
result1 = pd.DataFrame(result,columns=['user_sequence','challenge'])
result1

Unnamed: 0,user_sequence,challenge
0,4577_11,CI27141
1,4577_12,CI25390
2,4577_13,CI27413
3,4578_11,CI24031
4,4578_12,CI24556
...,...,...
119191,113834_12,CI25838
119192,113834_13,CI25409
119193,113838_11,CI24554
119194,113838_12,CI26648


In [142]:
result1.shape

(119196, 2)

In [143]:
test_10.shape

(39732, 4)

In [144]:
result1.to_csv('sub1_lightfm.csv',index=False)