## LightFM

## Import Library

In [None]:
from copy import deepcopy
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score,recall_at_k,reciprocal_rank
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

## Loading Dataset

In [45]:
complete_train = pd.read_csv("/kaggle/input/processed-dataset-interview/train.csv")
complete_test = pd.read_csv("/kaggle/input/processed-dataset-interview/test.csv")

In [46]:
complete_train.head()

In [47]:
complete_test.head()

In [48]:
complete_train.columns

* "Current" set will represent the interactions known at the time of training the complete model. "New" set will represent the new interactions.

* So test data is "New" data and train data is "Current" data

In [49]:
# Interaction data from complete data 
interactions_current = complete_train[['user_id', 'pratilipi_id', 'read_percent',"updated_at"]]
interactions_new = complete_test[['user_id', 'pratilipi_id', 'read_percent',"updated_at"]]
interactions = pd.concat([interactions_current,interactions_new])

In [50]:
interactions_new.shape,interactions_current.shape

In [51]:
# New users were removed when the data was created, so the "new" (test) set
# is expected to contain no user that is absent from the "current" (train) set.
new_users = np.setdiff1d(interactions_new["user_id"], interactions_current["user_id"])
old_users = np.intersect1d(interactions_new["user_id"], interactions_current["user_id"])

print("Number of new users in new set: {}".format(len(new_users)))
print("Number of current users in new set: {}".format(len(old_users)))

In [52]:
# New items (pratilipis) were removed when the data was created, so the "new"
# (test) set is expected to contain no item that is absent from the "current" (train) set.

new_items = np.setdiff1d(interactions_new['pratilipi_id'], interactions_current['pratilipi_id'])
old_items = np.intersect1d(interactions_new['pratilipi_id'], interactions_current['pratilipi_id'])

print("Number of new items in new set: {}".format(len(new_items)))
print("Number of current items in new set: {}".format(len(old_items)))

## User, item features

LightFM lets us pass metadata directly to the recommender system. In our case the only metadata available is item metadata. Since no user features were provided, we will not pass any user features and will rely solely on item features.

In [53]:
# Drop the "train" column from both frames.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError.
complete_train = complete_train.drop(columns=["train"], errors="ignore")
complete_test = complete_test.drop(columns=["train"], errors="ignore")

In [54]:
complete_train.columns

In [87]:
# List of item features we have for a single pratilipi
item_cols = ['pratilipi_id','month',
       'year', 'reading_time_min', 'genre|actionandadventure',
       'genre|children', 'genre|comedy', 'genre|crime', 'genre|crimelekhan',
       'genre|cybercrimefiction', 'genre|detective', 'genre|drama',
       'genre|entertainment', 'genre|erotica', 'genre|experiencesandmemories',
       'genre|family', 'genre|fantasy', 'genre|healthandwellness',
       'genre|horror', 'genre|horrormarathon', 'genre|indiawale',
       'genre|krishijeevan', 'genre|life', 'genre|moralinspiring',
       'genre|murdermystery', 'genre|mythology', 'genre|novels',
       'genre|parytan', 'genre|politics', 'genre|pratilipiawardshindi',
       'genre|pratilipikalamkarsamman', 'genre|pratilipikids',
       'genre|pravasisahitya', 'genre|premkamahina', 'genre|radiofiction',
       'genre|rashtriya', 'genre|relegionandspiritual', 'genre|romance',
       'genre|sciencefiction', 'genre|serieswriting', 'genre|shortstories',
       'genre|shortstorychallenge', 'genre|social', 'genre|suspense',
       'genre|swahindi2', 'genre|thechatstory', 'genre|translation',
       'genre|webseries', 'genre|women']

In [56]:
item_features_current = complete_train[item_cols].drop_duplicates(subset = ['pratilipi_id']).reset_index(drop = True)
item_features_new = complete_test[item_cols].drop_duplicates(subset = ['pratilipi_id']).reset_index(drop = True)

In [57]:
item_features_current = item_features_current[item_features_current["pratilipi_id"].isin(interactions_current["pratilipi_id"])]
item_features_new = item_features_new[item_features_new["pratilipi_id"].isin(interactions_new["pratilipi_id"])]

print("Number of current items: {}".format(len(item_features_current)))
print("Number of new items: {}".format(len(item_features_new)))

In [59]:
item_features = pd.concat([item_features_current])

## Transform to LightFM format

#### Dataset

To train the LightFM model the dataset must be converted in a specific format. 

In [60]:
print("Max user id: {}".format(interactions_current['user_id'].max()))
print("Max pratilipi id: {}".format(interactions_current['pratilipi_id'].max()))

In [61]:
from lightfm.data import Dataset

In [62]:
# Unique raw ids seen in the "current" interactions. They are kept as arrays
# rather than one-shot generators: a generator is exhausted after the first
# `dataset.fit(...)`, so re-running that cell would silently receive no ids.
user_ids_buffered = interactions_current['user_id'].unique()
item_ids_buffered = interactions_current['pratilipi_id'].unique()

# Every item-feature column except the id column is used as a feature name.
item_feature_names = [col for col in item_features.columns if col != "pratilipi_id"]

In [63]:
dataset = Dataset()

In [64]:
dataset.fit(
    users=user_ids_buffered,
    items=item_ids_buffered,
    item_features=item_feature_names
)

We will use the implicit rating "read_percent" as the target for the LightFM model.

In [65]:
def transform_interactions(interactions):
    """
    Prepare an interactions data frame for ``Dataset.build_interactions``.

    Returns an iterable with one tuple per row, whose fields are, in order,
    "user_id", "pratilipi_id" and "read_percent". Any other column of the
    input frame is ignored.
    """
    triple_columns = ["user_id", "pratilipi_id", 'read_percent']
    user_item_value = interactions.loc[:, triple_columns]
    return user_item_value.itertuples(index=False)

In [66]:
interaction_matrix_current, _ = dataset.build_interactions(
    transform_interactions(interactions_current))

In [69]:
interactions_current.shape

In [67]:
interaction_matrix_new, _ = dataset.build_interactions(
    transform_interactions(interactions_new))

In [71]:
interactions_new.shape

In [72]:
def transform_features(features, id_name):
    """
    Convert a features data frame into the structure expected by
    build_user_features / build_item_features.

    Returns a list with one ``(id, feature_names)`` pair per row, where
    ``id`` is the row's value in column ``id_name`` and ``feature_names``
    lists the remaining columns whose value equals 1, in column order.

    NOTE(review): the test is a plain ``== 1``, so a non-binary column is
    only reported for rows where it happens to equal exactly 1 - confirm
    this is intended for every column passed in.
    """
    return [
        (
            record[id_name],
            [column for column, value in record.items()
             if column != id_name and value == 1],
        )
        for record in features.to_dict(orient="records")
    ]

In [73]:
item_features_matrix_current = dataset.build_item_features(
    transform_features(item_features_current, "pratilipi_id"))

In [74]:
item_features_matrix_new = dataset.build_item_features(
    transform_features(item_features_new, "pratilipi_id"))

## Train
#### Model definition

In [75]:
# 
model = LightFM(loss="warp",no_components=150,
    learning_rate=0.05,random_state=2019)

### Evaluating the performance of the model

Now we have to evaluate our model to see its performance. No matter how good your model is, if you can't evaluate it correctly you can neither improve nor trust it. Recommendation problems have few really good evaluation metrics, but luckily LightFM provides a rich set of them. In this step we will calculate precision@10 and AUC scores for our model.


In [76]:
class Evaluator():
    """
    Helper class for model evaluation.

    Stores the number of epochs and the corresponding performance metrics
    (precision@10 and ROC AUC, both computed on the interaction matrix passed
    to `evaluate`) in a data frame, one row per `evaluate` call.
    `get_best_epochs` returns the number of epochs with the best mean
    "train_auc"; multiple runs with the same number of epochs are averaged.
    """

    def __init__(self):
        # One row per evaluate() call: epochs, train_precision, train_auc.
        self.df_evaluation = pd.DataFrame()

    def evaluate(self, epochs, model, interaction_matrix_train, item_features_matrix):
        """
        Score `model` on `interaction_matrix_train` and record the result.

        Parameters
        ----------
        epochs : number of epochs the model was trained for (stored as-is).
        model : fitted model passed to `precision_at_k` / `auc_score`.
        interaction_matrix_train : interactions the metrics are computed on.
        item_features_matrix : forwarded to the metrics as `item_features`.
        """
        features_current = {"item_features": item_features_matrix}
        evaluation_row = pd.DataFrame(
            {
                "epochs": epochs,
                "train_precision": precision_at_k(model, interaction_matrix_train, k=10, **features_current).mean(),
                "train_auc": auc_score(model, interaction_matrix_train, **features_current).mean(),
            },
            index=[0],
        )
        # DataFrame.append was removed in pandas 2.0; use concat instead.
        # The first row replaces the empty frame directly so that concat is
        # never called with an empty operand.
        if self.df_evaluation.empty:
            self.df_evaluation = evaluation_row
        else:
            self.df_evaluation = pd.concat(
                [self.df_evaluation, evaluation_row], ignore_index=True
            )

    def get_best_epochs(self):
        """
        Return the epoch count whose mean "train_auc" is highest.

        Ties are resolved in favour of the smallest epoch count.

        Raises
        ------
        ValueError : if `evaluate` has not been called yet.
        """
        if self.df_evaluation.empty:
            raise ValueError("No evaluation recorded yet; call evaluate() first.")
        df_evaluation_agg = self.df_evaluation.groupby("epochs").agg("mean").reset_index()
        # idxmax returns the first row holding the maximum value.
        best_row = df_evaluation_agg["train_auc"].idxmax()
        return df_evaluation_agg.loc[best_row, "epochs"]

In [77]:
# Fit the model once per epoch count in the list below and record its metrics.
evaluator = Evaluator()
for epochs in [200]:
    print(f"Epochs: {epochs}")
    
    # Train on the "current" interactions together with the item features.
    model.fit(
        interaction_matrix_current,
        epochs=epochs,
        item_features=item_features_matrix_current
    )
    
    # The metrics are computed on the same matrix the model was just fit on,
    # so they are training-set scores.
    evaluator.evaluate(
        epochs, model,
        interaction_matrix_current,item_features_matrix_current)

In [78]:
# train evealuation
evaluator.df_evaluation


In [80]:
# test evaluation
# Keyword arguments forwarded to the metric functions in the cells below.
# NOTE(review): the model was fit with item_features_matrix_current; confirm
# that item_features_matrix_new carries the same features for every item the
# metrics rank, not only for the items present in the "new" set.
features_new = {
               
                "item_features": item_features_matrix_new
            }

## Evaluation metric

* AUC : Measures the ROC AUC metric for a model: the probability that a randomly chosen positive example has a higher score than a randomly chosen negative example. A perfect score is 1.0.

* Precision at K : Measure the precision at k metric for a model: the fraction of known
    positives in the first k positions of the ranked list of results.A perfect score is 1.0.
    
* Recall at K : Measure the recall at k metric for a model: the number of positive items in
    the first k positions of the ranked list of results divided by the number
    of positive items in the test period. A perfect score is 1.0.
    
* Mean Reciprocal rank : Measure the reciprocal rank metric for a model: 1 / the rank of the highest
    ranked positive example. A perfect score is 1.0.

In [81]:
print(auc_score(model, interaction_matrix_new, **features_new).mean())

In [82]:
print(precision_at_k(model, interaction_matrix_new, k=10, **features_new).mean())

In [85]:
print(reciprocal_rank(model, interaction_matrix_new, **features_new).mean())

In [86]:
print(recall_at_k(model, interaction_matrix_new, k=10, **features_new).mean())

## Get recommendation for test dataset

In [88]:
def predict(model, dataset, user_item_pairs,item_features_matrix):
    """
    Score every (user_id, pratilipi_id) row of `user_item_pairs`.

    Raw ids are translated to the internal indices given by
    `dataset.mapping()` (an unknown id raises KeyError) and passed to
    `model.predict` together with `item_features_matrix`.

    Returns a copy of `user_item_pairs` with an extra "prediction" column;
    the input frame is left untouched.
    """
    user_index, _, item_index, _ = dataset.mapping()

    internal_users = np.array([user_index[raw_id] for raw_id in user_item_pairs["user_id"]])
    internal_items = np.array([item_index[raw_id] for raw_id in user_item_pairs["pratilipi_id"]])

    scores = model.predict(
        user_ids=internal_users,
        item_ids=internal_items,
        item_features=item_features_matrix,
    )
    return user_item_pairs.assign(prediction=scores)

def merge_interactions(df_evaluation, interactions):
    """
    Left-join "read_percent" from `interactions` onto `df_evaluation`.

    The join keys are "user_id" and "pratilipi_id". A boolean
    "has_interaction" column is added, True where a non-null
    "read_percent" was found. "read_percent" is kept in the result.
    """
    join_keys = ["user_id", "pratilipi_id"]
    observed = interactions[join_keys + ["read_percent"]]
    merged = df_evaluation.merge(observed, how="left", on=join_keys)
    merged["has_interaction"] = merged["read_percent"].notna()
    return merged

### Jaccard Metric
The Jaccard similarity index (sometimes called the Jaccard similarity coefficient) compares members for two sets to see which members are shared and which are distinct. It’s a measure of similarity for the two sets of data, with a range from 0% to 100%. The higher the percentage, the more similar the two populations. We will find Jaccard metric for 

* Predicted vs Actual pratilipi Id
* Predicted vs Actual category name

In [91]:
metadata = pd.read_csv("../input/data-assignment/ds-assignment/metadata.csv")

In [94]:
complete_test = complete_test.merge(metadata,on = "pratilipi_id")

In [95]:
complete_test.head()

### Since predicting on 60k test data sample is unfeasible we will only predict for 1k users

In [96]:
# Per user, with rows sorted by updated_at_x: keep at most the first 100
# pratilipi ids and every category name.
g = {
    'pratilipi_id': lambda x: list(x)[:100],
    "category_name": lambda x: list(x),
}
test_sorted = complete_test.sort_values(['updated_at_x'])
final_test = test_sorted.groupby(["user_id"]).agg(g).reset_index()

In [97]:
final_test["predicted_id"] = None

In [98]:
final_test.head()

In [100]:
all_item_ids = np.unique(interactions["pratilipi_id"])

# Score at most the first 1000 users; bounding by the frame length means a
# smaller test set no longer raises a KeyError.
n_users_to_score = min(1000, len(final_test))

# Results are collected in a plain list and assigned as one column at the end.
# The chained form `final_test["predicted_id"][i] = ...` is unreliable and is a
# no-op once pandas copy-on-write is enabled.
predicted_ids = [None] * len(final_test)

for i in range(n_users_to_score):
    user_id = final_test["user_id"].iloc[i]
    user_sample_item_pairs = pd.DataFrame({
        "user_id": user_id,
        "pratilipi_id": all_item_ids
    })
    # NOTE(review): the model was fit with item_features_matrix_current;
    # confirm item_features_matrix_new is the intended matrix here.
    predictions_sample = predict(model, dataset, user_sample_item_pairs, item_features_matrix_new)
    # Only this user's rows can match the left join, so restrict the right-hand
    # side instead of merging against every interaction on each iteration.
    predictions_sample = merge_interactions(
        predictions_sample, interactions[interactions["user_id"] == user_id])
    predictions_sample = predictions_sample.sort_values("prediction", ascending=False)
    # Distinct ids among the 100 best-scored rows (ranking order is not kept).
    predicted_ids[i] = list(set(predictions_sample["pratilipi_id"].tolist()[:100]))

# Users that were not scored keep None, as before.
final_test["predicted_id"] = pd.Series(predicted_ids, index=final_test.index, dtype=object)


In [101]:
# Keep the first 1000 users. The explicit copy makes the result an independent
# frame, so later column assignments do not write to a slice of the original.
final_test = final_test.iloc[:1000].copy()

In [102]:
### For better understanding of the recommendation we will also find category of predicted items
g = {"category_name":lambda x: list(x)}
metadict = metadata.groupby(["pratilipi_id"]).agg(g).reset_index()

In [103]:
metadata.shape

In [104]:
# Use the absolute value of every id (vectorised; no row-wise apply needed).
# NOTE(review): presumably this matches how ids were normalised in the
# processed train/test files - confirm.
metadict["pratilipi_id"] = metadict["pratilipi_id"].abs()

In [105]:
# Lookup table: pratilipi_id -> list of category names.
dic = dict(zip(metadict["pratilipi_id"], metadict["category_name"]))

In [106]:
final_test["predicted_cat"] = None

In [107]:
# Collect, per user, the distinct categories of all predicted pratilipis.
predicted_categories = []
for predicted_ids in final_test["predicted_id"]:
    categories = set()
    for pratilipi_id in predicted_ids:
        # Strict lookup: a KeyError means a predicted id is missing from `dic`.
        categories.update(dic[pratilipi_id])
    predicted_categories.append(list(categories))

# Assign the whole column at once; the chained form
# `final_test["predicted_cat"][i] = ...` is unreliable on a sliced frame.
final_test["predicted_cat"] = pd.Series(
    predicted_categories, index=final_test.index, dtype=object
)

In [108]:
def jaccard_similarity(list1, list2):
    """
    Jaccard similarity of two collections, treated as sets.

    Returns |A & B| / |A | B| as a float in [0.0, 1.0]. Duplicates within
    either input are ignored. When both inputs are empty the union is empty
    and 0.0 is returned instead of raising ZeroDivisionError.
    """
    first, second = set(list1), set(list2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size

In [109]:
final_test.head()

In [110]:
final_test["jac_score"] = final_test.apply(lambda x : jaccard_similarity(x["pratilipi_id"],x["predicted_id"]),axis = 1)

In [111]:
final_test["jac_score"].mean()

In [112]:
final_test["cat_jac_score"] = final_test.apply(lambda x : jaccard_similarity(x["category_name"],x["predicted_cat"]),axis = 1)

In [113]:
final_test["cat_jac_score"].mean()

In [114]:
final_test.head()

In [115]:
final_test.to_csv("Recommendation.csv",index = False)