In [None]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder on the mounted Drive that holds the hackathon files.
# The trailing slash is required: later cells build file paths by plain string concatenation.
hackathon_path = "/content/drive/My Drive/Hackathon/Recommendation system/"

In [None]:
!pip install lightfm



In [None]:
import os
import pandas as pd
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the competition CSVs; every path is resolved against the shared Drive folder.
challenge_df = pd.read_csv(f"{hackathon_path}train_mddNHeX/challenge_data.csv")
train_df = pd.read_csv(f"{hackathon_path}train_mddNHeX/train.csv")
test_df = pd.read_csv(f"{hackathon_path}test_HLxMpl7/test.csv")
# `ss` is the sample-submission template that is filled in and written out further down.
ss = pd.read_csv(f"{hackathon_path}sample_submission_J0OjXLi_DDt3uQN.csv")

In [None]:
# Distinct sequence positions recorded per split.
train_sequence_count = train_df['challenge_sequence'].nunique()
test_sequence_count = test_df['challenge_sequence'].nunique()
print(f"Number of challenges present for each user in train is {train_sequence_count}")
print(f"Number of challenges present for each user in test is {test_sequence_count} and we have to predict last 3 challenge solved by each user in the dataset.")

# Distinct users and challenges, reported for train first and then test.
for split_name, split_frame in (("train", train_df), ("test", test_df)):
    print(f"Total number of users present in {split_name} is {split_frame['user_id'].nunique()}")
    print(f"Total number of challenges present in {split_name} is {split_frame['challenge'].nunique()}")

Number of challenges present for each user in train is 13
Number of challenges present for each user in test is 10 and we have to predict last 3 challenge solved by each user in the dataset.
Total number of users present in train is 69532
Total number of challenges present in train is 5348
Total number of users present in test is 39732
Total number of challenges present in test is 4477


In [None]:
# Parse the publish date so date components can be derived from it in a later cell.
challenge_df['publish_date'] = pd.to_datetime(challenge_df['publish_date'])

# Replace missing values with explicit placeholders.
# Column-level fillna is used instead of chained indexing (df[col][mask] = value):
# the chained form assigns through a temporary object, raises SettingWithCopyWarning
# (silenced by the global warnings filter), and has no effect under pandas copy-on-write.
challenge_df['challenge_series_ID'] = challenge_df['challenge_series_ID'].fillna("SI-Unknown")
challenge_df['total_submissions'] = challenge_df['total_submissions'].fillna(
    challenge_df['total_submissions'].median()
)
challenge_df['author_ID'] = challenge_df['author_ID'].fillna("unknown-author")
challenge_df['author_gender'] = challenge_df['author_gender'].fillna("unknown-gender")
challenge_df['author_org_ID'] = challenge_df['author_org_ID'].fillna("AOI-Unknown")

# -99 marks an unknown category; the column is then turned into prefixed string tokens.
challenge_df['category_id'] = challenge_df['category_id'].fillna(-99)
challenge_df['category_id'] = "category_" + challenge_df['category_id'].astype(str)
challenge_df['programming_language'] = 'programming_language_' + challenge_df['programming_language'].astype(str)

# Bucket the submission counts into deciles labelled sub_0 .. sub_9.
challenge_df['total_submissions'] = pd.qcut(
    challenge_df['total_submissions'],
    q=10,
    labels=['sub_' + str(i) for i in range(10)],
)

In [None]:
# Every logged (user, challenge) row gets the same constant interaction value of 1.
for interaction_log in (train_df, test_df):
    interaction_log['interactions'] = 1

In [None]:
# Derive the publish year and store it as a string, so it can be used as a
# feature value like the other challenge_df columns collected further down.
challenge_df['year'] = challenge_df['publish_date'].dt.year.astype(str)

In [None]:
del challenge_df['publish_date'] 

In [None]:
# Flatten the distinct values of every challenge_df column, column by column, into one list.
# This list is later handed to Dataset.fit as the set of known item features.
challenge_feature_list = [
    feature_value
    for column_name in challenge_df.columns
    for feature_value in challenge_df[column_name].unique()
]

In [None]:
# Stack the train and test interaction logs into one frame.
# The original row labels are kept (no ignore_index), so index values repeat across the two parts.
df = pd.concat([train_df, test_df], axis=0)

In [None]:
# LightFM Dataset helper; the cells below fit it and use it to build
# the id mappings, the interaction matrix and the item-feature matrix.
dataset = Dataset()

In [None]:
# Keep only the challenges that occur in the combined train+test interaction log.
seen_challenge_mask = challenge_df['challenge_ID'].isin(df['challenge'].unique().tolist())
challenge_df = challenge_df[seen_challenge_mask]

In [None]:
# One (user_id, challenge, weight) tuple per interaction row, in row order.
# zip over the three columns builds the same tuples as a row-wise apply(axis=1) but
# avoids creating a temporary Series for every one of the ~1.3M rows.
df['input_tuple'] = list(zip(df['user_id'], df['challenge'], df['interactions']))

In [None]:
# Distinct ids in order of first appearance.
# Series.unique() already removes duplicates; the former list(set(...)) round-trip added
# nothing except an arbitrary ordering. For the string challenge ids that ordering depends
# on Python's per-process hash randomisation, so the id -> index mapping (and with it the
# seeded model) could change from one kernel restart to the next.
unique_user_ids = df['user_id'].unique().tolist()
unique_challenge_ids = df['challenge'].unique().tolist()

# Register users, challenges and every known challenge feature value with the Dataset.
dataset.fit(unique_user_ids, unique_challenge_ids, item_features=challenge_feature_list)

In [None]:
# Unpack the four lookups returned by dataset.mapping(); user_id_map and item_id_map are
# used further down to translate raw ids into the indices passed to model.predict.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [None]:
# Build the interaction and weight matrices from the per-row (user, challenge, weight) tuples.
interactions, weights = dataset.build_interactions(df['input_tuple'])

In [None]:
def create_feature(x):
    '''Split one row into ``(id, features)``.

    The first value of the row must be the id; every remaining value is
    returned, in order, as the list of features for that id.
    '''
    row_values = x.tolist()
    return (row_values[0], row_values[1:])

# Build one (challenge_id, [feature values]) tuple per challenge.
# The helper column is excluded from the source frame first: without that, re-running this
# cell would pass the previously stored tuple column to create_feature as if it were a feature.
feature_source = challenge_df.drop(columns=['Challenge_input_Feature'], errors='ignore')
challenge_df['Challenge_input_Feature'] = feature_source.apply(create_feature, axis=1)

# Item-feature matrix consumed by LightFM for both training and prediction.
challenge_features_sparse = dataset.build_item_features(challenge_df['Challenge_input_Feature'])

In [None]:
import numpy as np
import scipy.sparse as sp
from sklearn.utils import shuffle

def _shuffle(uids, iids, data, random_state):
    """Reorder three parallel arrays with one shared random permutation.

    A single shuffled index array (of length ``len(uids)``) is drawn with
    ``random_state`` and applied to ``uids``, ``iids`` and ``data`` alike, so
    entries that belonged together before the shuffle still line up afterwards.

    Returns the tuple ``(uids, iids, data)`` in shuffled order.
    """
    permutation = shuffle(np.arange(len(uids)), random_state=random_state)
    return (uids[permutation], iids[permutation], data[permutation])


def random_train_test_split(interactions, test_percentage=0.2, random_state=None):
    """
    Split a sparse interaction matrix into disjoint train and test matrices.

    The stored entries are shuffled; the leading ``1 - test_percentage`` share
    becomes the training matrix and the remainder becomes the test matrix.
    Nothing ensures that every user or item with test interactions also has
    training interactions, so the test matrix may contain partially
    cold-start users or items.

    To split a sample_weight matrix the same way, call this function on it
    with the same ``random_state`` seed that was used for the interactions.

    Parameters
    ----------
    interactions: scipy sparse matrix
        The interactions to split.
    test_percentage: float, optional
        Fraction of the stored interactions that goes into the test matrix.
    random_state: int or numpy.random.RandomState, optional
        Seed for a new numpy.random.RandomState, or an existing
        numpy.random.RandomState instance, which is then used as-is.

    Returns
    -------
    (train, test): (scipy.sparse.coo_matrix, scipy.sparse.coo_matrix)
        Two matrices with the shape and dtype of ``interactions``.

    Raises
    ------
    ValueError
        If ``interactions`` is not a scipy.sparse matrix.
    """
    if not sp.issparse(interactions):
        raise ValueError("Interactions must be a scipy.sparse matrix.")

    # Reuse a caller-supplied RandomState; otherwise treat the argument as a seed.
    rng = (
        random_state
        if isinstance(random_state, np.random.RandomState)
        else np.random.RandomState(seed=random_state)
    )

    coo = interactions.tocoo()
    rows, cols, values = _shuffle(coo.row, coo.col, coo.data, rng)

    # Entries before this position go to train, the rest to test.
    split_at = int((1.0 - test_percentage) * len(rows))

    def _take(part):
        # Build one side of the split with the input's shape and dtype.
        return sp.coo_matrix(
            (values[part], (rows[part], cols[part])),
            shape=coo.shape,
            dtype=coo.dtype,
        )

    train = _take(slice(None, split_at))
    test = _take(slice(split_at, None))
    return train, test

In [None]:
# Hold out 20% of the stored interactions; the fixed seed makes the shuffle repeatable.
train_interactions, test_interactions = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=2,
)

In [None]:
train_interactions

<109264x5502 sparse matrix of type '<class 'numpy.float32'>'
	with 1040988 stored elements in COOrdinate format>

In [None]:
interactions.shape

(109264, 5502)

In [None]:
# WARP-loss LightFM model, trained for 5 epochs with the challenge side features.
model = LightFM(loss='warp', learning_rate=0.05, random_state=2019)
model.fit(train_interactions, item_features=challenge_features_sparse, epochs=5, verbose=True)

# Mean AUC over the training interactions.
train_auc = auc_score(model, train_interactions, item_features=challenge_features_sparse)
print(train_auc.mean())

# Mean AUC over the held-out interactions; the training matrix is passed alongside it.
test_auc = auc_score(
    model,
    test_interactions,
    train_interactions=train_interactions,
    item_features=challenge_features_sparse,
    num_threads=8,
)
print(test_auc.mean())

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
0.9779495
0.97198004


In [None]:
# user_id -> list of challenges that user has in the test log (excluded from recommendations below).
test_challenges_by_user = test_df.groupby('user_id')['challenge']
user_challenges_faced = test_challenges_by_user.apply(list).to_dict()

In [None]:
ss.shape[0]/3

39732.0

In [None]:
!pip install pandarallel

Collecting pandarallel
  Downloading https://files.pythonhosted.org/packages/99/06/bd582106766c483d6da51c05b0cdd7cb61894bb843c7ecc4789032232327/pandarallel-1.4.8.tar.gz
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.4.8-cp36-none-any.whl size=16112 sha256=2138f163a14fb68010a0efa9b86f03e891a219da79be12d69debff0f6819b05e
  Stored in directory: /root/.cache/pip/wheels/75/a2/85/b45be2e86d86e9ec5da6d05c4b994d18c81abe76e3f39415aa
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.4.8


In [None]:
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
def predict_test_sequence(input_data):
  """Return ``[user_id, top-3 challenges]`` for one user.

  ``input_data`` is the user id (anything ``int()`` accepts). Challenges the user
  already has in ``user_challenges_faced`` are removed from the candidates, the
  remaining ones are scored with the fitted model, and the three with the highest
  score are returned in descending score order.
  """
  user_id = int(input_data)
  already_faced = set(user_challenges_faced[user_id])
  new_challenges = list(set(unique_challenge_ids) - already_faced)
  candidate_indices = [item_id_map[challenge] for challenge in new_challenges]
  scores = model.predict(user_id_map[user_id], candidate_indices, item_features=challenge_features_sparse)
  # Sort (score, challenge) pairs from best to worst and keep the first three challenges.
  ranked_pairs = sorted(zip(scores, new_challenges), reverse=True)
  top_challenges = [challenge for _, challenge in ranked_pairs[:3]]
  return [user_id, top_challenges]
# Distinct user ids taken from the "<user>_<sequence>" keys of the submission template.
# Splitting on the last underscore matches the parsing in fill_recommendations and does not
# depend on the sequence suffix being exactly three characters long (as .str[:-3] did).
submission_user_ids = (
    ss['user_sequence']
    .str.rsplit('_', n=1)
    .str[0]
    .drop_duplicates()
    .reset_index(drop=True)
)

# user_id -> top-3 challenge list. Each result is a [user_id, challenges] pair, so the dict
# is built directly instead of expanding every pair into a pd.Series first.
user_challenge_recommendations = {
    user_id: top_challenges
    for user_id, top_challenges in submission_user_ids.parallel_apply(predict_test_sequence)
}

In [None]:
def fill_recommendations(input_data):
  """Look up the recommended challenge for one ``"<user>_<sequence>"`` key.

  The sequence number is turned into a list position by subtracting 11, so
  sequence 11 maps to the first recommendation stored for that user in
  ``user_challenge_recommendations``.
  """
  user_part, sequence_part = input_data.split("_")
  user_key = int(user_part)
  position = int(sequence_part) - 11
  return user_challenge_recommendations[user_key][position]

# Fill the submission's challenge column, one "<user>_<sequence>" key at a time,
# from the per-user recommendations (run in parallel through pandarallel).
ss['challenge'] = ss['user_sequence'].parallel_apply(fill_recommendations)

In [None]:
ss.head()

Unnamed: 0,user_sequence,challenge
0,4577_11,CI26051
1,4577_12,CI25126
2,4577_13,CI26052
3,4578_11,CI25126
4,4578_12,CI23691


In [None]:
# Write the filled-in submission into the hackathon folder; index=False leaves out the row index.
ss.to_csv(hackathon_path + "sample_submission.csv", index=False)

In [None]:
print("Note: LightFM alone gets a score of zero in submission files.")

Note: LightFM alone gets a score of zero in submission files.


In [None]:
cd /content/drive/My Drive/Hackathon/Recommendation system/

/content/drive/My Drive/Hackathon/Recommendation system


In [None]:
!git clone "https://github.com/CRIPAC-DIG/SR-GNN.git"

Cloning into 'SR-GNN'...
remote: Enumerating objects: 44, done.[K
remote: Total 44 (delta 0), reused 0 (delta 0), pack-reused 44[K
Unpacking objects: 100% (44/44), done.
