# LibRec

In [1]:
import os
import sys
import numpy as np
from IPython.display import display
import pandas as pd
import numpy as np
import logging

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)

HOME = os.environ['HOME']
PROJECT_DIR = '{}/projects/cikm2017/'.format(HOME)
CIKM_HDF5_DIR = os.path.join(PROJECT_DIR, 'data/hdf5/')

sys.path.append(PROJECT_DIR)
os.chdir(PROJECT_DIR)

from cikm import create_train_and_test_data, get_rating_density

# Paths to extraction files.
EXTRACTION_FILES = {'ta': os.path.join(HOME, 'data/tripadvisor/extractions-ta.csv'),
                    'ba': os.path.join(HOME, 'data/beeradvocate/extractions.csv'),
                    'yp': os.path.join(HOME, 'data/yelp/extractions.csv')}

# Paths to input explanation files.
EXPLANATION_H5_FMT = os.path.join(HOME, 'projects/explanations/hdf5', '{}-bw-nw-a00-sessions.h5')
EXPLANATION_PATHS = {'ta': EXPLANATION_H5_FMT.format('ta'),
                     'ba': EXPLANATION_H5_FMT.format('ba'),
                     'yp': EXPLANATION_H5_FMT.format('yp')}

# Valiidate all explanation and extractions paths.
for path in set(EXPLANATION_PATHS.values()).union(EXTRACTION_FILES.values()):
    assert os.path.isfile(path), 'Invalid file: {}'.format(path)



## Rating Densities

In [2]:
for d in ('ta', 'yp', 'ba'):
    print('Rating density of {}: {:.4f}'.format(d, get_rating_density(EXTRACTION_FILES['ta'])))

Rating density of ta: 0.0007
Rating density of yp: 0.0007
Rating density of ba: 0.0007


In [3]:
for dataset in ('ta', 'ba', 'yp'):
    print('Processing ' + dataset)
    _ = create_train_and_test_data(ext_path=EXTRACTION_FILES[dataset], 
                                   exp_path=EXPLANATION_PATHS[dataset])

INFO:Loading extractions from /Users/khalil/data/tripadvisor/extractions-ta.csv


Processing ta


INFO:Saving ratings, user and item ID maps to /Users/khalil/data/tripadvisor/librec-data
INFO:Loading extractions from /Users/khalil/data/beeradvocate/extractions.csv


Processing ba


INFO:Saving ratings, user and item ID maps to /Users/khalil/data/beeradvocate/librec-data
INFO:Loading extractions from /Users/khalil/data/yelp/extractions.csv


Processing yp


INFO:Saving ratings, user and item ID maps to /Users/khalil/data/yelp/librec-data


In [4]:
assert 1 == 2

AssertionError: 

In [6]:
LIBREC_DIR = '/Users/khalil/projects/librec/'

# Copy over files to LibRec directory.
from shutil import copyfile

for dataset in ('tripadvisor', 'beeradvocate', 'yelp'):
    ratings_csv_path = '/Users/khalil/data/{}/librec-data/ratings.csv'.format(dataset)
    test_csv_path = '/Users/khalil/data/{}/librec-data/test.csv'.format(dataset)
    dest_dir = os.path.join(LIBREC_DIR, 'data/{}/rating/'.format(dataset))
    os.makedirs(dest_dir, exist_ok=True)
    print(dest_dir)
    copyfile(ratings_csv_path, os.path.join(dest_dir, 'ratings.txt'))
    copyfile(test_csv_path, os.path.join(dest_dir, 'test.txt'))

/Users/khalil/projects/librec/data/tripadvisor/rating/
/Users/khalil/projects/librec/data/beeradvocate/rating/
/Users/khalil/projects/librec/data/yelp/rating/


`./bin/librec_dev rec -exec -D dfs.data.dir=./data -D dfs.result.dir=./result -D rec.recommender.class=biasedmf -D data.input.path=yelp/rating/ratings.txt -D data.testset.path=yelp/rating/test.txt -conf conf/librec.properties`

## Run LibRec

In [7]:
import subprocess

LIBREC_PROJET_DIR = '/Users/khalil/projects/librec/'
os.chdir(LIBREC_PROJET_DIR)

!pwd

algorithm = 'biasedmf'
dataset = 'beeradvocate'
print('Exists: ', os.path.exists(os.path.join(LIBREC_PROJET_DIR, 'bin/librec_dev')))

# Create command.
# cmd = "./bin/librec_dev rec -exec -D dfs.data.dir=./data -D dfs.result.dir=./result "
# cmd += "-D rec.recommender.class=biasedmf -D data.input.path=beeradvocate/rating/ratings.txt -D "  
# cmd += "data.testset.path=beeradvocate/rating/test.txt -conf conf/librec.properties"

for algorithm in ('svdpp', 'biasedmf'):
# for algorithm in ('biasedmf'):
    for dataset in ('tripadvisor', 'yelp', 'beeradvocate'):
        print('Dataset: {dataset}, Algorithm: {algorithm}'.format(algorithm=algorithm, dataset=dataset))
        cmd = "./bin/librec_dev rec -exec -D dfs.data.dir=./data -D dfs.result.dir=./result "
        cmd += "-D rec.recommender.class={algorithm} -D data.input.path={dataset}/rating/ratings.txt -D "  
        cmd += "data.testset.path={dataset}/rating/test.txt -conf conf/librec.properties"
        cmd = cmd.format(algorithm=algorithm, dataset=dataset)

        print(cmd)
    #     subprocess.check_call(cmd, shell=True)
        os.system(cmd)

/Users/khalil/projects/librec
Exists:  True
Dataset: tripadvisor, Algorithm: svdpp
./bin/librec_dev rec -exec -D dfs.data.dir=./data -D dfs.result.dir=./result -D rec.recommender.class=svdpp -D data.input.path=tripadvisor/rating/ratings.txt -D data.testset.path=tripadvisor/rating/test.txt -conf conf/librec.properties
Dataset: yelp, Algorithm: svdpp
./bin/librec_dev rec -exec -D dfs.data.dir=./data -D dfs.result.dir=./result -D rec.recommender.class=svdpp -D data.input.path=yelp/rating/ratings.txt -D data.testset.path=yelp/rating/test.txt -conf conf/librec.properties
Dataset: beeradvocate, Algorithm: svdpp
./bin/librec_dev rec -exec -D dfs.data.dir=./data -D dfs.result.dir=./result -D rec.recommender.class=svdpp -D data.input.path=beeradvocate/rating/ratings.txt -D data.testset.path=beeradvocate/rating/test.txt -conf conf/librec.properties
Dataset: tripadvisor, Algorithm: biasedmf
./bin/librec_dev rec -exec -D dfs.data.dir=./data -D dfs.result.dir=./result -D rec.recommender.class=biase

In [8]:
assert 1 == 2

AssertionError: 

**Note:** In `df_explanations`, the explanations that have `is_seed` as `True` are those that the user has rated or reviewed. If a user has rated/reviewed and item more than once, we will only consider it as a seed hotel once.

##### The training data
These are tuples (`user_id`, `item_id`, and `rating`) that the corresponds to items users have actually rated. This can be obtained from the explanations DataFrame (i.e. `df_explanations`) by querying all seed items. However this won't be complete since .... Therefore it is best to take this information from the list of extractions (i.e. `df_extractions`).

#### The test data
These are tuples (`user_id`, `item_id`, and `rating`) that the user could have rated. This means we have generated recommendations for them but we don't know what ratings the user might have given to them. This can be obtained from `df_explanations` by querying all target items that are not seeds.