@@ -0,0 +1,167 @@
"""Caching loader for the Book-Crossing Dataset
The description of the dataset is available on the official website at:
http://www.informatik.uni-freiburg.de/~cziegler/BX/
Quoting the introduction:
Collected by Cai-Nicolas Ziegler in a 4-week crawl
(August / September 2004) from the Book-Crossing community
with kind permission from Ron Hornbaker, CTO of Humankind
Systems. Contains 278,858 users (anonymized but with
demographic information) providing 1,149,780 ratings
(explicit / implicit) about 271,379 books.
This dataset loader will download the dataset,
whose compressed size is around 22 MB. Once
uncompressed, the train set is around 130 MB.
The data is downloaded, extracted and cached in the '~/scikit_crab_data'
folder.
References
----------
Improving Recommendation Lists Through Topic Diversification,
Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen;
Proceedings of the 14th International World Wide Web Conference (WWW '05),
May 10-14, 2005, Chiba, Japan.
"""
# Copyright (c) 2011 Marcel Caraciolo <marcel@muricoca.com>
# License: Simplified BSD

import os
import urllib
import logging
import zipfile
from os.path import dirname
from os.path import join
import numpy as np
from base import Bunch
import csv

logger = logging.getLogger(__name__)

# Location of the official Book-Crossing CSV dump on the dataset's website.
URL = "http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip"
# File name used for the downloaded archive inside the target directory.
ARCHIVE_NAME = "BX-CSV-Dump.zip"


def download_book_crossings(target_dir):
    """Download the Book-Crossing archive and extract its CSV files.

    Parameters
    ----------
    target_dir: directory where the archive is downloaded and the CSV
        files are extracted. Created if it does not exist.

    Returns
    -------
    archives: list of the CSV member names extracted from the zip.
    """
    archive_path = os.path.join(target_dir, ARCHIVE_NAME)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if not os.path.exists(archive_path):
        # NOTE(review): the module docstring quotes ~22 MB compressed;
        # confirm the size stated in this message.
        logger.warning("Downloading dataset from %s (77 MB)", URL)
        opener = urllib.urlopen(URL)
        try:
            archive_file = open(archive_path, 'wb')
            try:
                archive_file.write(opener.read())
            finally:
                archive_file.close()
        finally:
            opener.close()

    logger.info("Decompressing %s", archive_path)

    source_zip = zipfile.ZipFile(archive_path, 'r')
    archives = []
    try:
        # Only the CSV tables are of interest; skip any other member.
        for name in source_zip.namelist():
            if '.csv' in name:
                source_zip.extract(name, target_dir)
                archives.append(name)
    finally:
        source_zip.close()
    # The archive is no longer needed once the CSV files are extracted.
    os.remove(archive_path)

    return archives


def load_bookcrossings(data_home=None, download_if_missing=True,
                       implicit=False):
    """Load the Book-Crossing ratings dataset.

    Parameters
    ----------
    data_home: optional, default: None
        Storage folder for the dataset files. If None, the files are
        expected in (and downloaded to) a 'data/' folder next to this
        module.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download it from the source site.

    implicit: optional, False by default
        If True, load the implicit ratings expressed by rating 0,
        otherwise load the explicit ratings expressed by ratings 1-10.

    Returns
    -------
    A Bunch with the fields `data` (dict of user_id -> {item_id: rating}),
    `item_ids` (dict of ISBN -> book title), `user_ids` (None) and `DESCR`.

    Examples
    --------
    >>> from os.path import join
    >>> from os.path import dirname
    >>> from scikits.crab.datasets.book_crossing import load_bookcrossings
    >>> data_home = join(dirname(__file__), 'scikits/crab/datasets/tests/data/')
    >>> books = load_bookcrossings(data_home)
    >>> len(books.data)
    21
    >>> len(books.item_ids)
    100
    """
    if data_home:
        if not os.path.exists(data_home):
            os.makedirs(data_home)
    else:
        data_home = join(dirname(__file__), 'data/')

    ratings_path = os.path.join(data_home, 'BX-Book-Ratings.csv')
    books_path = os.path.join(data_home, 'BX-Books.csv')

    # BUG FIX: the previous check combined the two files with `and` and
    # relied on open() of the books file (which raises IOError for a
    # missing file instead of returning a falsy value), so the intent
    # "both CSV files must be present" was not expressed correctly.
    if not os.path.exists(ratings_path) or not os.path.exists(books_path):
        print(80 * '_')
        print('Loading files failed')
        print(80 * '_')

        if download_if_missing:
            print('downloading the dataset...')
            try:
                download_book_crossings(data_home)
            except Exception:
                raise Exception('FAIL: Problems during the download.')
            print('dataset downloaded.')
        else:
            raise IOError('Book-Crossing dataset not found')

    # np.loadtxt cannot parse this file (quoted, ';'-separated fields),
    # so the ratings are read with the csv module instead.
    ratings_file = open(ratings_path)
    try:
        ratings_m = csv.reader(ratings_file, delimiter=';')
        next(ratings_m)  # skip the CSV header row
        data_books = {}
        if implicit:
            # Implicit feedback is encoded with the rating value "0".
            for user_id, item_id, rating in ratings_m:
                if rating == "0":
                    data_books.setdefault(user_id, {})
                    data_books[user_id][item_id] = True
        else:
            # BUG FIX: the rating was previously compared against the
            # *string* "0" after conversion to int, which is always
            # unequal, so implicit (0) ratings leaked into the explicit
            # data. Compare against the integer 0 instead.
            for user_id, item_id, rating in ratings_m:
                rating = int(rating)
                if rating != 0:
                    data_books.setdefault(user_id, {})
                    data_books[user_id][item_id] = rating
    finally:
        ratings_file.close()

    # Read the (ISBN, title) columns of the books table.
    data_titles = np.loadtxt(books_path, delimiter=';',
                             usecols=(0, 1), dtype=str)
    data_titles = dict((item_id, label) for item_id, label in data_titles)

    fdescr = open(join(dirname(__file__), 'descr', 'book-crossing.rst'))
    try:
        descr = fdescr.read()
    finally:
        fdescr.close()

    return Bunch(data=data_books, item_ids=data_titles,
                 user_ids=None, DESCR=descr)

This file was deleted.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -0,0 +1,35 @@
Jack Matthews;Lady in the Water;3.0
Jack Matthews;Snakes on a Planet;4.0
Jack Matthews;You, Me and Dupree;3.5
Jack Matthews;Superman Returns;5.0
Jack Matthews;The Night Listener;3.0
Mick LaSalle;Lady in the Water;3.0
Mick LaSalle;Snakes on a Planet;4.0
Mick LaSalle;Just My Luck;2.0
Mick LaSalle;Superman Returns;3.0
Mick LaSalle;You, Me and Dupree;2.0
Mick LaSalle;The Night Listener;3.0
Claudia Puig;Snakes on a Planet;3.5
Claudia Puig;Just My Luck;3.0
Claudia Puig;You, Me and Dupree;2.5
Claudia Puig;Superman Returns;4.0
Claudia Puig;The Night Listener;4.5
Lisa Rose;Lady in the Water;2.5
Lisa Rose;Snakes on a Planet;3.5
Lisa Rose;Just My Luck;3.0
Lisa Rose;Superman Returns;3.5
Lisa Rose;The Night Listener;3.0
Lisa Rose;You, Me and Dupree;2.5
Toby;Snakes on a Planet;4.5
Toby;Superman Returns;4.0
Toby;You, Me and Dupree;1.0
Gene Seymour;Lady in the Water;3.0
Gene Seymour;Snakes on a Planet;3.5
Gene Seymour;Just My Luck;1.5
Gene Seymour;Superman Returns;5.0
Gene Seymour;You, Me and Dupree;3.5
Gene Seymour;The Night Listener;3.0
Michael Phillips;Lady in the Water;2.5
Michael Phillips;Snakes on a Planet;3.0
Michael Phillips;Superman Returns;3.5
Michael Phillips;The Night Listener;4.0
@@ -0,0 +1,49 @@
Angelica,The Strokes,2.5
Angelica,Blues Traveler,3.5
Angelica,Phoenix,5
Angelica,Broken Bells,2
Angelica,Norah Jones,4.5
Angelica,Slightly Stoopid,1.5
Angelica,Vampire Weekend,2
Veronica,The Strokes,3
Veronica,Slightly Stoopid,2.5
Veronica,Norah Jones,5
Veronica,Phoenix,4
Veronica,Blues Traveler,3
Sam,The Strokes,5
Sam,Blues Traveler,5
Sam,Phoenix,5
Sam,Broken Bells,2
Sam,Norah Jones,3
Sam,Slightly Stoopid,4
Jordyn,The Strokes,4
Jordyn,Phoenix,5
Jordyn,Broken Bells,4.5
Jordyn,Deadmau5,4
Jordyn,Norah Jones,5
Jordyn,Slightly Stoopid,4.5
Jordyn,Vampire Weekend,4
Dan,The Strokes,4
Dan,Blues Traveler,3
Dan,Phoenix,3
Dan,Broken Bells,4
Dan,Deadmau5,4.5
Dan,Slightly Stoopid,4.5
Dan,Vampire Weekend,2
Bill,Blues Traveler,2
Bill,Phoenix,2
Bill,Broken Bells,3.5
Bill,Deadmau5,4
Bill,Slightly Stoopid,3.5
Bill,Vampire Weekend,3
Chan,Blues Traveler,5
Chan,Phoenix,5
Chan,Broken Bells,1
Chan,Deadmau5,1
Chan,Norah Jones,3
Chan,Slightly Stoopid,1
Hailey,Broken Bells,4
Hailey,Deadmau5,1
Hailey,Norah Jones,4
Hailey,Vampire Weekend,1
Hailey,The Strokes,4
@@ -0,0 +1,27 @@
Book-Crossing Dataset ... mined by Cai-Nicolas Ziegler, DBIS Freiburg
http://www.informatik.uni-freiburg.de/~cziegler/BX/

Notes
-----
Collected by Cai-Nicolas Ziegler in a 4-week crawl (August / September 2004) from the Book-Crossing community with kind permission from Ron Hornbaker, CTO of Humankind Systems. Contains 278,858 users (anonymized but with demographic information) providing 1,149,780 ratings (explicit / implicit) about 271,379 books.

[Freely available for research use when acknowledged with the following reference (further details on the dataset are given in this publication):
Improving Recommendation Lists Through Topic Diversification,
Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen; Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan. To appear.


This data set consists of
* 1,149,780 ratings (explicit / implicit) with (0-10) from 278,858 users to 271,379 books.

Format
------

The Book-Crossing dataset comprises 3 tables.
*BX-Users
Contains the users. Note that user IDs (`User-ID`) have been anonymized and map to integers. Demographic data is provided (`Location`, `Age`) if available. Otherwise, these fields contain NULL-values.
*BX-Books
Books are identified by their respective ISBN. Invalid ISBNs have already been removed from the dataset. Moreover, some content-based information is given (`Book-Title`, `Book-Author`, `Year-Of-Publication`, `Publisher`), obtained from Amazon Web Services. Note that in case of several authors, only the first is provided. URLs linking to cover images are also given, appearing in three different flavours (`Image-URL-S`, `Image-URL-M`, `Image-URL-L`), i.e., small, medium, large. These URLs point to the Amazon web site.
*BX-Book-Ratings
Contains the book rating information. Ratings (`Book-Rating`) are either explicit, expressed on a scale from 1-10 (higher values denoting higher appreciation), or implicit, expressed by 0.
@@ -0,0 +1,145 @@
SUMMARY & USAGE LICENSE
=============================================

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.

This data set consists of:
* 100,000 ratings (1-5) from 943 users on 1682 movies.
* Each user has rated at least 20 movies.
* Simple demographic info for the users (age, gender, occupation, zip)

The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th,
1997 through April 22nd, 1998. This data has been cleaned up - users
who had less than 20 ratings or did not have complete demographic
information were removed from this data set. Detailed descriptions of
the data file can be found at the end of this file.

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set. The data set may be used for any research
purposes under the following conditions:

* The user may not state or imply any endorsement from the
University of Minnesota or the GroupLens Research Group.

* The user must acknowledge the use of the data set in
publications resulting from the use of the data set, and must
send us an electronic or paper copy of those publications.

* The user may not redistribute the data without separate
permission.

* The user may not use this information for any commercial or
revenue-bearing purposes without first obtaining permission
from a faculty member of the GroupLens Research Project at the
University of Minnesota.

If you have any further questions or comments, please contact Jon Herlocker
<herlocke@cs.umn.edu>.

ACKNOWLEDGEMENTS
==============================================

Thanks to Al Borchers for cleaning up this data and writing the
accompanying scripts.

PUBLISHED WORK THAT HAS USED THIS DATASET
==============================================

Herlocker, J., Konstan, J., Borchers, A., Riedl, J.. An Algorithmic
Framework for Performing Collaborative Filtering. Proceedings of the
1999 Conference on Research and Development in Information
Retrieval. Aug. 1999.

FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
==============================================

The GroupLens Research Project is a research group in the Department
of Computer Science and Engineering at the University of Minnesota.
Members of the GroupLens Research Project are involved in many
research projects related to the fields of information filtering,
collaborative filtering, and recommender systems. The project is lead
by professors John Riedl and Joseph Konstan. The project began to
explore automated collaborative filtering in 1992, but is most well
known for its world wide trial of an automated collaborative filtering
system for Usenet news in 1996. The technology developed in the
Usenet trial formed the base for the formation of Net Perceptions,
Inc., which was founded by members of GroupLens Research. Since then
the project has expanded its scope to research overall information
filtering solutions, integrating in content-based methods as well as
improving current collaborative filtering technology.

Further information on the GroupLens Research project, including
research publications, can be found at the following web site:

http://www.grouplens.org/

GroupLens Research currently operates a movie recommender based on
collaborative filtering:

http://www.movielens.org/

DETAILED DESCRIPTIONS OF DATA FILES
==============================================

Here are brief descriptions of the data.

ml-data.tar.gz -- Compressed tar file. To rebuild the u data files do this:
gunzip ml-data.tar.gz
tar xvf ml-data.tar
mku.sh

u.data -- The full u data set, 100000 ratings by 943 users on 1682 items.
Each user has rated at least 20 movies. Users and items are
numbered consecutively from 1. The data is randomly
ordered. This is a tab separated list of
user id | item id | rating | timestamp.
The time stamps are unix seconds since 1/1/1970 UTC

u.info -- The number of users, items, and ratings in the u data set.

u.item -- Information about the items (movies); this is a tab separated
list of
movie id | movie title | release date | video release date |
IMDb URL | unknown | Action | Adventure | Animation |
Children's | Comedy | Crime | Documentary | Drama | Fantasy |
Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
Thriller | War | Western |
The last 19 fields are the genres, a 1 indicates the movie
is of that genre, a 0 indicates it is not; movies can be in
several genres at once.
The movie ids are the ones used in the u.data data set.

u.genre -- A list of the genres.

u.user -- Demographic information about the users; this is a tab
separated list of
user id | age | gender | occupation | zip code
The user ids are the ones used in the u.data data set.

u.occupation -- A list of the occupations.

u1.base -- The data sets u1.base and u1.test through u5.base and u5.test
u1.test are 80%/20% splits of the u data into training and test data.
u2.base Each of u1, ..., u5 have disjoint test sets; this if for
u2.test 5 fold cross validation (where you repeat your experiment
u3.base with each training and test set and average the results).
u3.test These data sets can be generated from u.data by mku.sh.
u4.base
u4.test
u5.base
u5.test

ua.base -- The data sets ua.base, ua.test, ub.base, and ub.test
ua.test split the u data into a training set and a test set with
ub.base exactly 10 ratings per user in the test set. The sets
ub.test ua.test and ub.test are disjoint. These data sets can
be generated from u.data by mku.sh.

allbut.pl -- The script that generates training and test sets where
all but n of a users ratings are in the training data.

mku.sh -- A shell script to generate all the u data sets from u.data.
@@ -0,0 +1,7 @@
sample_movies data set was collected from the book
Programming Collective Intelligence by Toby Segaran

Notes
-----
This data set consists of
* 35 ratings with (1-5) from 7 users to 6 movies.
@@ -0,0 +1,7 @@
Sample_song data set was collected from the e-book called
A Programmer's Guide to Data Mining - http://guidetodatamining.com

Notes
-----
This data set consists of
* 49 ratings with (1-5) from 8 users to 8 songs.
Empty file.
@@ -0,0 +1,100 @@
"User-ID";"ISBN";"Book-Rating"
"276725";"034545104X";"0"
"276726";"0155061224";"5"
"276727";"0446520802";"0"
"276729";"052165615X";"3"
"276729";"0521795028";"6"
"276733";"2080674722";"0"
"276736";"3257224281";"8"
"276737";"0600570967";"6"
"276744";"038550120X";"7"
"276745";"342310538";"10"
"276746";"0425115801";"0"
"276746";"0449006522";"0"
"276746";"0553561618";"0"
"276746";"055356451X";"0"
"276746";"0786013990";"0"
"276746";"0786014512";"0"
"276747";"0060517794";"9"
"276747";"0451192001";"0"
"276747";"0609801279";"0"
"276747";"0671537458";"9"
"276747";"0679776818";"8"
"276747";"0943066433";"7"
"276747";"1570231028";"0"
"276747";"1885408226";"7"
"276748";"0747558167";"6"
"276748";"3442437407";"0"
"276751";"033390804X";"0"
"276751";"3596218098";"8"
"276754";"0684867621";"8"
"276755";"0451166892";"5"
"276760";"8440682697";"10"
"276762";"034544003X";"0"
"276762";"0380000059";"0"
"276762";"0380711524";"5"
"276762";"0451167317";"0"
"276762";"0451454952";"0"
"276762";"0843920262";"0"
"276762";"3404122879";"0"
"276762";"3404182928";"0"
"276762";"3404611306";"0"
"276762";"342662429";"0"
"276762";"3426690179";"0"
"276762";"3442424216";"0"
"276762";"3442425573";"0"
"276762";"3453092007";"8"
"276762";"3453157745";"0"
"276762";"3453176944";"0"
"276762";"3453185137";"0"
"276762";"3453185323";"0"
"276762";"3453213025";"3"
"276762";"3453877241";"0"
"276762";"3492226604";"0"
"276762";"3517017442";"0"
"276762";"3596125006";"0"
"276762";"B0000BLD7X";"0"
"276762";"N3453124715";"4"
"276765";"9029716894";"0"
"276768";"9057868059";"4"
"276772";"0140279091";"0"
"276772";"0553572369";"7"
"276772";"0571058086";"0"
"276772";"3499230933";"10"
"276772";"3596151465";"10"
"276774";"0099543818";"0"
"276774";"3404147723";"0"
"276774";"3423111321";"0"
"276774";"3442136644";"9"
"276774";"3492232000";"0"
"276780";"8434811634";"0"
"276780";"8484330478";"7"
"276780";"8484332039";"7"
"276786";"2864322102";"6"
"276786";"8402065945";"0"
"276786";"8423314901";"0"
"276786";"842333533X";"0"
"276786";"8427911769";"0"
"276786";"8433914456";"0"
"276786";"8437606322";"8"
"276786";"8445072919";"0"
"276786";"8466300821";"6"
"276786";"847765011X";"0"
"276786";"8478442588";"6"
"276786";"8495368099";"0"
"276788";"0345443683";"8"
"276788";"043935806X";"7"
"276788";"055310666X";"10"
"276796";"0330332775";"5"
"276796";"0330367358";"0"
"276798";"0006379702";"5"
"276798";"3423084049";"0"
"276798";"3442131340";"7"
"276798";"3442437407";"0"
"276798";"3446202102";"0"
"276798";"3453073398";"0"
"276798";"3453115783";"0"
"276798";"3499134004";"0"
"276798";"349915398X";"0"
"276798";"3548603203";"6"
"276798";"3764501383";"0"

Large diffs are not rendered by default.

@@ -0,0 +1,65 @@
from os.path import join
from os.path import dirname
from nose.tools import assert_equal, assert_raises
from ..base import load_movielens_r100k, load_sample_songs, load_sample_movies
from ..book_crossing import load_bookcrossings
from ...utils.testing import assert_in


def test_movielens_r100k():
    """Check the MovieLens 100k loader, with and without timestamps."""
    expected_keys = ['data', 'item_ids', 'user_ids', 'DESCR']

    # Loader without timestamps.
    dataset = load_movielens_r100k()
    assert_in(dataset, in_=expected_keys)
    assert_equal(len(dataset.data), 943)
    assert_equal(len(dataset.item_ids), 1682)
    n_ratings = sum(len(prefs) for prefs in dataset.data.itervalues())
    assert_equal(n_ratings, 100000)

    # Loader with timestamps: each preference becomes (timestamp, rating).
    dataset = load_movielens_r100k(True)
    assert_in(dataset, in_=expected_keys)
    assert_equal(len(dataset.data), 943)
    assert_equal(len(dataset.item_ids), 1682)
    assert_equal(dataset.data[1][1], (874965758, 5))
    n_ratings = sum(len(prefs) for prefs in dataset.data.itervalues())
    assert_equal(n_ratings, 100000)


def test_sample_songs():
    """Check the bundled sample-songs loader: 8 users, 8 songs, 49 ratings."""
    dataset = load_sample_songs()
    assert_in(dataset, in_=['data', 'item_ids', 'user_ids', 'DESCR'])
    assert_equal(len(dataset.data), 8)
    assert_equal(len(dataset.item_ids), 8)
    total = sum(len(prefs) for prefs in dataset.data.itervalues())
    assert_equal(total, 49)


def test_sample_movies():
    """Check the bundled sample-movies loader: 7 users, 6 movies, 35 ratings."""
    dataset = load_sample_movies()
    assert_in(dataset, in_=['data', 'item_ids', 'user_ids', 'DESCR'])
    assert_equal(len(dataset.data), 7)
    assert_equal(len(dataset.item_ids), 6)
    total = sum(len(prefs) for prefs in dataset.data.itervalues())
    assert_equal(total, 35)


def test_load_bookcrossings():
    """Exercise the Book-Crossing loader against the bundled sample data."""
    data_home = join(dirname(__file__), 'data/')

    # Explicit ratings from tests/data.
    dataset = load_bookcrossings(data_home)
    assert_equal(len(dataset.data), 26)
    assert_equal(len(dataset.item_ids), 100)
    total = sum(len(prefs) for prefs in dataset.data.itervalues())
    assert_equal(total, 99)

    # Implicit ratings (rating 0) from tests/data.
    dataset = load_bookcrossings(data_home, implicit=True)
    assert_equal(len(dataset.data), 15)
    assert_equal(len(dataset.item_ids), 100)
    total = sum(len(prefs) for prefs in dataset.data.itervalues())
    assert_equal(total, 60)

    # A data_home without the CSV files and download denied must raise.
    data_home = dirname(__file__)
    assert_raises(IOError, load_bookcrossings, data_home, implicit=False,
                  download_if_missing=False)
@@ -1,2 +1,2 @@
from .classes import DictPreferenceDataModel, MatrixPreferenceDataModel, \
MatrixBooleanPrefDataModel, DictBooleanPrefDataModel
from .classes import MatrixPreferenceDataModel, \
MatrixBooleanPrefDataModel

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -63,7 +63,7 @@ class ItemBasedRecommender(ItemRecommender):
Examples
-----------
>>> from scikits.crab.models.classes import DictPreferenceDataModel
>>> from scikits.crab.models.classes import MatrixPreferenceDataModel
>>> from scikits.crab.recommenders.knn.classes import ItemBasedRecommender
>>> from scikits.crab.similarities.basic_similarities import ItemSimilarity
>>> from scikits.crab.recommenders.knn.item_strategies import ItemsNeighborhoodStrategy
@@ -89,7 +89,7 @@ class ItemBasedRecommender(ItemRecommender):
'Penny Frewman': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0, \
'Superman Returns':4.0}, \
'Maria Gabriela': {}}
>>> model = DictPreferenceDataModel(movies)
>>> model = MatrixPreferenceDataModel(movies)
>>> items_strategy = ItemsNeighborhoodStrategy()
>>> similarity = ItemSimilarity(model, euclidean_distances)
>>> recsys = ItemBasedRecommender(model, similarity, items_strategy)
@@ -389,7 +389,7 @@ class UserBasedRecommender(UserRecommender):
Examples
-----------
>>> from scikits.crab.models.classes import DictPreferenceDataModel
>>> from scikits.crab.models.classes import MatrixPreferenceDataModel
>>> from scikits.crab.recommenders.knn.classes import UserBasedRecommender
>>> from scikits.crab.similarities.basic_similarities import UserSimilarity
>>> from scikits.crab.recommenders.knn.neighborhood_strategies import NearestNeighborsStrategy
@@ -415,7 +415,7 @@ class UserBasedRecommender(UserRecommender):
'Penny Frewman': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0, \
'Superman Returns':4.0}, \
'Maria Gabriela': {}}
>>> model = DictPreferenceDataModel(movies)
>>> model = MatrixPreferenceDataModel(movies)
>>> nhood_strategy = NearestNeighborsStrategy()
>>> similarity = UserSimilarity(model, euclidean_distances)
>>> recsys = UserBasedRecommender(model, similarity, nhood_strategy)

Large diffs are not rendered by default.

@@ -1,7 +1,7 @@
import numpy as np
from numpy.testing import assert_array_equal
from nose.tools import assert_raises
from ....models.classes import DictPreferenceDataModel, MatrixPreferenceDataModel
from ....models.classes import MatrixPreferenceDataModel
from ..item_strategies import ItemsNeighborhoodStrategy, AllPossibleItemsStrategy
from ....models.utils import UserNotFoundError

@@ -28,76 +28,43 @@

def test_ItemsNeighborhoodStrategy():
#Empty Dataset
model = DictPreferenceDataModel({})
strategy = ItemsNeighborhoodStrategy()
assert_raises(UserNotFoundError, strategy.candidate_items, 'Lorena Abreu', model)

model = MatrixPreferenceDataModel({})
strategy = ItemsNeighborhoodStrategy()
assert_raises(UserNotFoundError, strategy.candidate_items, 'Lorena Abreu', model)

#Possible candidates
model = DictPreferenceDataModel(movies)
strategy = ItemsNeighborhoodStrategy()
assert_array_equal(np.array(['Lady in the Water']), strategy.candidate_items('Lorena Abreu', model))

model = MatrixPreferenceDataModel(movies)
strategy = ItemsNeighborhoodStrategy()
assert_array_equal(np.array(['Lady in the Water']), strategy.candidate_items('Lorena Abreu', model))

#Empty candidates
model = DictPreferenceDataModel(movies)
strategy = ItemsNeighborhoodStrategy()
assert_array_equal(np.array([], dtype='|S'), strategy.candidate_items('Marcel Caraciolo', model))

model = MatrixPreferenceDataModel(movies)
strategy = ItemsNeighborhoodStrategy()
assert_array_equal(np.array([], dtype='|S'), strategy.candidate_items('Marcel Caraciolo', model))

#Empty candidates
model = DictPreferenceDataModel(movies)
strategy = ItemsNeighborhoodStrategy()
assert_array_equal(np.array([], dtype=bool), strategy.candidate_items('Maria Gabriela', model))

model = MatrixPreferenceDataModel(movies)
strategy = ItemsNeighborhoodStrategy()
assert_array_equal(np.array([], dtype=bool), strategy.candidate_items('Maria Gabriela', model))


def test_AllPossibleItemsStrategy():
#Empty Dataset
model = DictPreferenceDataModel({})
strategy = AllPossibleItemsStrategy()
assert_raises(UserNotFoundError, strategy.candidate_items, 'Lorena Abreu', model)

model = MatrixPreferenceDataModel({})
strategy = AllPossibleItemsStrategy()
assert_raises(UserNotFoundError, strategy.candidate_items, 'Lorena Abreu', model)

#Possible candidates
model = DictPreferenceDataModel(movies)
strategy = AllPossibleItemsStrategy()
assert_array_equal(np.array(['Lady in the Water']), strategy.candidate_items('Lorena Abreu', model))

model = MatrixPreferenceDataModel(movies)
strategy = AllPossibleItemsStrategy()
assert_array_equal(np.array(['Lady in the Water']), strategy.candidate_items('Lorena Abreu', model))

#Empty candidates
model = DictPreferenceDataModel(movies)
strategy = AllPossibleItemsStrategy()
assert_array_equal(np.array([], dtype='|S'), strategy.candidate_items('Marcel Caraciolo', model))

model = MatrixPreferenceDataModel(movies)
strategy = AllPossibleItemsStrategy()
assert_array_equal(np.array([], dtype='|S'), strategy.candidate_items('Marcel Caraciolo', model))

#Empty candidates
model = DictPreferenceDataModel(movies)
strategy = AllPossibleItemsStrategy()
assert_array_equal(np.array(['Just My Luck', 'Lady in the Water', 'Snakes on a Plane',
'Superman Returns', 'The Night Listener', 'You, Me and Dupree']), strategy.candidate_items('Maria Gabriela', model))

model = MatrixPreferenceDataModel(movies)
strategy = AllPossibleItemsStrategy()
assert_array_equal(np.array(['Just My Luck', 'Lady in the Water', 'Snakes on a Plane',
@@ -1,9 +1,7 @@
import numpy as np
from numpy.testing import assert_array_equal
from nose.tools import assert_raises
from ....models.classes import DictPreferenceDataModel, MatrixPreferenceDataModel
from ....models.classes import MatrixPreferenceDataModel
from ..neighborhood_strategies import AllNeighborsStrategy, NearestNeighborsStrategy
from ....models.utils import UserNotFoundError


movies = {'Marcel Caraciolo': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
@@ -28,20 +26,11 @@

def test_AllNeighborsStrategy():
#Empty Dataset
model = DictPreferenceDataModel({})
strategy = AllNeighborsStrategy()
assert_array_equal(np.array([]), strategy.user_neighborhood('Lorena Abreu', model))

model = MatrixPreferenceDataModel({})
strategy = AllNeighborsStrategy()
assert_array_equal(np.array([]), strategy.user_neighborhood('Lorena Abreu', model))

#Possible candidates
model = DictPreferenceDataModel(movies)
strategy = AllNeighborsStrategy()
assert_array_equal(np.array(['Leopoldo Pires', 'Luciana Nunes', 'Marcel Caraciolo',
'Maria Gabriela', 'Penny Frewman', 'Sheldom', 'Steve Gates']), strategy.user_neighborhood('Lorena Abreu', model))

model = MatrixPreferenceDataModel(movies)
strategy = AllNeighborsStrategy()
assert_array_equal(np.array(['Leopoldo Pires', 'Luciana Nunes', 'Marcel Caraciolo',
@@ -50,21 +39,11 @@ def test_AllNeighborsStrategy():

def test_NearestNeighborsStrategy():
#Empty Dataset
model = DictPreferenceDataModel({})
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array([]), strategy.user_neighborhood('Lorena Abreu', model))

model = MatrixPreferenceDataModel({})
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array([]), strategy.user_neighborhood('Lorena Abreu', model))

#Possible candidates
model = DictPreferenceDataModel(movies)
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array(['Leopoldo Pires', 'Marcel Caraciolo', 'Penny Frewman',
'Sheldom', 'Steve Gates', 'Luciana Nunes'], dtype='|S16'),
strategy.user_neighborhood('Lorena Abreu', model))

model = MatrixPreferenceDataModel(movies)
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array(['Leopoldo Pires', 'Marcel Caraciolo', 'Penny Frewman',
@@ -84,31 +63,21 @@ def test_NearestNeighborsStrategy():
assert_array_equal(np.array(['Leopoldo Pires']),
strategy.user_neighborhood(user_id='Lorena Abreu', data_model=model,
minimal_similarity=0.4))
#Empty candidates
model = DictPreferenceDataModel(movies)
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array(['Leopoldo Pires', 'Steve Gates', 'Lorena Abreu', 'Penny Frewman',
'Sheldom', 'Luciana Nunes'], dtype='|S14'),
strategy.user_neighborhood('Marcel Caraciolo', model))

#Empty candidates
model = MatrixPreferenceDataModel(movies)
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array(['Leopoldo Pires', 'Steve Gates', 'Lorena Abreu', 'Penny Frewman',
'Sheldom', 'Luciana Nunes'], dtype='|S14'),
strategy.user_neighborhood('Marcel Caraciolo', model))

#Empty candidates
model = DictPreferenceDataModel(movies)
model = MatrixPreferenceDataModel(movies)
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array([], dtype=bool), strategy.user_neighborhood('Maria Gabriela', model))

#Raise exception with an invalid similarity
#Empty candidates
model = DictPreferenceDataModel(movies)
strategy = NearestNeighborsStrategy()
assert_raises(ValueError, strategy.user_neighborhood,
user_id='Lorena Abreu', data_model=model, n_similarity='item_similarity')

model = MatrixPreferenceDataModel(movies)
strategy = NearestNeighborsStrategy()
assert_array_equal(np.array([], dtype=bool), strategy.user_neighborhood('Maria Gabriela', model))
@@ -1,9 +1,9 @@
import numpy as np
from numpy.testing import assert_array_equal
from nose.tools import assert_raises, assert_equals, assert_almost_equals
from nose.tools import assert_equals, assert_almost_equals
from ...knn.item_strategies import AllPossibleItemsStrategy, ItemsNeighborhoodStrategy
from ....models.classes import DictPreferenceDataModel, MatrixPreferenceDataModel, \
DictBooleanPrefDataModel, MatrixBooleanPrefDataModel
from ....models.classes import MatrixPreferenceDataModel, \
MatrixBooleanPrefDataModel
from ..classes import MatrixFactorBasedRecommender


@@ -26,9 +26,7 @@
'Penny Frewman': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0, 'Superman Returns': 4.0},
'Maria Gabriela': {}}

dict_model = DictPreferenceDataModel(movies)
matrix_model = MatrixPreferenceDataModel(movies)
boolean_model = DictBooleanPrefDataModel(movies)
boolean_matrix_model = MatrixBooleanPrefDataModel(movies)


@@ -51,11 +49,6 @@ def test_create_MatrixFactorBasedRecommender():
assert_equals(recsys.item_factors.shape, (6, 2))
assert_equals(recsys._global_bias, 3.2285714285714286)

assert_raises(TypeError, MatrixFactorBasedRecommender,
model=dict_model,
items_selection_strategy=items_strategy,
n_features=2)


def test_all_other_items_MatrixFactorBasedRecommender():
items_strategy = AllPossibleItemsStrategy()
@@ -61,7 +61,7 @@ class UserSimilarity(BaseSimilarity):
Examples
---------
>>> from scikits.crab.models.classes import DictPreferenceDataModel
>>> from scikits.crab.models.classes import MatrixPreferenceDataModel
>>> from scikits.crab.metrics.pairwise import cosine_distances
>>> from scikits.crab.similarities.basic_similarities import UserSimilarity
>>> movies = {'Marcel Caraciolo': {'Lady in the Water': 2.5, \
@@ -85,7 +85,7 @@ class UserSimilarity(BaseSimilarity):
'Penny Frewman': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0, \
'Superman Returns':4.0}, \
'Maria Gabriela': {}}
>>> model = DictPreferenceDataModel(movies)
>>> model = MatrixPreferenceDataModel(movies)
>>> similarity = UserSimilarity(model, cosine_distances, 3)
>>> similarity['Marcel Caraciolo']
[('Marcel Caraciolo', 1.0), ('Sheldom', 0.99127582693458016),
@@ -165,7 +165,7 @@ class ItemSimilarity(BaseSimilarity):
Examples
---------
>>> from scikits.crab.models.classes import DictPreferenceDataModel
>>> from scikits.crab.models.classes import MatrixPreferenceDataModel
>>> from scikits.crab.metrics.pairwise import cosine_distances
>>> from scikits.crab.similarities.basic_similarities import ItemSimilarity
>>> movies = {'Marcel Caraciolo': {'Lady in the Water': 2.5, \
@@ -189,7 +189,7 @@ class ItemSimilarity(BaseSimilarity):
'Penny Frewman': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0, \
'Superman Returns':4.0}, \
'Maria Gabriela': {}}
>>> model = DictPreferenceDataModel(movies)
>>> model = MatrixPreferenceDataModel(movies)
>>> similarity = ItemSimilarity(model, cosine_distances, 3)
>>> similarity['The Night Listener']
[('The Night Listener', 1.0), ('Lady in the Water', 0.98188311415053031),

Large diffs are not rendered by default.

Empty file.
@@ -0,0 +1,15 @@
"""Testing utilities."""

# Copyright (c) 2011 Marcel Caraciolo <marcel@muricoca.com>
# License: Simplified BSD


def assert_in(obj, in_=None, out_=None):
    """Assert that every name in `in_` is contained in `obj`
    and that no name in `out_` is.
    """
    for name in (in_ or ()):
        assert name in obj
    for name in (out_ or ()):
        assert name not in obj