| @@ -0,0 +1,167 @@ | ||
| """Caching loader for the Book-Crossing Dataset | ||
| The description of the dataset is available on the official website at: | ||
| http://www.informatik.uni-freiburg.de/~cziegler/BX/ | ||
| Quoting the introduction: | ||
| Collected by Cai-Nicolas Ziegler in a 4-week crawl | ||
| (August / September 2004) from the Book-Crossing community | ||
| with kind permission from Ron Hornbaker, CTO of Humankind | ||
| Systems. Contains 278,858 users (anonymized but with | ||
| demographic information) providing 1,149,780 ratings | ||
| (explicit / implicit) about 271,379 books. | ||
| This dataset loader will download the dataset, | ||
| whose size is around 22 MB compressed. Once | ||
| uncompressed the train set is around 130 MB. | ||
| The data is downloaded, extracted and cached in the '~/scikit_crab_data' | ||
| folder. | ||
| References | ||
| ---------- | ||
| Improving Recommendation Lists Through Topic Diversification, | ||
| Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen; | ||
| Proceedings of the 14th International World Wide Web Conference (WWW '05), | ||
| May 10-14, 2005, Chiba, Japan. | ||
| """ | ||
| # Copyright (c) 2011 Marcel Caraciolo <marcel@muricoca.com> | ||
| # License: Simplified BSD | ||
|
|
||
| import os | ||
| import urllib | ||
| import logging | ||
| import zipfile | ||
| from os.path import dirname | ||
| from os.path import join | ||
| import numpy as np | ||
| from base import Bunch | ||
| import csv | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
| URL = "http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip" | ||
| ARCHIVE_NAME = "BX-CSV-Dump.zip" | ||
|
|
||
|
|
||
def download_book_crossings(target_dir):
    """Download the Book-Crossing archive and extract its CSV files.

    Parameters
    ----------
    target_dir: string
        Directory where the archive is stored and the CSV files are
        extracted. It is created if it does not exist.

    Returns
    -------
    archives: list of strings
        The names of the CSV files extracted from the archive.
    """
    archive_path = os.path.join(target_dir, ARCHIVE_NAME)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if not os.path.exists(archive_path):
        # The compressed dump is ~22 MB (the old message said 77 MB).
        logger.warning("Downloading dataset from %s (22 MB)", URL)
        opener = urllib.urlopen(URL)
        # Close the file explicitly so the archive is flushed to disk
        # even if reading from the network fails midway.
        archive_file = open(archive_path, 'wb')
        try:
            archive_file.write(opener.read())
        finally:
            archive_file.close()

    logger.info("Decompressing %s", archive_path)

    archives = []
    source_zip = zipfile.ZipFile(archive_path, 'r')
    try:
        for name in source_zip.namelist():
            # Only the CSV dumps are of interest; skip any other entries.
            if '.csv' in name:
                source_zip.extract(name, target_dir)
                archives.append(name)
    finally:
        source_zip.close()
    os.remove(archive_path)

    return archives
|
|
||
|
|
||
def load_bookcrossings(data_home=None, download_if_missing=True,
                       implicit=False):
    """
    Load the Book-Crossing dataset.

    Parameters
    ----------
    data_home: optional, default: None
        Specify the storage folder for the datasets. If None,
        all files are stored in the '<module dir>/data' subfolder.
    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.
    implicit: optional, False by default
        If True, load the implicit ratings expressed by rating 0,
        otherwise load the explicit ratings expressed by rating 1-10.

    Returns
    -------
    A Bunch holding:
        data: dict mapping user_id -> {item_id: rating}
        item_ids: dict mapping ISBN -> book title
        user_ids: None (no demographic data is loaded)
        DESCR: the full text description of the dataset

    Examples
    --------
    >>> from os.path import join
    >>> from os.path import dirname
    >>> from scikits.crab.datasets.book_crossing import load_bookcrossings
    >>> data_home = join(dirname(__file__), 'scikits/crab/datasets/tests/data/')
    >>> books = load_bookcrossings(data_home)
    >>> len(books.data)
    21
    >>> len(books.item_ids)
    100
    """
    if data_home:
        if not os.path.exists(data_home):
            os.makedirs(data_home)
    else:
        data_home = join(dirname(__file__), 'data/')

    ratings_path = os.path.join(data_home, 'BX-Book-Ratings.csv')
    books_path = os.path.join(data_home, 'BX-Books.csv')

    # Both CSV files are required. (The previous check combined the two
    # tests with `and`, so a single missing file could go undetected.)
    if not os.path.exists(ratings_path) or not os.path.exists(books_path):
        if download_if_missing:
            logger.info('downloading the dataset...')
            try:
                download_book_crossings(data_home)
            except Exception:
                raise Exception('FAIL: Problems during the download.')
            logger.info('dataset downloaded.')
        else:
            raise IOError('Book-Crossing dataset not found')

    #TO FIX: it is not working for np.loadtxt
    #ratings_m = np.loadtxt(os.path.join(data_home, 'BX-Book-Ratings.csv'),
    #            delimiter=';', skiprows=1)

    ratings_file = open(ratings_path)
    try:
        ratings_m = csv.reader(ratings_file, delimiter=';')
        next(ratings_m)  # skip the header row
        data_books = {}
        if implicit:
            # Implicit feedback is encoded in the dump as a rating of "0".
            for user_id, item_id, rating in ratings_m:
                if rating == "0":
                    data_books.setdefault(user_id, {})[item_id] = True
        else:
            # Explicit ratings are in the 1-10 range; 0 means implicit
            # feedback and is skipped here. (The old code compared the
            # converted int against the string "0", which was always
            # true and let the implicit rows through.)
            for user_id, item_id, rating in ratings_m:
                rating = int(rating)
                if rating != 0:
                    data_books.setdefault(user_id, {})[item_id] = rating
    finally:
        ratings_file.close()

    # Read the book titles: ISBN and title are the first two columns.
    data_titles = np.loadtxt(books_path, delimiter=';',
                             usecols=(0, 1), dtype=str)
    data_titles = dict((item_id, label) for item_id, label in data_titles)

    descr_file = open(join(dirname(__file__), 'descr', 'book-crossing.rst'))
    try:
        fdescr = descr_file.read()
    finally:
        descr_file.close()

    return Bunch(data=data_books, item_ids=data_titles,
                 user_ids=None, DESCR=fdescr)
| @@ -0,0 +1,35 @@ | ||
| Jack Matthews;Lady in the Water;3.0 | ||
| Jack Matthews;Snakes on a Planet;4.0 | ||
| Jack Matthews;You, Me and Dupree;3.5 | ||
| Jack Matthews;Superman Returns;5.0 | ||
| Jack Matthews;The Night Listener;3.0 | ||
| Mick LaSalle;Lady in the Water;3.0 | ||
| Mick LaSalle;Snakes on a Planet;4.0 | ||
| Mick LaSalle;Just My Luck;2.0 | ||
| Mick LaSalle;Superman Returns;3.0 | ||
| Mick LaSalle;You, Me and Dupree;2.0 | ||
| Mick LaSalle;The Night Listener;3.0 | ||
| Claudia Puig;Snakes on a Planet;3.5 | ||
| Claudia Puig;Just My Luck;3.0 | ||
| Claudia Puig;You, Me and Dupree;2.5 | ||
| Claudia Puig;Superman Returns;4.0 | ||
| Claudia Puig;The Night Listener;4.5 | ||
| Lisa Rose;Lady in the Water;2.5 | ||
| Lisa Rose;Snakes on a Planet;3.5 | ||
| Lisa Rose;Just My Luck;3.0 | ||
| Lisa Rose;Superman Returns;3.5 | ||
| Lisa Rose;The Night Listener;3.0 | ||
| Lisa Rose;You, Me and Dupree;2.5 | ||
| Toby;Snakes on a Planet;4.5 | ||
| Toby;Superman Returns;4.0 | ||
| Toby;You, Me and Dupree;1.0 | ||
| Gene Seymour;Lady in the Water;3.0 | ||
| Gene Seymour;Snakes on a Planet;3.5 | ||
| Gene Seymour;Just My Luck;1.5 | ||
| Gene Seymour;Superman Returns;5.0 | ||
| Gene Seymour;You, Me and Dupree;3.5 | ||
| Gene Seymour;The Night Listener;3.0 | ||
| Michael Phillips;Lady in the Water;2.5 | ||
| Michael Phillips;Snakes on a Planet;3.0 | ||
| Michael Phillips;Superman Returns;3.5 | ||
| Michael Phillips;The Night Listener;4.0 |
| @@ -0,0 +1,49 @@ | ||
| Angelica,The Strokes,2.5 | ||
| Angelica,Blues Traveler,3.5 | ||
| Angelica,Phoenix,5 | ||
| Angelica,Broken Bells,2 | ||
| Angelica,Norah Jones,4.5 | ||
| Angelica,Slightly Stoopid,1.5 | ||
| Angelica,Vampire Weekend,2 | ||
| Veronica,The Strokes,3 | ||
| Veronica,Slightly Stoopid,2.5 | ||
| Veronica,Norah Jones,5 | ||
| Veronica,Phoenix,4 | ||
| Veronica,Blues Traveler,3 | ||
| Sam,The Strokes,5 | ||
| Sam,Blues Traveler,5 | ||
| Sam,Phoenix,5 | ||
| Sam,Broken Bells,2 | ||
| Sam,Norah Jones,3 | ||
| Sam,Slightly Stoopid,4 | ||
| Jordyn,The Strokes,4 | ||
| Jordyn,Phoenix,5 | ||
| Jordyn,Broken Bells,4.5 | ||
| Jordyn,Deadmau5,4 | ||
| Jordyn,Norah Jones,5 | ||
| Jordyn,Slightly Stoopid,4.5 | ||
| Jordyn,Vampire Weekend,4 | ||
| Dan,The Strokes,4 | ||
| Dan,Blues Traveler,3 | ||
| Dan,Phoenix,3 | ||
| Dan,Broken Bells,4 | ||
| Dan,Deadmau5,4.5 | ||
| Dan,Slightly Stoopid,4.5 | ||
| Dan,Vampire Weekend,2 | ||
| Bill,Blues Traveler,2 | ||
| Bill,Phoenix,2 | ||
| Bill,Broken Bells,3.5 | ||
| Bill,Deadmau5,4 | ||
| Bill,Slightly Stoopid,3.5 | ||
| Bill,Vampire Weekend,3 | ||
| Chan,Blues Traveler,5 | ||
| Chan,Phoenix,5 | ||
| Chan,Broken Bells,1 | ||
| Chan,Deadmau5,1 | ||
| Chan,Norah Jones,3 | ||
| Chan,Slightly Stoopid,1 | ||
| Hailey,Broken Bells,4 | ||
| Hailey,Deadmau5,1 | ||
| Hailey,Norah Jones,4 | ||
| Hailey,Vampire Weekend,1 | ||
| Hailey,The Strokes,4 |
| @@ -0,0 +1,27 @@ | ||
| Book-Crossing Dataset ... mined by Cai-Nicolas Ziegler, DBIS Freiburg | ||
| http://www.informatik.uni-freiburg.de/~cziegler/BX/ | ||
|
|
||
| Notes | ||
| ----- | ||
| Collected by Cai-Nicolas Ziegler in a 4-week crawl (August / September 2004) from the Book-Crossing community with kind permission from Ron Hornbaker, CTO of Humankind Systems. Contains 278,858 users (anonymized but with demographic information) providing 1,149,780 ratings (explicit / implicit) about 271,379 books. | ||
|
|
||
| [Freely available for research use when acknowledged with the following reference (further details on the dataset are given in this publication): | ||
| Improving Recommendation Lists Through Topic Diversification, | ||
| Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen; Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan. To appear. | ||
|
|
||
|
|
||
| This data set consists of | ||
| * 1,149,780 ratings (explicit / implicit) with (0-10) from 278,858 users to 271,379 books. | ||
|
|
||
| Format | ||
| ------ | ||
|
|
||
| The Book-Crossing dataset comprises 3 tables. | ||
| *BX-Users | ||
| Contains the users. Note that user IDs (`User-ID`) have been anonymized and map to integers. Demographic data is provided (`Location`, `Age`) if available. Otherwise, these fields contain NULL-values. | ||
| *BX-Books | ||
| Books are identified by their respective ISBN. Invalid ISBNs have already been removed from the dataset. Moreover, some content-based information is given (`Book-Title`, `Book-Author`, `Year-Of-Publication`, `Publisher`), obtained from Amazon Web Services. Note that in case of several authors, only the first is provided. URLs linking to cover images are also given, appearing in three different flavours (`Image-URL-S`, `Image-URL-M`, `Image-URL-L`), i.e., small, medium, large. These URLs point to the Amazon web site. | ||
| *BX-Book-Ratings | ||
| Contains the book rating information. Ratings (`Book-Rating`) are either explicit, expressed on a scale from 1-10 (higher values denoting higher appreciation), or implicit, expressed by 0. |
| @@ -0,0 +1,145 @@ | ||
| SUMMARY & USAGE LICENSE | ||
| ============================================= | ||
|
|
||
| MovieLens data sets were collected by the GroupLens Research Project | ||
| at the University of Minnesota. | ||
|
|
||
| This data set consists of: | ||
| * 100,000 ratings (1-5) from 943 users on 1682 movies. | ||
| * Each user has rated at least 20 movies. | ||
| * Simple demographic info for the users (age, gender, occupation, zip) | ||
|
|
||
| The data was collected through the MovieLens web site | ||
| (movielens.umn.edu) during the seven-month period from September 19th, | ||
| 1997 through April 22nd, 1998. This data has been cleaned up - users | ||
| who had less than 20 ratings or did not have complete demographic | ||
| information were removed from this data set. Detailed descriptions of | ||
| the data file can be found at the end of this file. | ||
|
|
||
| Neither the University of Minnesota nor any of the researchers | ||
| involved can guarantee the correctness of the data, its suitability | ||
| for any particular purpose, or the validity of results based on the | ||
| use of the data set. The data set may be used for any research | ||
| purposes under the following conditions: | ||
|
|
||
| * The user may not state or imply any endorsement from the | ||
| University of Minnesota or the GroupLens Research Group. | ||
|
|
||
| * The user must acknowledge the use of the data set in | ||
| publications resulting from the use of the data set, and must | ||
| send us an electronic or paper copy of those publications. | ||
|
|
||
| * The user may not redistribute the data without separate | ||
| permission. | ||
|
|
||
| * The user may not use this information for any commercial or | ||
| revenue-bearing purposes without first obtaining permission | ||
| from a faculty member of the GroupLens Research Project at the | ||
| University of Minnesota. | ||
|
|
||
| If you have any further questions or comments, please contact Jon Herlocker | ||
| <herlocke@cs.umn.edu>. | ||
|
|
||
| ACKNOWLEDGEMENTS | ||
| ============================================== | ||
|
|
||
| Thanks to Al Borchers for cleaning up this data and writing the | ||
| accompanying scripts. | ||
|
|
||
| PUBLISHED WORK THAT HAS USED THIS DATASET | ||
| ============================================== | ||
|
|
||
| Herlocker, J., Konstan, J., Borchers, A., Riedl, J.. An Algorithmic | ||
| Framework for Performing Collaborative Filtering. Proceedings of the | ||
| 1999 Conference on Research and Development in Information | ||
| Retrieval. Aug. 1999. | ||
|
|
||
| FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT | ||
| ============================================== | ||
|
|
||
| The GroupLens Research Project is a research group in the Department | ||
| of Computer Science and Engineering at the University of Minnesota. | ||
| Members of the GroupLens Research Project are involved in many | ||
| research projects related to the fields of information filtering, | ||
| collaborative filtering, and recommender systems. The project is led | ||
| by professors John Riedl and Joseph Konstan. The project began to | ||
| explore automated collaborative filtering in 1992, but is most well | ||
| known for its world wide trial of an automated collaborative filtering | ||
| system for Usenet news in 1996. The technology developed in the | ||
| Usenet trial formed the base for the formation of Net Perceptions, | ||
| Inc., which was founded by members of GroupLens Research. Since then | ||
| the project has expanded its scope to research overall information | ||
| filtering solutions, integrating in content-based methods as well as | ||
| improving current collaborative filtering technology. | ||
|
|
||
| Further information on the GroupLens Research project, including | ||
| research publications, can be found at the following web site: | ||
|
|
||
| http://www.grouplens.org/ | ||
|
|
||
| GroupLens Research currently operates a movie recommender based on | ||
| collaborative filtering: | ||
|
|
||
| http://www.movielens.org/ | ||
|
|
||
| DETAILED DESCRIPTIONS OF DATA FILES | ||
| ============================================== | ||
|
|
||
| Here are brief descriptions of the data. | ||
|
|
||
| ml-data.tar.gz -- Compressed tar file. To rebuild the u data files do this: | ||
| gunzip ml-data.tar.gz | ||
| tar xvf ml-data.tar | ||
| mku.sh | ||
|
|
||
| u.data -- The full u data set, 100000 ratings by 943 users on 1682 items. | ||
| Each user has rated at least 20 movies. Users and items are | ||
| numbered consecutively from 1. The data is randomly | ||
| ordered. This is a tab separated list of | ||
| user id | item id | rating | timestamp. | ||
| The time stamps are unix seconds since 1/1/1970 UTC | ||
|
|
||
| u.info -- The number of users, items, and ratings in the u data set. | ||
|
|
||
| u.item -- Information about the items (movies); this is a tab separated | ||
| list of | ||
| movie id | movie title | release date | video release date | | ||
| IMDb URL | unknown | Action | Adventure | Animation | | ||
| Children's | Comedy | Crime | Documentary | Drama | Fantasy | | ||
| Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | | ||
| Thriller | War | Western | | ||
| The last 19 fields are the genres, a 1 indicates the movie | ||
| is of that genre, a 0 indicates it is not; movies can be in | ||
| several genres at once. | ||
| The movie ids are the ones used in the u.data data set. | ||
|
|
||
| u.genre -- A list of the genres. | ||
|
|
||
| u.user -- Demographic information about the users; this is a tab | ||
| separated list of | ||
| user id | age | gender | occupation | zip code | ||
| The user ids are the ones used in the u.data data set. | ||
|
|
||
| u.occupation -- A list of the occupations. | ||
|
|
||
| u1.base -- The data sets u1.base and u1.test through u5.base and u5.test | ||
| u1.test are 80%/20% splits of the u data into training and test data. | ||
| u2.base Each of u1, ..., u5 have disjoint test sets; this is for | ||
| u2.test 5 fold cross validation (where you repeat your experiment | ||
| u3.base with each training and test set and average the results). | ||
| u3.test These data sets can be generated from u.data by mku.sh. | ||
| u4.base | ||
| u4.test | ||
| u5.base | ||
| u5.test | ||
|
|
||
| ua.base -- The data sets ua.base, ua.test, ub.base, and ub.test | ||
| ua.test split the u data into a training set and a test set with | ||
| ub.base exactly 10 ratings per user in the test set. The sets | ||
| ub.test ua.test and ub.test are disjoint. These data sets can | ||
| be generated from u.data by mku.sh. | ||
|
|
||
| allbut.pl -- The script that generates training and test sets where | ||
| all but n of a users ratings are in the training data. | ||
|
|
||
| mku.sh -- A shell script to generate all the u data sets from u.data. |
| @@ -0,0 +1,7 @@ | ||
| sample_movies data set was taken from the book | ||
| Programming Collective Intelligence by Toby Segaran | ||
|
|
||
| Notes | ||
| ----- | ||
| This data set consists of | ||
| * n ratings with (1-5) from n users to n movies. |
| @@ -0,0 +1,7 @@ | ||
| Sample_song data set was taken from the e-book | ||
| A Programmer's Guide to Data Mining - http://guidetodatamining.com | ||
|
|
||
| Notes | ||
| ----- | ||
| This data set consists of | ||
| * 49 ratings with (1-5) from 8 users to 8 movies. |
| @@ -0,0 +1,100 @@ | ||
| "User-ID";"ISBN";"Book-Rating" | ||
| "276725";"034545104X";"0" | ||
| "276726";"0155061224";"5" | ||
| "276727";"0446520802";"0" | ||
| "276729";"052165615X";"3" | ||
| "276729";"0521795028";"6" | ||
| "276733";"2080674722";"0" | ||
| "276736";"3257224281";"8" | ||
| "276737";"0600570967";"6" | ||
| "276744";"038550120X";"7" | ||
| "276745";"342310538";"10" | ||
| "276746";"0425115801";"0" | ||
| "276746";"0449006522";"0" | ||
| "276746";"0553561618";"0" | ||
| "276746";"055356451X";"0" | ||
| "276746";"0786013990";"0" | ||
| "276746";"0786014512";"0" | ||
| "276747";"0060517794";"9" | ||
| "276747";"0451192001";"0" | ||
| "276747";"0609801279";"0" | ||
| "276747";"0671537458";"9" | ||
| "276747";"0679776818";"8" | ||
| "276747";"0943066433";"7" | ||
| "276747";"1570231028";"0" | ||
| "276747";"1885408226";"7" | ||
| "276748";"0747558167";"6" | ||
| "276748";"3442437407";"0" | ||
| "276751";"033390804X";"0" | ||
| "276751";"3596218098";"8" | ||
| "276754";"0684867621";"8" | ||
| "276755";"0451166892";"5" | ||
| "276760";"8440682697";"10" | ||
| "276762";"034544003X";"0" | ||
| "276762";"0380000059";"0" | ||
| "276762";"0380711524";"5" | ||
| "276762";"0451167317";"0" | ||
| "276762";"0451454952";"0" | ||
| "276762";"0843920262";"0" | ||
| "276762";"3404122879";"0" | ||
| "276762";"3404182928";"0" | ||
| "276762";"3404611306";"0" | ||
| "276762";"342662429";"0" | ||
| "276762";"3426690179";"0" | ||
| "276762";"3442424216";"0" | ||
| "276762";"3442425573";"0" | ||
| "276762";"3453092007";"8" | ||
| "276762";"3453157745";"0" | ||
| "276762";"3453176944";"0" | ||
| "276762";"3453185137";"0" | ||
| "276762";"3453185323";"0" | ||
| "276762";"3453213025";"3" | ||
| "276762";"3453877241";"0" | ||
| "276762";"3492226604";"0" | ||
| "276762";"3517017442";"0" | ||
| "276762";"3596125006";"0" | ||
| "276762";"B0000BLD7X";"0" | ||
| "276762";"N3453124715";"4" | ||
| "276765";"9029716894";"0" | ||
| "276768";"9057868059";"4" | ||
| "276772";"0140279091";"0" | ||
| "276772";"0553572369";"7" | ||
| "276772";"0571058086";"0" | ||
| "276772";"3499230933";"10" | ||
| "276772";"3596151465";"10" | ||
| "276774";"0099543818";"0" | ||
| "276774";"3404147723";"0" | ||
| "276774";"3423111321";"0" | ||
| "276774";"3442136644";"9" | ||
| "276774";"3492232000";"0" | ||
| "276780";"8434811634";"0" | ||
| "276780";"8484330478";"7" | ||
| "276780";"8484332039";"7" | ||
| "276786";"2864322102";"6" | ||
| "276786";"8402065945";"0" | ||
| "276786";"8423314901";"0" | ||
| "276786";"842333533X";"0" | ||
| "276786";"8427911769";"0" | ||
| "276786";"8433914456";"0" | ||
| "276786";"8437606322";"8" | ||
| "276786";"8445072919";"0" | ||
| "276786";"8466300821";"6" | ||
| "276786";"847765011X";"0" | ||
| "276786";"8478442588";"6" | ||
| "276786";"8495368099";"0" | ||
| "276788";"0345443683";"8" | ||
| "276788";"043935806X";"7" | ||
| "276788";"055310666X";"10" | ||
| "276796";"0330332775";"5" | ||
| "276796";"0330367358";"0" | ||
| "276798";"0006379702";"5" | ||
| "276798";"3423084049";"0" | ||
| "276798";"3442131340";"7" | ||
| "276798";"3442437407";"0" | ||
| "276798";"3446202102";"0" | ||
| "276798";"3453073398";"0" | ||
| "276798";"3453115783";"0" | ||
| "276798";"3499134004";"0" | ||
| "276798";"349915398X";"0" | ||
| "276798";"3548603203";"6" | ||
| "276798";"3764501383";"0" |
| @@ -0,0 +1,65 @@ | ||
| from os.path import join | ||
| from os.path import dirname | ||
| from nose.tools import assert_equal, assert_raises | ||
| from ..base import load_movielens_r100k, load_sample_songs, load_sample_movies | ||
| from ..book_crossing import load_bookcrossings | ||
| from ...utils.testing import assert_in | ||
|
|
||
|
|
||
def test_movielens_r100k():
    """Smoke-test the MovieLens 100k loader with and without timestamps."""
    expected_fields = ['data', 'item_ids', 'user_ids', 'DESCR']

    # Default load: plain ratings, no timestamps.
    movies = load_movielens_r100k()
    assert_in(movies, in_=expected_fields)
    assert_equal(len(movies.data), 943)
    assert_equal(len(movies.item_ids), 1682)
    n_ratings = sum(len(items) for items in movies.data.itervalues())
    assert_equal(n_ratings, 100000)

    # Same dataset with timestamps attached to each rating.
    movies = load_movielens_r100k(True)
    assert_in(movies, in_=expected_fields)
    assert_equal(len(movies.data), 943)
    assert_equal(len(movies.item_ids), 1682)
    assert_equal(movies.data[1][1], (874965758, 5))
    n_ratings = sum(len(items) for items in movies.data.itervalues())
    assert_equal(n_ratings, 100000)
|
|
||
|
|
||
def test_sample_songs():
    """Smoke-test the sample songs loader."""
    songs = load_sample_songs()
    assert_in(songs, in_=['data', 'item_ids', 'user_ids', 'DESCR'])
    assert_equal(len(songs.data), 8)
    assert_equal(len(songs.item_ids), 8)
    n_ratings = sum(len(items) for items in songs.data.itervalues())
    assert_equal(n_ratings, 49)
|
|
||
|
|
||
def test_sample_movies():
    """Smoke-test the sample movies loader."""
    movies = load_sample_movies()
    assert_in(movies, in_=['data', 'item_ids', 'user_ids', 'DESCR'])
    assert_equal(len(movies.data), 7)
    assert_equal(len(movies.item_ids), 6)
    n_ratings = sum(len(items) for items in movies.data.itervalues())
    assert_equal(n_ratings, 35)
|
|
||
|
|
||
def test_load_bookcrossings():
    """Check the Book-Crossing loader against the sample data in tests/data.

    The sample ratings file has 99 rows: 60 implicit (rating 0) rows from
    15 distinct users and 39 explicit (1-10) rows from 21 distinct users.
    The old expected values (26 users / 99 ratings for the explicit load)
    pinned a loader bug that let the implicit rows through.
    """
    data_home = join(dirname(__file__), 'data/')

    # Explicit load: only the 1-10 ratings.
    books = load_bookcrossings(data_home)
    assert_in(books, in_=['data', 'item_ids', 'user_ids', 'DESCR'])
    assert_equal(len(books.data), 21)
    assert_equal(len(books.item_ids), 100)
    assert_equal(sum(len(items) for items in
        books.data.itervalues()), 39)

    # Implicit load: only the zero ratings.
    books = load_bookcrossings(data_home, implicit=True)
    assert_equal(len(books.data), 15)
    assert_equal(len(books.item_ids), 100)
    assert_equal(sum(len(items) for items in
        books.data.itervalues()), 60)

    # Missing data with download denied must raise IOError.
    data_home = dirname(__file__)
    assert_raises(IOError, load_bookcrossings, data_home, implicit=False,
                  download_if_missing=False)
| @@ -1,2 +1,2 @@ | ||
| from .classes import MatrixPreferenceDataModel, \ | ||
| MatrixBooleanPrefDataModel |
| @@ -0,0 +1,15 @@ | ||
| """Testing utilities.""" | ||
|
|
||
| # Copyright (c) 2011 Marcel Caraciolo <marcel@muricoca.com> | ||
| # License: Simplified BSD | ||
|
|
||
|
|
||
def assert_in(obj, in_=None, out_=None):
    """Assert that every name in `in_` is present in `obj` and that
    no name in `out_` is."""
    for name in (in_ or ()):
        assert name in obj
    for name in (out_ or ()):
        assert name not in obj