### Item-based Collabrative Fitering

In [1]:
import pandas as pd

In [2]:
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('ml-100k/ml-100k/u.data', sep='\t', names=r_cols, usecols=range(3), encoding="ISO-8859-1")

m_cols = ['movie_id', 'title']
movies = pd.read_csv('ml-100k/ml-100k/u.item', sep='|', names=m_cols, usecols=range(2), encoding="ISO-8859-1")

ratings = pd.merge(movies, ratings)

ratings.head()


Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [4]:
#### pivot the table to construct a nice matrix of users and the movies they rated. NaN indicates missing data, or movies that a given user did not watch.


In [5]:
userRatings = ratings.pivot_table(index=['user_id'],columns=['title'],values='rating')

In [6]:
userRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


#### pandas has a built-in corr() method that will compute a correlation score for every column pair in the matrix! This gives us a correlation score between every pair of movies (where at least one user rated both movies - otherwise NaN's will show up.


In [7]:
corrMatrix = userRatings.corr()


In [8]:
corrMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,,-1.0,-0.5,-0.5,0.522233,,-0.426401,,,...,,,,,,,,,,
1-900 (1994),,1.0,,,,,,-0.981981,,,...,,,,-0.944911,,,,,,
101 Dalmatians (1996),-1.0,,1.0,-0.04989,0.269191,0.048973,0.266928,-0.043407,,0.111111,...,,-1.0,,0.15884,0.119234,0.680414,0.0,0.707107,,
12 Angry Men (1957),-0.5,,-0.04989,1.0,0.666667,0.256625,0.274772,0.178848,,0.457176,...,,,,0.096546,0.068944,-0.361961,0.144338,1.0,1.0,
187 (1997),-0.5,,0.269191,0.666667,1.0,0.596644,,-0.5547,,1.0,...,,0.866025,,0.455233,-0.5,0.5,0.475327,,,


####  movies that lots of people rated together - and also give us more popular results that are more easily recongnizable - we'll use the min_periods argument to throw out results where fewer than 100 users rated a given movie pair:


In [9]:
corrMatrix = userRatings.corr(method='pearson', min_periods=100)


In [10]:
corrMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,,1.0,,,,,,,,...,,,,,,,,,,
12 Angry Men (1957),,,,1.0,,,,,,,...,,,,,,,,,,
187 (1997),,,,,,,,,,,...,,,,,,,,,,


In [16]:
myRatings = userRatings.iloc[0].dropna()


In [77]:
myRatings

title
101 Dalmatians (1996)                                      2.0
12 Angry Men (1957)                                        5.0
20,000 Leagues Under the Sea (1954)                        3.0
2001: A Space Odyssey (1968)                               4.0
Abyss, The (1989)                                          3.0
Ace Ventura: Pet Detective (1994)                          3.0
Air Bud (1997)                                             1.0
Akira (1988)                                               4.0
Aladdin (1992)                                             4.0
Alien (1979)                                               5.0
Aliens (1986)                                              5.0
All Dogs Go to Heaven 2 (1996)                             1.0
Amadeus (1984)                                             5.0
Angels and Insects (1995)                                  4.0
Antonia's Line (1995)                                      5.0
Apocalypse Now (1979)                            

In [78]:
simCandidates = pd.Series()
for i in range(0, len(myRatings.index)):
    print ("Adding sims for " + myRatings.index[i] + "...")
    # Retrieve similar movies to this one that I rated
    sims = corrMatrix[myRatings.index[i]].dropna()
    # Now scale its similarity by how well I rated this movie
    sims = sims.map(lambda x: x * myRatings[i])
    # Add the score to the list of similarity candidates
    simCandidates = simCandidates.append(sims)
    
#Glance at our results so far:
print ("sorting...")
simCandidates.sort_values(inplace = True, ascending = False)
print (simCandidates.head(10))


Adding sims for 101 Dalmatians (1996)...
Adding sims for 12 Angry Men (1957)...
Adding sims for 20,000 Leagues Under the Sea (1954)...
Adding sims for 2001: A Space Odyssey (1968)...
Adding sims for Abyss, The (1989)...
Adding sims for Ace Ventura: Pet Detective (1994)...
Adding sims for Air Bud (1997)...
Adding sims for Akira (1988)...
Adding sims for Aladdin (1992)...
Adding sims for Alien (1979)...
Adding sims for Aliens (1986)...
Adding sims for All Dogs Go to Heaven 2 (1996)...
Adding sims for Amadeus (1984)...
Adding sims for Angels and Insects (1995)...
Adding sims for Antonia's Line (1995)...
Adding sims for Apocalypse Now (1979)...
Adding sims for Apollo 13 (1995)...
Adding sims for Aristocats, The (1970)...
Adding sims for Army of Darkness (1993)...
Adding sims for Austin Powers: International Man of Mystery (1997)...
Adding sims for Babe (1995)...
Adding sims for Back to the Future (1985)...
Adding sims for Bad Boys (1995)...
Adding sims for Basic Instinct (1992)...
Adding s

Adding sims for Stargate (1994)...
Adding sims for Starship Troopers (1997)...
Adding sims for Steel (1997)...
Adding sims for Sting, The (1973)...
Adding sims for Strange Days (1995)...
Adding sims for Striptease (1996)...
Adding sims for Supercop (1992)...
Adding sims for Swingers (1996)...
Adding sims for Taxi Driver (1976)...
Adding sims for Terminator 2: Judgment Day (1991)...
Adding sims for Terminator, The (1984)...
Adding sims for Theodore Rex (1995)...
Adding sims for This Is Spinal Tap (1984)...
Adding sims for Three Colors: Blue (1993)...
Adding sims for Three Colors: Red (1994)...
Adding sims for Three Colors: White (1994)...
Adding sims for To Wong Foo, Thanks for Everything! Julie Newmar (1995)...
Adding sims for Top Gun (1986)...
Adding sims for Toy Story (1995)...
Adding sims for True Romance (1993)...
Adding sims for Truth About Cats & Dogs, The (1996)...
Adding sims for Turbo: A Power Rangers Movie (1997)...
Adding sims for Twelve Monkeys (1995)...
Adding sims for Twi

In [79]:
simCandidates = simCandidates.groupby(simCandidates.index).sum()


In [80]:
simCandidates.sort_values(inplace = True, ascending = False)
simCandidates.head(10)


Raiders of the Lost Ark (1981)               115.630218
Back to the Future (1985)                    113.927926
Indiana Jones and the Last Crusade (1989)    100.752361
Fugitive, The (1993)                          96.951700
Return of the Jedi (1983)                     96.098239
Star Wars (1977)                              95.885584
Empire Strikes Back, The (1980)               93.639628
Toy Story (1995)                              91.288603
Terminator 2: Judgment Day (1991)             90.631513
Independence Day (ID4) (1996)                 87.497565
dtype: float64

In [81]:
###filteredSims = simCandidates.drop(myRatings.index,index=1)

In [82]:
myRatings.head()

title
101 Dalmatians (1996)                  2.0
12 Angry Men (1957)                    5.0
20,000 Leagues Under the Sea (1954)    3.0
2001: A Space Odyssey (1968)           4.0
Abyss, The (1989)                      3.0
Name: 1, dtype: float64

In [83]:
simCandidates.head()

Raiders of the Lost Ark (1981)               115.630218
Back to the Future (1985)                    113.927926
Indiana Jones and the Last Crusade (1989)    100.752361
Fugitive, The (1993)                          96.951700
Return of the Jedi (1983)                     96.098239
dtype: float64

In [88]:
###simCandidates.drop(myRatings.index,axis=0)
##myRatings.index
simCandidates.drop()

Raiders of the Lost Ark (1981)               115.630218
Back to the Future (1985)                    113.927926
Indiana Jones and the Last Crusade (1989)    100.752361
Fugitive, The (1993)                          96.951700
Return of the Jedi (1983)                     96.098239
Star Wars (1977)                              95.885584
Empire Strikes Back, The (1980)               93.639628
Toy Story (1995)                              91.288603
Terminator 2: Judgment Day (1991)             90.631513
Independence Day (ID4) (1996)                 87.497565
Braveheart (1995)                             85.855090
Jurassic Park (1993)                          85.551288
Terminator, The (1984)                        84.790625
Aliens (1986)                                 83.612414
E.T. the Extra-Terrestrial (1982)             81.664953
Speed (1994)                                  79.265119
Fargo (1996)                                  78.296356
Top Gun (1986)                                77