# 協同過濾(Collaborative Filtering)

## download data
[ml-100k.zip](http://files.grouplens.org/datasets/movielens/ml-100k.zip)

In [1]:
import pandas as pd

In [15]:
# Read the input training data
input_data_file_movie = "./ml-100k/u.item"
input_data_file_rating = "./ml-100k/u.data"

movie = pd.read_csv(input_data_file_movie, sep='|', encoding='ISO-8859-1', names=['movie_id', 'movie_title'], usecols = [0,1,])
rating = pd.read_csv(input_data_file_rating, sep='\t', encoding='ISO-8859-1', names=["user_id","movie_id","rating"], usecols = [0,1,2])
print(movie.head())
print(rating.head())


   movie_id        movie_title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)
   user_id  movie_id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1


In [16]:
# then merge movie and rating data
data = pd.merge(movie,rating)
data.head()

Unnamed: 0,movie_id,movie_title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


## USER-ITEM Matrix

In [17]:
# lets make a pivot table in order to make rows are users and columns are movies. And values are rating
pivot_table = data.pivot_table(index = ["user_id"],columns = ["movie_title"],values = "rating")
pivot_table.head(10)

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,
6,,,,4.0,,,,5.0,,,...,,,,4.0,,,,,,
7,,,,4.0,,,5.0,5.0,,4.0,...,,,,5.0,3.0,,3.0,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,4.0,...,,,,,,,,,,
10,,,,5.0,,,,5.0,,4.0,...,,,,,,,,,,


## ITEM-ITEM 協同過濾相似性(Similarity)計算

In [18]:
movie_watched = pivot_table["Bad Boys (1995)"]
similarity_with_other_movies = pivot_table.corrwith(movie_watched)  # find correlation between "Bad Boys (1995)" and other movies
similarity_with_other_movies = similarity_with_other_movies.sort_values(ascending=False)
similarity_with_other_movies.head()

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


movie_title
Enchanted April (1991)                             1.0
Homeward Bound II: Lost in San Francisco (1996)    1.0
Race the Sun (1996)                                1.0
Ready to Wear (Pret-A-Porter) (1994)               1.0
Great Dictator, The (1940)                         1.0
dtype: float64

## USER-USER 協同過濾相似性(Similarity)計算

In [19]:
# lets make a pivot table in order to make rows are users and columns are movies. And values are rating
pivot_table = data.pivot_table(index =["movie_title"],columns =  ["user_id"],values = "rating")
pivot_table.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),2.0,,,,2.0,,,,,,...,2.0,,,2.0,4.0,,,,,
12 Angry Men (1957),5.0,,,,,4.0,4.0,,,5.0,...,,,,,,,,,,
187 (1997),,,2.0,,,,,,,,...,,,,,,,,,,
2 Days in the Valley (1996),,,,,,,,,,,...,,,4.0,,,,,,,2.0
"20,000 Leagues Under the Sea (1954)",3.0,,,,,,5.0,,,,...,,,,,,,,,,
2001: A Space Odyssey (1968),4.0,,,,4.0,5.0,5.0,,,5.0,...,4.0,,,,,,,,3.0,
3 Ninjas: High Noon At Mega Mountain (1998),,1.0,,,,,,,,,...,,,,,,,,,,
"39 Steps, The (1935)",,,,,,,4.0,,4.0,4.0,...,,,,,,,,,3.0,


In [22]:
target_user = pivot_table[10]
similarity_with_other_movies = pivot_table.corrwith(target_user)  # find correlation between "Bad Boys (1995)" and other movies
similarity_with_other_movies = similarity_with_other_movies.sort_values(ascending=False)
similarity_with_other_movies.head()

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


user_id
400    1.0
636    1.0
772    1.0
477    1.0
10     1.0
dtype: float64