In [3]:
# Importing libraries
import pandas as pd
import numpy as np

In [4]:
# Listing columns to select
ratings_cols = ['user_id', 'movie_id', 'rating']

# Loading rating data and filtering by selected columns 
ratings_df = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, usecols=(range(3)))

# Listing columns to select
movies_cols = ['movie_id', 'title']

# Loading movie data and filtering by selected columns 
movies_df = pd.read_csv('ml-100k/u.item', sep="|", names=movies_cols, usecols=(range(2)), encoding='latin-1')

# Merging rating and movie dataframes
ratings_df = pd.merge(movies_df, ratings_df)

In [5]:
# Showing head of dataframe
ratings_df.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [6]:
# Creating a pivot table with movie ratings
movie_ratings = ratings_df.pivot_table(index=['user_id'], columns=['title'],
                                      values='rating')

In [7]:
# Showing haed of table
movie_ratings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Selecting ratings for the 'Star Wars' Movie
star_wars_ratings = movie_ratings['Star Wars (1977)']
star_wars_ratings.head()

user_id
0    5.0
1    5.0
2    5.0
3    NaN
4    5.0
Name: Star Wars (1977), dtype: float64

In [9]:
# Creating a correlation table to find similar movie ratings to star wars
similar_movies = movie_ratings.corrwith(star_wars_ratings)

  c *= 1. / np.float64(fact)


In [10]:
# Drop NA values
similar_movies.dropna(inplace=True)

In [11]:
# Creating a dataframe from result
df = pd.DataFrame(similar_movies)

In [13]:
# Sorting values by the correlation column
df.sort_values(by=0, axis=0, ascending=False).head(10)

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Commandments (1997),1.0
Cosi (1996),1.0
No Escape (1994),1.0
Stripes (1981),1.0
Man of the Year (1995),1.0
Hollow Reed (1996),1.0
"Beans of Egypt, Maine, The (1994)",1.0
"Good Man in Africa, A (1994)",1.0
"Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)",1.0
"Outlaw, The (1943)",1.0


In [14]:
# Grouping movies by title and calculating size (how many times) and mean for ratings
movie_stats = ratings_df.groupby('title').agg({'rating': [np.size, np.mean]})

In [15]:
# Showing head of dataframe
movie_stats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


In [16]:
# Filtering movies that have at least 100 ratings
popular_movies = movie_stats['rating']['size'] >= 100
#popular_movies

In [17]:
# Apply filtering
resulted_df = movie_stats[popular_movies]
resulted_df.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
2001: A Space Odyssey (1968),259,3.969112
Absolute Power (1997),127,3.370079
"Abyss, The (1989)",151,3.589404


In [18]:
# Sorting results by mean rating
resulted_df.sort_values(by=[('rating', 'mean')], ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [19]:
# Showing to 15 results
resulted_df.head(15)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.45679
"Shawshank Redemption, The (1994)",283,4.44523
Rear Window (1954),209,4.38756
"Usual Suspects, The (1995)",267,4.385768
Star Wars (1977),584,4.359589
12 Angry Men (1957),125,4.344
Citizen Kane (1941),198,4.292929


In [20]:
# Creating a new dataframe joining movie_stats and similar_movies dataframes
df = movie_stats[popular_movies].join(pd.DataFrame(similar_movies, columns=['similarity']))



In [21]:
# Showing the head of dataframe
df.head()

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101 Dalmatians (1996),109,2.908257,0.211132
12 Angry Men (1957),125,4.344,0.184289
2001: A Space Odyssey (1968),259,3.969112,0.230884
Absolute Power (1997),127,3.370079,0.08544
"Abyss, The (1989)",151,3.589404,0.203709


In [22]:
# Sorting movies by similarity; of couse 'Star Wars' is the most similar to itself, but the other movies in this list can be considered similar movies
df.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Star Wars (1977),584,4.359589,1.0
"Empire Strikes Back, The (1980)",368,4.206522,0.748353
Return of the Jedi (1983),507,4.00789,0.672556
Raiders of the Lost Ark (1981),420,4.252381,0.536117
Austin Powers: International Man of Mystery (1997),130,3.246154,0.377433
"Sting, The (1973)",241,4.058091,0.367538
Indiana Jones and the Last Crusade (1989),331,3.930514,0.350107
Pinocchio (1940),101,3.673267,0.347868
"Frighteners, The (1996)",115,3.234783,0.332729
L.A. Confidential (1997),297,4.161616,0.319065
