In [None]:
#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# data: 1. hard-coded sample ratings, keyed by critic name and then by movie title
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [None]:
# Look up the score Lisa Rose gave to the movie 'Lady in the Water'
# (outer key: critic name, inner key: movie title).
critics['Lisa Rose']['Lady in the Water']

In [None]:
# All ratings provided by Toby, as a dict mapping movie title to score.
critics['Toby']

In [None]:
# return a distance-based similarity score for person 1 and person 2 
# based on Euclidean distance
def sum_distance(prefs, person1, person2):
    """Return a distance-based similarity score for two people.

    The score is ``1 / (1 + d)``, where ``d`` is the sum of the squared
    differences between the two people's ratings over the items they both
    rated. Identical ratings give 1; the score shrinks as ratings diverge.

    Parameters:
        prefs: mapping of person -> mapping of item -> rating.
        person1, person2: keys of ``prefs`` to compare.

    Returns:
        The similarity score, or 0 when the two people share no rated item.
    """
    first_ratings = prefs[person1]
    # items rated by both people, in the order person1 rated them
    shared = [title for title in first_ratings if title in prefs[person2]]
    if not shared:
        return 0

    squared_gap = 0
    for title in shared:
        squared_gap += (first_ratings[title] - prefs[person2][title]) ** 2
    return 1 / (1 + squared_gap)
            
            
# Example: distance-based similarity between two critics from the sample data.
sum_distance(critics, 'Lisa Rose', 'Gene Seymour')


In [None]:
# pearson correlation
# pearsonr is imported from the public scipy.stats namespace;
# scipy.stats.stats is a deprecated private module.
from scipy.stats import pearsonr
def sim_pearson_fancy(prefs,p1,p2):
    """Return the Pearson correlation between two people's ratings.

    Only items rated by both people are used.

    Parameters:
        prefs: mapping of person -> mapping of item -> rating.
        p1, p2: keys of ``prefs`` to compare.

    Returns:
        The correlation coefficient, or 0 when it is undefined: fewer than
        two shared items, or one person gave every shared item the same
        rating (zero variance).
    """
    shared = [item for item in prefs[p1] if item in prefs[p2]]

    # pearsonr raises ValueError for fewer than two paired observations
    if len(shared) < 2:
        return 0

    ratings_1 = [prefs[p1][item] for item in shared]
    ratings_2 = [prefs[p2][item] for item in shared]

    # constant input has zero variance, for which pearsonr yields NaN;
    # NaN would slip past the `sim <= 0` filter used by callers
    if len(set(ratings_1)) == 1 or len(set(ratings_2)) == 1:
        return 0

    r = pearsonr(ratings_1, ratings_2)[0]
    return 0 if np.isnan(r) else r
    
# Example: Pearson similarity between two critics from the sample data.
sim_pearson_fancy(critics,'Lisa Rose','Jack Matthews')

In [None]:
def topMatches(prefs, person, n=5, similarity = sim_pearson_fancy):
    """Return the best matches for ``person`` among the other keys of ``prefs``.

    Parameters:
        prefs: mapping of person -> mapping of item -> rating.
        person: the key of ``prefs`` to find matches for.
        n: maximum number of matches to return.
        similarity: callable ``(prefs, a, b) -> score`` used for ranking.

    Returns:
        Up to ``n`` ``(score, other_person)`` tuples, highest score first.
    """
    scored = []
    for other in prefs:
        if other == person:
            continue
        scored.append((similarity(prefs, person, other), other))
    # ascending sort, then flipped so the best match comes first
    ranked = sorted(scored)[::-1]
    return ranked[:n]

In [None]:
# The three critics most similar to Toby, using the default similarity function.
topMatches(critics,'Toby', n=3)

In [None]:
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson_fancy):
    """Recommend items for ``person`` from everyone else's ratings.

    Each candidate item is scored as the similarity-weighted average of the
    ratings given by the other people in ``prefs``. People whose similarity
    to ``person`` is zero or negative are ignored, and only items that
    ``person`` has not rated (or has rated 0) are considered.

    Parameters:
        prefs: mapping of person -> mapping of item -> rating.
        person: the key of ``prefs`` to build recommendations for.
        similarity: callable ``(prefs, a, b) -> score``.

    Returns:
        A list of ``(predicted_score, item)`` tuples, highest score first.
    """
    weighted_totals = {}
    similarity_totals = {}

    for other in prefs:
        if other == person:
            continue  # never compare a person with themselves
        weight = similarity(prefs, person, other)
        if weight <= 0:
            continue  # zero or negative similarity contributes nothing

        for item in prefs[other]:
            already_rated = item in prefs[person] and prefs[person][item] != 0
            if already_rated:
                continue
            # running sum of rating * similarity, and of similarity alone
            weighted_totals[item] = weighted_totals.get(item, 0) + prefs[other][item] * weight
            similarity_totals[item] = similarity_totals.get(item, 0) + weight

    # normalise each total by the similarity mass that produced it
    ranked = sorted(
        (total / similarity_totals[item], item)
        for item, total in weighted_totals.items()
    )
    return ranked[::-1]

In [None]:
# Recommendations for Toby, using the default similarity function.
getRecommendations(critics,'Toby')

In [None]:
# data : 2. from file
# the data contains three tables: users, rating, movies

# Each file is '::'-delimited with no header row; a multi-character separator
# needs the python parsing engine.
# NOTE(review): if reading raises UnicodeDecodeError, the files may not be
# UTF-8 encoded; confirm the encoding of the local copies.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip code']
users = pd.read_csv(
    'ml-1m/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)

In [None]:
# Combine the three tables into one frame: ratings joined to users, then to
# movies. With no explicit key, pandas joins on the columns the frames share.
data = ratings.merge(users).merge(movies)

In [None]:
# 5 most rated movies (the slices below keep five titles, not 25)
# NOTE: a cell only displays the value of its last expression, so the
# groupby result on the next line is computed but not shown.
data.groupby('title').size().sort_values(ascending=False)[:5]
# or, counting the rows per title directly
data.title.value_counts()[:5]

In [None]:
# most highly rated movies: number of ratings and mean rating per title,
# sorted by mean rating, top 10 shown.
# Aggregations are named by string; pandas deprecates passing NumPy callables
# such as np.size / np.mean here. Resulting column labels are unchanged.
data.groupby('title').agg({'rating':['size', 'mean']}).sort_values([('rating','mean')], ascending=False)[:10]

In [None]:
# Mean rating of every film, one row per title and one column per gender.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head(5)

In [None]:
# we wish to only look at movies that received more than a certain number of ratings
ratings_by_title = data.groupby('title').size()
# titles with at least 250 ratings (not displayed: only the last expression of a cell is shown)
ratings_by_title.index[ratings_by_title>=250][:5]
# or: count and mean per title, filtered by count and sorted by mean rating.
# NOTE(review): this variant uses a threshold of 100, not 250 — confirm which is intended.
# Aggregations are named by string; pandas deprecates np.size / np.mean here.
movie_status = data.groupby('title').agg({'rating':['size','mean']})
movie_status[movie_status[('rating','size')]>=100].sort_values([('rating','mean')], ascending=False).head()

In [None]:
# Titles that were rated at least 250 times.
active_titles = ratings_by_title[ratings_by_title >= 250].index

In [None]:
# Select the rows of mean_ratings whose title received at least 250 ratings.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
# An explicit copy is taken because a later cell adds a 'diff' column to this frame.
mean_ratings_select_moreThan250 = mean_ratings.loc[active_titles].copy()
mean_ratings_select_moreThan250[:5]

In [None]:
# Films ordered by mean rating among female viewers, best first; top 10 shown.
top_female_ratings = mean_ratings_select_moreThan250.sort_values('F', ascending=False)
top_female_ratings.head(10)

In [None]:
# Films ordered by mean rating among male viewers, best first; top 10 shown.
top_male_ratings = mean_ratings_select_moreThan250.sort_values('M', ascending=False)
top_male_ratings.head(10)

In [None]:
# How divisive each movie is between genders: male mean minus female mean,
# stored as a new 'diff' column (positive = rated higher by men).
mean_ratings_select_moreThan250['diff'] = (
    mean_ratings_select_moreThan250['M'].sub(mean_ratings_select_moreThan250['F'])
)

In [None]:
# Ascending 'diff' puts the most negative values first, i.e. the movies with
# the largest rating gap in favour of women.
sorted_by_diff = mean_ratings_select_moreThan250.sort_values('diff')
sorted_by_diff.head(10)

In [None]:
# Same ordering read from the other end: the 10 movies with the largest
# rating gap in favour of men.
sorted_by_diff.iloc[::-1].head(10)

In [None]:
# the 50 most-rated movies: number of ratings per movie_id, largest first, top 50 kept
# (the result is a Series of rating counts indexed by movie_id)
movie_50= data.groupby('movie_id').size().sort_values(ascending=False)[:50]

In [None]:
# Histogram (30 bins) of the age column of the users table.
users['age'].plot(kind='hist', bins=30)
plt.ylabel('count of users')
plt.title("Distribution of users' ages")
plt.xlabel('age')

In [None]:
# Bucket ages into decades with pandas.cut: bins are [0,10), [10,20), ... [70,80),
# labelled '0-9' through '70-79'.
label = ['{}-{}'.format(start, start + 9) for start in range(0, 80, 10)]
data['age_group'] = pd.cut(data['age'], bins=range(0, 81, 10), right=False, labels=label)
data[['age', 'age_group']].drop_duplicates().head(10)

In [None]:
# compare ratings across age groups: number of ratings and mean rating per group.
# Aggregations are named by string; pandas deprecates np.size / np.mean here.
data.groupby('age_group').agg({'rating':['size','mean']})

In [None]:
# how the 50 most rated movies are viewed across each age group
# Index the merged frame by movie_id so rows can be selected by movie id.
# NOTE: inplace=True mutates `data`; after this cell movie_id is the index
# rather than a column, until the later reset_index cell puts it back.
data.set_index('movie_id',inplace=True)

In [None]:
# Select the rows whose index label (movie_id at this point) is one of the
# ids in movie_50's index, then group them by title and age group.
by_age=data.loc[movie_50.index].groupby(['title','age_group'])

In [None]:
# Mean rating per (title, age_group); unstack(0) moves the first grouping
# level (title) into the columns, missing combinations are filled with 0,
# and the first rows are shown.
by_age.rating.mean().unstack(0).fillna(0).head()

In [None]:
# Turn movie_id back into a regular column (undoes the earlier in-place set_index).
data.reset_index('movie_id', inplace=True)


In [None]:
# Rating aggregated per (movie_id, title) row and gender column, using
# pivot_table's default aggregation; empty cells are filled with 0.
pivoted = data.pivot_table(
    values='rating',
    index=['movie_id', 'title'],
    columns='gender',
    fill_value=0,
)
pivoted.head()

In [None]:
# Gap between the male and female columns (positive = rated higher by men).
pivoted['diff'] = pivoted['M'] - pivoted['F']
pivoted.head()

In [None]:
# Move movie_id out of the (movie_id, title) index into a column; title stays
# as the index. inplace=True mutates `pivoted`.
pivoted.reset_index('movie_id', inplace=True)

In [None]:
# Rating gap for the 50 most-rated movies only. movie_50 is a Series of rating
# counts indexed by movie_id, so membership must be tested against its index;
# isin(movie_50) would compare movie ids with the counts instead.
disagreements=pivoted[pivoted.movie_id.isin(movie_50.index)]['diff']
disagreements.sort_values().plot(kind='barh',figsize=[9,10])
plt.title('Male vs. Female Avg. Ratings\n(Difference > 0 = Favored by Men)')
plt.ylabel('Title')
plt.xlabel('Average Rating Difference');

In [None]:
 # An aggregate of the movie ratings for each particular genre.
# Each element of data.genres is a plain str, so the per-row `.str` accessor
# raised AttributeError. Split the column in one vectorised call instead,
# expand to one row per (rating, genre) pair, then aggregate per genre.
# DataFrame.explode requires pandas >= 0.25.
genre_ratings = (
    data[['genres', 'rating']]
    .assign(genre=data.genres.str.split('|'))
    .explode('genre')
    .groupby('genre')['rating']
    .agg(['size', 'mean'])
    .sort_values('mean', ascending=False)
)
genre_ratings
    

In [None]:
##### The top 5 highest ranked genre by women.
# Groups by the genres column exactly as stored.
# NOTE(review): presumably each value is a '|'-separated combination rather
# than a single genre — confirm this is the intended grouping.
# The aggregation is named by string; pandas deprecates np.mean here.
data[data.gender=='F'].groupby('genres').agg({'rating': 'mean'}).sort_values(by='rating',ascending=False).head(5)


In [None]:
# The top 5 highest ranked genre by men.
# Groups by the genres column exactly as stored.
# NOTE(review): presumably each value is a '|'-separated combination rather
# than a single genre — confirm this is the intended grouping.
# The aggregation is named by string; pandas deprecates np.mean here.
data[data.gender=='M'].groupby('genres').agg({'rating': 'mean'}).sort_values(by='rating',ascending=False).head(5)

In [None]:
# A breakdown of a movie's ratings by age, use any movie of your choice.
# Mean rating per age value for one title, ordered by age.
# NOTE(review): head(5) shows only the first five age values — confirm the
# remaining ones are meant to be hidden.
# The aggregation is named by string; pandas deprecates np.mean here.
data[data.title=='One Flew Over the Cuckoo\'s Nest (1975)'].groupby('age').agg({'rating': 'mean'}).sort_values(by='age',ascending=True).head(5)

In [None]:
# A function that given a user id and movie id , returns a list of user ids for
# other users that rated the movie identi_ed by the provided movie id with the same score.

def findSimilarUser(data,useid, movieid):
    """Return the ids of other users who gave a movie the same score.

    Parameters:
        data: DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.
        useid: id of the reference user.
        movieid: id of the movie to compare on.

    Returns:
        A list of user ids, excluding ``useid``, whose rating of ``movieid``
        equals the rating ``useid`` gave it. The list is empty when ``useid``
        has not rated the movie, since there is then no score to match.
    """
    movie_ratings = data[data.movie_id == movieid]

    # rating the reference user gave this movie (both conditions in one mask)
    own_rating = movie_ratings.loc[movie_ratings.user_id == useid, 'rating']
    if own_rating.empty:
        return []
    score = own_rating.iloc[0]

    same_score = (movie_ratings.rating == score) & (movie_ratings.user_id != useid)
    return movie_ratings.loc[same_score, 'user_id'].tolist()
        
    

In [None]:
#For the following problems, genres were separated and a file containing that information was saved for future use. 
# Re-read the three '::'-delimited tables and rebuild the merged frame.
# Note that the last users column is named 'zip' here.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_csv(
    'ml-1m/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)

# ratings joined to users, then to movies, on their shared columns
data = ratings.merge(users).merge(movies)

data.head()