In [30]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [3]:
# Read the four CSV tables from ./data, one DataFrame per file.
links, movies, ratings, tags = (
    pd.read_csv('./data/links.csv'),
    pd.read_csv('./data/movies.csv'),
    pd.read_csv('./data/ratings.csv'),
    pd.read_csv('./data/tags.csv'),
)

In [78]:
# Left-join the movie columns onto every rating row via its movieId.
df = ratings.join(
    other=movies.set_index(keys='movieId'),
    on='movieId',
    how='left',
)

In [83]:
def top_for_genre(df, genre='Comedy', top=20):
    """Rank the films of one genre by a popularity-weighted mean rating.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per rating. Must contain the columns ``title``, ``rating``
        and ``genres`` (a ``'|'``-separated string of genre labels).
    genre : str, default 'Comedy'
        Genre label to keep. Compared against whole labels, never substrings.
    top : int, default 20
        Maximum number of films to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``mean``, ``count`` and ``weighted_rating``,
        sorted by ``weighted_rating`` in descending order.

    Notes
    -----
    ``weighted_rating = mean * (count - avg_count) / (max_count - min_count)``

    The weight is negative for films rated less often than the average film
    of the genre, so among those films a higher mean gives a lower score.
    When every film has the same number of ratings the weight is undefined
    (0 / 0); the score is then set to 0.0 instead of NaN.
    """
    # Keep only the ratings of films that carry the requested genre label.
    # Wrapping both sides in the separator turns a plain substring search into
    # an exact label match; rows with a missing genres value are excluded.
    has_genre = ('|' + df['genres'] + '|').str.contains(
        '|' + genre + '|', regex=False, na=False
    )
    df_by_genre = df[has_genre]

    # Aggregate the rating column only: averaging the remaining text columns
    # is meaningless and raises TypeError on pandas >= 2.0.
    df_agg = (
        df_by_genre
        .groupby(by='title')['rating']
        .agg(['mean', 'count'])
        .reset_index()
    )

    # Statistics of the number of ratings per film
    counts = df_agg['count']
    count_span = counts.max() - counts.min()

    # Weight the mean rating by the centred, range-normalised rating count
    if count_span > 0:
        df_agg['weighted_rating'] = df_agg['mean'] * (counts - counts.mean()) / count_span
    else:
        # All films share one count (or nothing matched): no popularity signal
        df_agg['weighted_rating'] = 0.0

    # Sort by the resulting metric and hand the best films back to the caller
    return df_agg.sort_values(by='weighted_rating', ascending=False).head(top)

In [88]:
# Collect every distinct genre label that occurs in the genres column
all_genres = df.genres.unique()
genres = {label for combo in all_genres for label in combo.split('|')}
genres

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [89]:
# Show the 10 highest-ranked 'Action' titles by weighted_rating.
top_for_genre(df, genre='Action', top=10)

Unnamed: 0,title,mean,count,weighted_rating
1038,"Matrix, The (1999)",4.192446,278,3.953796
1505,Star Wars: Episode IV - A New Hope (1977),4.231076,251,3.577811
249,Braveheart (1995),4.031646,237,3.205407
551,Fight Club (1999),4.272936,218,3.104158
861,Jurassic Park (1993),3.75,238,2.995019
1592,Terminator 2: Judgment Day (1991),3.970982,224,2.970812
1506,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211,2.956002
1279,Raiders of the Lost Ark (Indiana Jones and the...,4.2075,200,2.783209
1507,Star Wars: Episode VI - Return of the Jedi (1983),4.137755,196,2.677323
1393,Saving Private Ryan (1998),4.146277,188,2.563089


In [94]:
# Show the 10 highest-ranked 'Comedy' titles by weighted_rating.
top_for_genre(df, genre='Comedy', top=10)

Unnamed: 0,title,mean,count,weighted_rating
1181,Forrest Gump (1994),4.164134,329,4.044687
2713,Pulp Fiction (1994),4.197068,307,3.795166
3442,Toy Story (1995),3.92093,215,2.445697
1095,Fargo (1996),4.116022,181,2.140726
116,Aladdin (1992),3.79235,183,1.995509
280,Back to the Future (1985),4.038012,171,1.977043
3002,Shrek (2001),3.867647,170,1.881839
3472,True Lies (1994),3.497191,178,1.786888
2689,"Princess Bride, The (1987)",4.232394,142,1.698008
2187,Men in Black (a.k.a. MIB) (1997),3.487879,165,1.643891


In [93]:
# Show the 10 highest-ranked titles carrying the '(no genres listed)' label.
top_for_genre(df, genre='(no genres listed)', top=10)

Unnamed: 0,title,mean,count,weighted_rating
20,Pirates of the Caribbean: Dead Men Tell No Tal...,3.785714,7,3.544468
11,Green Room (2015),3.333333,3,0.898693
33,Whiplash (2013),4.75,2,0.488971
28,The Godfather Trilogy: 1972-1990 (1992),4.75,2,0.488971
6,Cosmos,4.5,2,0.463235
26,The Brand New Testament (2015),4.0,2,0.411765
10,Grease Live (2016),2.0,2,0.205882
4,Ben-hur (2016),0.5,1,-0.031863
22,Superfast! (2015),0.5,1,-0.031863
17,Maria Bamford: Old Baby,1.0,1,-0.063725


(no genres listed) — интересный жанр.

Странно, что у обладателя трёх «Оскаров» Whiplash (2013) не проставлен жанр и так мало оценок.