### Постройте топ фильмов в категориях Action и Comedy

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [4]:
# Load the four CSV tables from the working directory in one pass; the
# unpacking order matches the order of the file names.
links, movies, ratings, tags = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)

In [5]:
# Preview the first five rows of the links table (head() defaults to n=5).
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Preview the first five rows of the movies table (head() defaults to n=5).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first five rows of the ratings table (head() defaults to n=5).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Preview the first five rows of the tags table (head() defaults to n=5).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


#### Сформируем итоговый рейтинг на основе оценок, взвешенных по количеству голосов (сумма всех оценок фильма)

In [9]:
# Select the movies whose genre string mentions Action / Comedy.
is_action = movies['genres'].str.contains('Action')
is_comedy = movies['genres'].str.contains('Comedy')
action = movies[is_action]
comedy = movies[is_comedy]

# Attach the individual user ratings. A left join keeps every selected movie,
# so a movie without any rating stays as a single row with NaN userId/rating.
action_ratings = action.merge(ratings, on='movieId', how='left')
comedy_ratings = comedy.merge(ratings, on='movieId', how='left')

# Keep only the fields needed further on (timestamp and genres are dropped).
rating_fields = ['movieId', 'title', 'userId', 'rating']
a = action_ratings[rating_fields]
c = comedy_ratings[rating_fields]



In [10]:
# Number of votes for every (movie, rating value) pair. After count() the
# 'userId' column holds that number of votes. Rows with a NaN rating (movies
# that nobody rated) are dropped by groupby's default handling of NaN keys.
vote_keys = ['movieId', 'title', 'rating']
a_count = a.groupby(vote_keys, as_index=False).count()
c_count = c.groupby(vote_keys, as_index=False).count()

# Rating weight = rating value * number of votes cast for that value.
# The comedy variable is spelled with a plain ASCII "c" so that it matches the
# other c_* names (the previous spelling appeared to use a look-alike
# Cyrillic letter).
a_weight_rate = a_count['rating'].astype(float) * a_count['userId']
c_weight_rate = c_count['rating'].astype(float) * c_count['userId']

# Add the weight as a new float column. assign() aligns on the shared index
# and does not depend on the name of the Series being attached.
a_new_rate = a_count.assign(weight_rate=a_weight_rate.astype(float))
c_new_rate = c_count.assign(weight_rate=c_weight_rate.astype(float))

# Total weight per title (i.e. the sum of all ratings), best first.
# NOTE(review): grouping by 'title' pools distinct movieIds that share a title;
# it is kept because the later merge in this notebook is keyed on 'title' —
# confirm whether per-movieId totals are wanted instead.
a_rating = (
    a_new_rate.groupby('title', as_index=False)[['weight_rate']]
    .sum()
    .sort_values(by='weight_rate', ascending=False)
)
c_rating = (
    c_new_rate.groupby('title', as_index=False)[['weight_rate']]
    .sum()
    .sort_values(by='weight_rate', ascending=False)
)



In [12]:
# Display the Action ranking: one row per title, highest weight_rate first.
a_rating

Unnamed: 0,title,weight_rate
1038,"Matrix, The (1999)",1165.5
1505,Star Wars: Episode IV - A New Hope (1977),1062.0
249,Braveheart (1995),955.5
551,Fight Club (1999),931.5
861,Jurassic Park (1993),892.5
...,...,...
274,Captain America (1979),0.5
412,Derailed (2002),0.5
1727,Unforgiven (2013),0.5
852,Journey 2: The Mysterious Island (2012),0.5


In [11]:
# Display the Comedy ranking: one row per title, highest weight_rate first.
c_rating

Unnamed: 0,title,weight_rate
1181,Forrest Gump (1994),1370.0
2713,Pulp Fiction (1994),1288.5
3442,Toy Story (1995),843.0
1095,Fargo (1996),745.0
116,Aladdin (1992),694.0
...,...,...
2888,Saving Christmas (2014),0.5
1442,"Haunted House 2, A (2014)",0.5
2934,Secret Society (2002),0.5
1426,Hard Ticket to Hawaii (1987),0.5


#### Попробуем учесть количество тегов, проставленных каждому фильму

In [107]:
# Attach the weighted rating to each genre subset (joined on title) and order
# the result by that rating, best first. Movies without a rating get NaN.
# NOTE(review): because the join key is 'title', movies that share a title
# receive the same weight_rate — confirm this is intended.
comedy_w = comedy.merge(c_rating, on='title', how='left').sort_values(by='weight_rate', ascending=False)
action_w = action.merge(a_rating, on='title', how='left').sort_values(by='weight_rate', ascending=False)

# Join every tag row to the genre subset. The left join keeps all tags, so
# tags of movies outside the subset end up with NaN in the movie columns...
tags_cw = tags.merge(comedy_w, on='movieId', how='left')
tags_aw = tags.merge(action_w, on='movieId', how='left')

# ...and are filtered out here, leaving only tags of Comedy / Action movies.
tags_cw_nn = tags_cw[tags_cw['genres'].notna()]
tags_aw_nn = tags_aw[tags_aw['genres'].notna()]

# Number of tags per movie, most-tagged first.
tag_rate_c = tags_cw_nn.groupby('movieId', as_index=False)[['tag']].count().sort_values(by='tag', ascending=False)
tag_rate_a = tags_aw_nn.groupby('movieId', as_index=False)[['tag']].count().sort_values(by='tag', ascending=False)

# Add the tag count to the rated movie list (NaN for movies without tags).
c_fin_rate = comedy_w.merge(tag_rate_c, on='movieId', how='left')
a_fin_rate = action_w.merge(tag_rate_a, on='movieId', how='left')

# Replace missing values with 0. The result is assigned back instead of calling
# fillna(inplace=True) on a selected column: that chained form is deprecated
# and does not modify the frame when pandas Copy-on-Write is enabled.
c_fin_rate = c_fin_rate.fillna({'weight_rate': 0, 'tag': 0})
a_fin_rate = a_fin_rate.fillna({'weight_rate': 0, 'tag': 0})

# Final score = weighted rating + number of tags, computed for every row
# (no hard-coded row limits, so the score stays defined if the data grows).
c_fin_rate['final_rate'] = c_fin_rate['weight_rate'] + c_fin_rate['tag']
a_fin_rate['final_rate'] = a_fin_rate['weight_rate'] + a_fin_rate['tag']

In [164]:
# Show the Comedy table ordered by the final score, best first.
c_fin_rate.sort_values('final_rate', ascending=False)

Unnamed: 0,movieId,title,genres,weight_rate,tag,final_rate
1,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,1288.5,181.0,1469.5
0,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1370.0,9.0,1379.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,843.0,3.0,846.0
3,608,Fargo (1996),Comedy|Crime|Drama|Thriller,745.0,5.0,750.0
4,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,694.0,1.0,695.0
...,...,...,...,...,...,...
3732,165645,Bad Santa 2 (2016),Comedy,0.5,0.0,0.5
3731,110773,"Haunted House 2, A (2014)",Comedy|Horror,0.5,0.0,0.5
3729,92681,Journey 2: The Mysterious Island (2012),Action|Adventure|Comedy|Sci-Fi|IMAX,0.5,0.0,0.5
3728,91414,Arthur Christmas (2011),Animation|Children|Comedy|Drama,0.5,0.0,0.5


In [165]:
# Show the Action table ordered by the final score, best first.
a_fin_rate.sort_values('final_rate', ascending=False)

Unnamed: 0,movieId,title,genres,weight_rate,tag,final_rate
0,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,1165.5,5.0,1170.5
1,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1062.0,26.0,1088.0
3,2959,Fight Club (1999),Action|Crime|Drama|Thriller,931.5,54.0,985.5
2,110,Braveheart (1995),Action|Drama|War,955.5,10.0,965.5
6,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,889.5,10.0,899.5
...,...,...,...,...,...,...
1807,157172,Wizards of the Lost Kingdom II (1989),Action|Fantasy,0.5,0.0,0.5
1806,104017,3 dev adam (Three Giant Men) (1973),Action|Adventure|Sci-Fi,0.5,0.0,0.5
1805,5700,The Pumaman (1980),Action|Adventure|Fantasy|Sci-Fi,0.5,0.0,0.5
1804,4750,3 Ninjas Knuckle Up (1995),Action|Children,0.5,0.0,0.5
