In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# načtení dat – filmy
movie_col_names = ['movie_id', 'title', 'genre']
movies = pd.read_csv('./movies.dat', sep="::", header=None, names=movie_col_names, engine="python", encoding="latin")

# načtení dat - hodnocení
ratings_col_names = ['movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./ratings.dat', sep="::", header=None, names=ratings_col_names, engine="python")

In [3]:
# spojení tabulek hodnocení a filmů
data = pd.merge(movies, ratings)

# odstranění datumu z názvu filmu
title = [] 
for values in data['title']:
    title.append(re.sub(r'(\([1-3][0-9]{3})\)', '', values))

data['title'] = title
data.head()

Unnamed: 0,movie_id,title,genre,rating,timestamp
0,1,Toy Story,Animation|Children's|Comedy,5,978824268
1,1,Toy Story,Animation|Children's|Comedy,4,978237008
2,1,Toy Story,Animation|Children's|Comedy,4,978233496
3,1,Toy Story,Animation|Children's|Comedy,5,978225952
4,1,Toy Story,Animation|Children's|Comedy,5,978226474


In [4]:
############## Výpočet hodnocení filmů ############## 

mean_ratings = data.pivot_table(index='title', values='rating', aggfunc=[np.mean, len])

# odstranění prvního řádku atributů pro usnadnění práce
mean_ratings.columns = mean_ratings.columns.droplevel(1)
mean_ratings.columns.name = None
mean_ratings = mean_ratings.reset_index()

# pomocné parametry pro následující výpočet
m = (mean_ratings['mean'] * mean_ratings['len'] / mean_ratings['len'].sum(axis=0)).sum(axis=0)
C = mean_ratings['len'].quantile(0.25)

# výpočet hodnocení pomocí bayesova průměru
mean_ratings['score'] = (C * m + mean_ratings['len'] * mean_ratings['mean']) / (C + mean_ratings['len'])

# setřízení
mean_ratings = mean_ratings.sort_values(by='score', ascending=False)

mean_ratings


  mean_ratings = data.pivot_table(index='title', values='rating', aggfunc=[np.mean, len])


Unnamed: 0,title,mean,len,score
2934,"Shawshank Redemption, The",4.554558,2227,4.540350
2905,Seven Samurai (The Magnificent Seven) (Shichin...,4.560510,628,4.511636
1345,"Godfather, The",4.524966,2223,4.511167
3464,"Usual Suspects, The",4.517106,1783,4.500106
2866,Schindler's List,4.510417,2304,4.497301
...,...,...,...,...
247,Baby Geniuses,1.701220,164,2.016201
3078,Speed 2: Cruise Control,1.871935,367,2.012979
2579,Police Academy 6: City Under Siege,1.657718,149,2.006547
1786,Kazaam,1.466667,120,1.922821


In [5]:
############## Získání datumu z timestampu ############## 

data = data.pivot_table(index=['title', 'genre'], values='timestamp', aggfunc=np.max)

# odstranění prvního řádku atributů
data.columns.name = None
data = data.reset_index()

years = []
months = []
days = []

# výpočet datumu z timestampu
for values in data['timestamp']:
    value = pd.Timestamp(values, unit='s')
    years.append(value.year)
    months.append(value.month)
    days.append(value.day)

data['year'] = years
data['month'] = months
data['day'] = days

data.drop(columns=['timestamp'], inplace=True)
data.head()

  data = data.pivot_table(index=['title', 'genre'], values='timestamp', aggfunc=np.max)


Unnamed: 0,title,genre,year,month,day
0,"$1,000,000 Duck",Children's|Comedy,2002,11,25
1,'Night Mother,Drama,2003,1,28
2,'Til There Was You,Drama|Romance,2002,10,7
3,"'burbs, The",Comedy,2003,2,25
4,...And Justice for All,Drama|Thriller,2003,1,10


In [6]:
# spojení by mělo zůstat setřízené
data_extended = pd.merge(mean_ratings, data)

genres = data_extended['genre'].str.split('|')

g = set([])
for i in genres:
    g = g.union(set(i))

for j in g:
    data_extended[j] = data_extended['genre'].str.contains(str(j))

data_extended.drop(columns=['genre','len','score'], inplace=True)

# spojení výsledných top 100 filmů každé kategorie
final = pd.DataFrame()

for j in g:
    final = pd.concat([final, data_extended.query('`' + j + '`').head(100)], ignore_index = True)
    
final

Unnamed: 0,title,mean,year,month,day,Sci-Fi,War,Horror,Adventure,Western,...,Musical,Thriller,Animation,Film-Noir,Romance,Documentary,Crime,Children's,Action,Comedy
0,Star Wars: Episode IV - A New Hope,4.453694,2003,2,22,True,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
1,Dr. Strangelove or: How I Learned to Stop Worr...,4.449890,2003,2,21,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,"Matrix, The",4.315830,2003,2,6,True,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
3,Star Wars: Episode V - The Empire Strikes Back,4.292977,2003,2,27,True,True,False,True,False,...,False,False,False,False,False,False,False,False,True,False
4,Blade Runner,4.273333,2003,2,15,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674,Clerks,3.946884,2003,2,25,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1675,Strictly Ballroom,3.951183,2003,1,27,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1676,"Blues Brothers, The",3.939597,2003,2,6,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
1677,Swingers,3.941476,2003,2,6,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
