In [55]:
import pandas as pd

# Simple Demographic Filtering: Filter -> Scoring -> Sort

In [56]:
df = pd.read_csv("data/demographic.csv")

In [57]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# df.sort_values("vote_average", ascending=False)

### Step 1: Filter

In [59]:
genre = ["Animation"]
duration = (60, 150)
year = (2000, 2019)
topk = 20

In [60]:
df = df[df.release_year.between(year[0], year[1]) & df.runtime.between(duration[0], duration[1]) & df[genre].all(axis=1)]
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
2646,The Yards,Drama; Action; Thriller; Crime,115.0,6.0,98.0,2000,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3150,Gun Shy,Action; Comedy; Romance; Thriller,101.0,5.4,24.0,2000,1,0,0,1,...,0,0,0,0,1,0,0,1,0,0
3153,Knockout,Drama; Action,99.0,0.0,0.0,2000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3174,Pitch Black,Thriller; Science Fiction; Action,108.0,6.7,1812.0,2000,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
3192,Deterrence,Action; Drama; Mystery; Thriller,101.0,6.1,7.0,2000,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0


### Step 2: Scoring
kita tinggal pake vote average sebagai score

### Step 3: Sorting

In [61]:
recomendation = df.loc[:, "title":"release_year"]
recomendation = recomendation.sort_values("vote_average", ascending=False).head(topk)
recomendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
44314,Tokyo Ghoul,Action; Drama; Horror; Thriller,119.0,10.0,1.0,2017
30181,Backyard Dogs,Action; Comedy,96.0,10.0,1.0,2001
20164,Road to Redemption,Action; Comedy,89.0,10.0,1.0,2001
9378,High Roller: The Stu Ungar Story,Drama; Action,120.0,10.0,1.0,2003
42020,The River Thief,Crime; Action; Adventure,97.0,9.3,3.0,2016
33861,From Mexico With Love,Action; Adventure; Drama,96.0,9.0,1.0,2009
29219,Say Nothing,Action; Drama; Mystery; Romance; Science Ficti...,94.0,9.0,1.0,2001
41152,Frontier,Adventure; Action,82.0,9.0,1.0,2001
35577,"Fuse, Memoirs of the Hunter Girl",Action; Animation; Drama; History,110.0,8.8,4.0,2012
44297,Extraordinary Mission,Action; Crime; Thriller,117.0,8.7,3.0,2017


## Improve Demographic Filtering

In [62]:
def imdb_score(df, q=0.9):
    df = df.copy()
    m = df.vote_count.quantile(q)
    C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum() #rata2 rating semua film
    df = df[df.vote_count >= m]
    df["score"] = df.apply(lambda x: (x.vote_average * x.vote_count + C*m) / (x.vote_count  + m), axis=1)
    return df

In [63]:
df = imdb_score(df)

In [64]:
recomendation = df.loc[:, "title":"release_year"]
recomendation = recomendation.sort_values("vote_average", ascending=False).head(topk)
recomendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
15459,Inception,Action; Thriller; Science Fiction; Mystery; Ad...,148.0,8.1,14075.0,2010
9419,Oldboy,Drama; Thriller; Mystery; Action,120.0,8.0,2000.0,2003
23685,Guardians of the Galaxy,Action; Science Fiction; Adventure,121.0,7.9,10014.0,2014
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014
10828,V for Vendetta,Action; Thriller; Fantasy,132.0,7.7,4562.0,2006
6714,Kill Bill: Vol. 1,Action; Crime,111.0,7.7,5091.0,2003
7260,Kill Bill: Vol. 2,Action; Crime; Thriller,136.0,7.7,4061.0,2004
21557,Rush,Drama; Action,123.0,7.7,2310.0,2013
17799,Warrior,Action; Drama,140.0,7.7,1297.0,2011
21737,Captain Phillips,Action; Drama; Thriller,134.0,7.6,2495.0,2013


# Membungkus Kode diatas dengan Class

In [65]:
class RecommenderSystem:
    def __init__(self, data):
        self.df = pd.read_csv(data)

    def recommender(self, genre=None, duration=None, year=None, topk=10):
        df = self.df.copy()
        df = self.demographic_filter(df, genre=genre, duration=duration, year=year)
        df = self.compute_imdb_score(df)
        
        result = df.loc[:, "title":"release_year"]
        result = recomendation.sort_values("vote_average", ascending=False).head(topk)
        result = result.head(topk)
        return result

    @staticmethod
    def demographic_filter(df, genre=None, duration=None, year=None):
        if genre is not None:
            df =  df[df[genre].all(axis=1)]
        if year is not None:
            df = df[df.release_year.between(year[0], year[1])]
        if duration is not None:
            df = df[df.runtime.between(duration[0], duration[1])]
        return df

    @staticmethod
    def compute_imdb_score(df, q=0.9):
        df = df.copy()
        
        m = df.vote_count.quantile(q)
        C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum() #rata2 rating semua film
        df = df[df.vote_count >= m]
        
        df["score"] = df.apply(lambda x: (x.vote_average * x.vote_count + C*m) / (x.vote_count  + m), axis=1)
        return df
        

In [66]:
recsys = RecommenderSystem(data="data/demographic.csv")

In [68]:
recsys.recommender(genre=["Comedy"], year=(2015, 2020), duration=(60,150), topk=10)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
15459,Inception,Action; Thriller; Science Fiction; Mystery; Ad...,148.0,8.1,14075.0,2010
9419,Oldboy,Drama; Thriller; Mystery; Action,120.0,8.0,2000.0,2003
23685,Guardians of the Galaxy,Action; Science Fiction; Adventure,121.0,7.9,10014.0,2014
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014
10828,V for Vendetta,Action; Thriller; Fantasy,132.0,7.7,4562.0,2006
6714,Kill Bill: Vol. 1,Action; Crime,111.0,7.7,5091.0,2003
7260,Kill Bill: Vol. 2,Action; Crime; Thriller,136.0,7.7,4061.0,2004
21557,Rush,Drama; Action,123.0,7.7,2310.0,2013
17799,Warrior,Action; Drama,140.0,7.7,1297.0,2011
21737,Captain Phillips,Action; Drama; Thriller,134.0,7.6,2495.0,2013
