In [14]:
import pandas as pd

# Simple Demographic Filtering: Filter -> Scoring -> Sort

In [15]:
df = pd.read_csv("data/demographic.csv")

In [16]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# df.sort_values("vote_average", ascending=False)

### Step 1: Filter

In [18]:
genre = ["Drama"]
duration = (60, 150)
year = (2000, 2019)
topk = 20

In [19]:
df = df[df.release_year.between(year[0], year[1]) & df.runtime.between(duration[0], duration[1]) & df[genre].all(axis=1)]
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
704,Two Friends,Drama; Foreign,86.0,0.0,0.0,2002,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
861,Venice,Drama; Romance,110.0,7.5,4.0,2010,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2646,The Yards,Drama; Action; Thriller; Crime,115.0,6.0,98.0,2000,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3064,My Dog Skip,Comedy; Drama; Family,95.0,6.5,71.0,2000,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3099,Down to You,Comedy; Drama; Family; Romance,91.0,4.9,71.0,2000,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


### Step 2: Scoring
kita tinggal pake vote average sebagai score

### Step 3: Sorting

In [20]:
recomendation = df.loc[:, "title":"release_year"]
recomendation = recomendation.sort_values("vote_average", ascending=False).head(topk)
recomendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
27822,Mentor,Drama,92.0,10.0,1.0,2006
39620,Bazodee,Romance; Drama; Music,101.0,10.0,1.0,2016
40528,Farewell Baghdad,Drama,105.0,10.0,1.0,2014
36935,Shuttlecock Boys,Drama,76.0,10.0,1.0,2012
44314,Tokyo Ghoul,Action; Drama; Horror; Thriller,119.0,10.0,1.0,2017
27089,Willow and Wind,Drama,77.0,10.0,1.0,2000
32303,Butterfly,Science Fiction; Drama; Fantasy; Romance,103.0,10.0,3.0,2015
21562,Almost Kings,Drama,93.0,10.0,2.0,2010
31595,The Kreutzer Sonata,Drama; Romance,99.0,10.0,1.0,2008
27729,Kolka Cool,Drama; Comedy,97.0,10.0,2.0,2011


## Improve Demographic Filtering

In [21]:
def imdb_score(df, q=0.9):
    df = df.copy()
    m = df.vote_count.quantile(q)
    C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum() #rata2 rating semua film
    df = df[df.vote_count >= m]
    df["score"] = df.apply(lambda x: (x.vote_average * x.vote_count + C*m) / (x.vote_count  + m), axis=1)
    return df

In [22]:
df = imdb_score(df)

In [23]:
recomendation = df.loc[:, "title":"release_year"]
recomendation = recomendation.sort_values("vote_average", ascending=False).head(topk)
recomendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
23605,Whiplash,Drama,105.0,8.3,4376.0,2014
5868,City of God,Drama; Crime,130.0,8.2,1852.0,2002
18433,The Intouchables,Drama; Comedy,112.0,8.2,5410.0,2011
25391,Mommy,Drama,138.0,8.2,734.0,2014
32011,Room,Drama; Thriller,117.0,8.1,2838.0,2015
38506,The Handmaiden,Thriller; Drama; Romance,145.0,8.1,453.0,2016
5847,The Pianist,Drama; War,150.0,8.1,1927.0,2002
40881,Lion,Drama,118.0,8.0,1699.0,2016
31652,Mustang,Drama,97.0,8.0,378.0,2015


# Membungkus Kode diatas dengan Class

In [24]:
class RecommenderSystem:
    def __init__(self, data):
        self.df = pd.read_csv(data)

    def recommender(self, genre=None, duration=None, year=None, topk=10):
        df = self.df.copy()
        df = self.demographic_filter(df, genre=genre, duration=duration, year=year)
        df = self.compute_imdb_score(df)
        
        result = df.loc[:, "title":"release_year"]
        result = recomendation.sort_values("vote_average", ascending=False).head(topk)
        result = result.head(topk)
        return result

    @staticmethod
    def demographic_filter(df, genre=None, duration=None, year=None):
        if genre is not None:
            df =  df[df[genre].all(axis=1)]
        if year is not None:
            df = df[df.release_year.between(year[0], year[1])]
        if duration is not None:
            df = df[df.runtime.between(duration[0], duration[1])]
        return df

    @staticmethod
    def compute_imdb_score(df, q=0.9):
        df = df.copy()
        
        m = df.vote_count.quantile(q)
        C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum() #rata2 rating semua film
        df = df[df.vote_count >= m]
        
        df["score"] = df.apply(lambda x: (x.vote_average * x.vote_count + C*m) / (x.vote_count  + m), axis=1)
        return df
        

In [25]:
recsys = RecommenderSystem(data="data/demographic.csv")

In [26]:
recsys.recommender(genre=["Action"], year=(2010, 2020), duration=(60,150), topk=10)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
23605,Whiplash,Drama,105.0,8.3,4376.0,2014
5868,City of God,Drama; Crime,130.0,8.2,1852.0,2002
18433,The Intouchables,Drama; Comedy,112.0,8.2,5410.0,2011
25391,Mommy,Drama,138.0,8.2,734.0,2014
32011,Room,Drama; Thriller,117.0,8.1,2838.0,2015
38506,The Handmaiden,Thriller; Drama; Romance,145.0,8.1,453.0,2016
5847,The Pianist,Drama; War,150.0,8.1,1927.0,2002
40881,Lion,Drama,118.0,8.0,1699.0,2016
31652,Mustang,Drama,97.0,8.0,378.0,2015
