# Loading Libraries and API Package

In [1]:
from tmdbv3api import TMDb
from tmdbv3api import Movie
from tmdbv3api import TV
from tmdbv3api import Genre, Search

import os

# The TMDB key is read from the environment so the secret never lives in the
# notebook. NOTE(review): a key was previously committed on this line and
# should be treated as leaked - rotate it in the TMDB account settings.
_tmdb_api_key = os.environ.get('TMDB_API_KEY')
if not _tmdb_api_key:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable before running this notebook"
    )

# Shared client configuration used by every tmdbv3api object created below.
tmdb = TMDb()
tmdb.api_key = _tmdb_api_key
tmdb.language = 'en'
tmdb.debug = True

# Endpoint wrappers used later for title search and genre lookup.
movie = Movie()
tv = TV()
genre = Genre()

In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import json as json
import pickle
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox

%matplotlib inline

# Loading STC JAWWY Data

In [5]:
# Read the raw JAWWY viewing log and discard the stray index column that was
# saved into the file as 'Unnamed: 0'.
stc = pd.read_csv('Final_Dataset.txt', sep=",").drop(columns='Unnamed: 0')

In [6]:
# Copy the raw log and tag every distinct 'original_name' with an integer id
# (the group number assigned by groupby).
df = stc.assign(vid_id=stc.groupby(['original_name']).ngroup())
df.head()

Unnamed: 0,date_,user_id_maped,program_name,duration_seconds,program_class,season,episode,program_desc,program_genre,series_title,hd,original_name,vid_id
0,2017-05-27,26138,100 treets,40,MOVIE,0,0,Drama Movie100 Streets,Drama,0,0,100 treets,2
1,2017-05-21,7946,Moana,17,MOVIE,0,0,Animation MovieMoana (HD),Animation,0,1,Moana,924
2,2017-08-10,7418,The Mermaid Princess,8,MOVIE,0,0,Animation MovieThe Mermaid Princess (HD),Animation,0,1,The Mermaid Princess,1524
3,2017-07-26,19307,The Mermaid Princess,76,MOVIE,0,0,Animation MovieThe Mermaid Princess (HD),Animation,0,1,The Mermaid Princess,1524
4,2017-07-07,15860,Churchill,87,MOVIE,0,0,Biography MovieChurchill (HD),Biography,0,1,Churchill,317


In [7]:
# Collapse runs of whitespace (and trim the ends) in every title.
df['original_name'] = df['original_name'].map(lambda title: " ".join(title.split()))

In [8]:
# Distinct titles, split by whether the row is classed as a MOVIE or not
is_movie = df['program_class'] == 'MOVIE'
movie_orig = list(set(df.loc[is_movie, 'original_name']))
tv_orig = list(set(df.loc[~is_movie, 'original_name']))

In [9]:
# Report how many distinct titles fall in each class.
print(
    "Number of TV Shows {}".format(len(tv_orig)),
    "Number of Movies {}".format(len(movie_orig)),
    sep="\n",
)

Number of TV Shows 276
Number of Movies 1525


Problem: the STC JAWWY dataset does not contain enough information about the content itself (movies and TV shows), such as release year, overview, genres, ratings, and votes. These variables are important for building a content-based recommendation system. The goal is to give the engine the name of a movie the user watched and get back a list of similar movies, based on the content of the movies already seen. For example, if I watched Avengers, I should get other superhero movies.

In the STC dataset we also do not have a unique ID for each movie and TV show, so in order to get more data about our movies and TV shows we look each title up by name in TMDB.

## Find STC JAWWY Movies and TV Shows in the TMDB API

In [10]:
def get_tmdb_id(names, types):
    """Search TMDB for each name and keep the first result of every search.

    Parameters
    ----------
    names : iterable of str
        Titles (movies or TV shows) to look up.
    types : object
        Search client exposing ``search(name)``; the notebook passes its
        ``movie`` or ``tv`` object.

    Returns
    -------
    tuple of three lists
        ``(ids, names_not_found, items)``: the ``id`` of each first result,
        the names whose search came back empty, and the first-result objects
        themselves (in the same order as ``ids``).
    """
    found_ids, missing_names, found_items = [], [], []

    for title in names:
        hits = types.search(title)

        if len(hits) == 0:
            # An empty result set means TMDB has nothing under this name.
            missing_names.append(title)
            continue

        first_hit = hits[0]  # only the first search result is kept
        found_items.append(first_hit)
        found_ids.append(first_hit.id)

    return found_ids, missing_names, found_items

In [11]:
# Look up every JAWWY title in TMDB. Each call returns, in order: the ids found,
# the names with no search result, and the first-result objects.
# get_tmdb_id issues one search per name, so this cell makes
# len(movie_orig) + len(tv_orig) search calls.
movie_api_id, movie_name_not_found, movie_dic = get_tmdb_id(movie_orig, movie)
tv_api_id, tv_name_not_found, tv_dic = get_tmdb_id(tv_orig, tv)

In [12]:
# get all the genres ids list from tmdb
genres_tv = genre.tv_list()
genres_movie = genre.movie_list()
# extend() mutates genres_tv in place, so all_genres below is the very same
# object as genres_tv: the TV genres followed by the movie genres.
genres_tv.extend(genres_movie)
all_genres = genres_tv

In [13]:
# Summarise how many titles were matched in TMDB, per class.
print(
    "TV Shows in STC JAWWY     {}".format(len(tv_orig)),
    "TV Shows ID Found in TMDB {}".format(len(tv_api_id)),
    "TV Shows ID Not Found     {}".format(len(tv_name_not_found)),
    "",
    "Movies in STC JAWWY       {}".format(len(movie_orig)),
    "Movies ID Found in TMDB   {}".format(len(movie_api_id)),
    "Movies ID Not Found       {}".format(len(movie_name_not_found)),
    sep="\n",
)

TV Shows in STC JAWWY     276
TV Shows ID Found in TMDB 134
TV Shows ID Not Found     142

Movies in STC JAWWY       1525
Movies ID Found in TMDB   1330
Movies ID Not Found       195


## Save dictionary of movies and tv shows to CSV

In [14]:
def export_list_dic(dic, filename):
    """Write a sequence of dict-like records to ``<filename>.csv``.

    Parameters
    ----------
    dic : sequence of dict-like
        Records to export; each must provide ``keys()`` and be accepted by
        ``csv.DictWriter.writerow``.
    filename : str
        Output path without the ``.csv`` extension.

    Returns
    -------
    None
        The CSV is written as a side effect. An I/O failure is reported with
        ``print`` and not re-raised.

    Raises
    ------
    ValueError
        If ``dic`` is empty, because there is no record to derive a header from.
    """
    import csv

    if len(dic) == 0:
        raise ValueError(
            "export_list_dic: no records to write to {}.csv".format(filename)
        )

    # Build the header from every record, not just the first one, so a record
    # carrying an extra field no longer aborts the export. Columns keep
    # first-seen order; a record lacking a column gets an empty cell.
    csv_columns = []
    seen = set()
    for record in dic:
        for key in record.keys():
            if key not in seen:
                seen.add(key)
                csv_columns.append(key)

    csv_file = "{}.csv".format(filename)
    try:
        # newline='' is required by the csv module (otherwise blank rows appear
        # on Windows); utf-8 keeps non-ASCII titles/overviews writable and
        # matches the default encoding pandas.read_csv uses to read it back.
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for record in dic:
                writer.writerow(record)
    except IOError as err:
        # Best-effort as before, but say which file failed and why.
        print("I/O error writing {}: {}".format(csv_file, err))

In [15]:
# Persist the TMDB search results and the genre list, one CSV per collection.
for records, stem in ((tv_dic, 'tv_df'), (movie_dic, 'movie_df'), (all_genres, 'genre_df')):
    export_list_dic(records, stem)

## Fewer than 50% of the TV shows (134 of 276) and about 87% of the movies (1330 of 1525) were found in the TMDB API.
## Note that JAWWY has Arabic movies and TV shows, and we can assume some of them are not in TMDB. Typos are another issue: TMDB cannot be searched with a misspelled title, so the name has to match the content's correct original name.

In [16]:
# Reload the exported TMDB results; exact duplicate rows are dropped from the
# genre table.
movie_df, tv_df = (pd.read_csv(path) for path in ('movie_df.csv', 'tv_df.csv'))
genre_df = pd.read_csv('genre_df.csv').drop_duplicates()

In [17]:
# Preview the first two TV rows as loaded from tv_df.csv.
tv_df.head(2)

Unnamed: 0,backdrop_path,first_air_date,genre_ids,id,name,origin_country,original_language,original_name,overview,popularity,poster_path,vote_average,vote_count
0,/xxBB3y7Q9h9qRN5bggjb0ZZ4ZWc.jpg,2006-04-10,[16],2424,Pinky Dinky Doo,['US'],en,Pinky Dinky Doo,Pinky Dinky Doo dances her way into her Story ...,3.158,/aphrMB7w0O2WDIGFWQsMcLyW17S.jpg,6.7,3
1,/pOU3IrRJ1lAruhYxmFf1lg1P8zE.jpg,2013-10-14,"[10759, 35, 10751, 10765]",58981,The Thundermans,['US'],en,The Thundermans,"Meet The Thundermans, a typical suburban famil...",53.042,/mjbjpF2vIfunNd0JPE1UCvB7akr.jpg,8.1,446


In [18]:
# Label every row with the kind of program it describes
movie_df = movie_df.assign(program_type='Movie')
tv_df = tv_df.assign(program_type='TV Show')

### Notice that the genre_ids column holds lists of genre ids that refer to the genres API. We will convert these lists into the real genre names for movies and TV shows.

In [20]:
# Preview the TV rows, now including the program_type column.
tv_df.head()

Unnamed: 0,backdrop_path,first_air_date,genre_ids,id,name,origin_country,original_language,original_name,overview,popularity,poster_path,vote_average,vote_count,program_type
0,/xxBB3y7Q9h9qRN5bggjb0ZZ4ZWc.jpg,2006-04-10,[16],2424,Pinky Dinky Doo,['US'],en,Pinky Dinky Doo,Pinky Dinky Doo dances her way into her Story ...,3.158,/aphrMB7w0O2WDIGFWQsMcLyW17S.jpg,6.7,3,TV Show
1,/pOU3IrRJ1lAruhYxmFf1lg1P8zE.jpg,2013-10-14,"[10759, 35, 10751, 10765]",58981,The Thundermans,['US'],en,The Thundermans,"Meet The Thundermans, a typical suburban famil...",53.042,/mjbjpF2vIfunNd0JPE1UCvB7akr.jpg,8.1,446,TV Show
2,/vQwKpCOG2MM4F6dFCvxkzb1Pz1K.jpg,2015-08-22,[35],62238,Blunt Talk,['US'],en,Blunt Talk,A British newscaster moves to Los Angeles with...,12.394,/tPPJbwR1Z1zZzTvXTu3DaCfUpgk.jpg,6.8,44,TV Show
3,/D2DL5b9HBAy9jST2ZaDJLNHEsY.jpg,2014-06-07,"[80, 18]",54650,Power,['US'],en,Power,A successful New York entrepreneur lives a dou...,60.428,/ctftzvyj8b0odco7EoS9VfJhf7K.jpg,7.7,1015,TV Show
4,/4DgAubucJP6y2yX2Yx4CLEgdIPA.jpg,2006-09-25,"[10765, 18]",1639,Heroes,['US'],en,Heroes,Common people discover that they have super po...,110.135,/7Cn1Sj5yipu7tBX14BbaXuBHbL1.jpg,7.5,1633,TV Show


In [21]:
def convert_string_list(df, column):
    """Parse a column of JSON-encoded list strings into real Python lists.

    The column is replaced on ``df`` itself. Cells that are not strings
    (already-parsed lists, missing values) are left untouched, so running the
    function a second time is a no-op instead of an error.

    input:
    df: dataframe, modified in place
    column: name (str) of the column whose values look like "[1, 2, 3]"

    output:
    None. A before/after sample of one cell is printed.
    """
    values = df[column]
    if len(values) == 0:
        return  # nothing to convert and nothing to preview

    # Preview the second row when there is one (what the notebook output
    # shows), otherwise the only row. Positional access avoids a KeyError on
    # frames whose index is not 0..n-1.
    probe = 1 if len(values) > 1 else 0
    print('Before: {}, type: {}'.format(values.iloc[probe], type(values.iloc[probe])))

    # Assign the whole column at once: the element-wise chained assignment
    # df[column][i] = ... is not guaranteed to write through to df.
    df[column] = values.map(
        lambda cell: json.loads(cell) if isinstance(cell, str) else cell
    )

    print('After: {}, type: {}'.format(df[column].iloc[probe], type(df[column].iloc[probe])))


In [22]:
# Parse the genre id strings of both frames (movies first, then TV).
for frame in (movie_df, tv_df):
    convert_string_list(frame, 'genre_ids')

Before: [35], type: <class 'str'>
After: [35], type: <class 'list'>
Before: [10759, 35, 10751, 10765], type: <class 'str'>
After: [10759, 35, 10751, 10765], type: <class 'list'>


In [23]:
def id_to_genre(df, genres=None):
    """Replace the genre ids in ``df['genre_ids']`` with genre names.

    input:
    df: dataframe with a genre_ids column holding lists of ids, e.g. [35, 53];
        modified in place
    genres: optional lookup dataframe with 'id' and 'name' columns; defaults
        to the notebook-level ``genre_df``

    output:
    None. df['genre_ids'] ends up holding lists of names, e.g.
    ['Comedy', 'Thriller']; names appear in the row order of the lookup table.
    A before/after sample of one cell is printed.
    """
    if genres is None:
        genres = genre_df  # notebook-level table read from genre_df.csv

    # (id, name) pairs in table order. Filtering these pairs gives the same
    # names, in the same order, as selecting rows with genres['id'].isin(ids).
    pairs = list(zip(genres['id'], genres['name']))

    def _to_names(ids):
        if not isinstance(ids, (list, tuple)):
            return ids  # e.g. a missing value: leave as is
        if ids and all(isinstance(item, str) for item in ids):
            # Already names: a second run must not wipe them to [].
            return ids
        wanted = set(ids)
        return [name for gid, name in pairs if gid in wanted]

    values = df['genre_ids']
    if len(values) == 0:
        return  # nothing to convert and nothing to preview

    # Preview the second row when there is one, otherwise the only row.
    probe = 1 if len(values) > 1 else 0
    print('Before: {}'.format(values.iloc[probe]))

    # Whole-column assignment; the chained form df.loc[:, 'genre_ids'][i] = ...
    # is not guaranteed to write through to df.
    df['genre_ids'] = values.map(_to_names)

    print('After: {}'.format(df['genre_ids'].iloc[probe]))


In [24]:
# Translate genre ids to names in both frames (TV first, then movies).
for frame in (tv_df, movie_df):
    id_to_genre(frame)

Before: [10759, 35, 10751, 10765]
After: ['Action & Adventure', 'Comedy', 'Family', 'Sci-Fi & Fantasy']
Before: [35]
After: ['Comedy']


## Now we have two dataframes with enough data points for content recommendation
### We will concatenate movie_df and tv_df into one final df, which we can then join to our original dataframe (stc)

In [25]:
# Preview the TV rows after genre ids were replaced by genre names.
tv_df.head(2)

Unnamed: 0,backdrop_path,first_air_date,genre_ids,id,name,origin_country,original_language,original_name,overview,popularity,poster_path,vote_average,vote_count,program_type
0,/xxBB3y7Q9h9qRN5bggjb0ZZ4ZWc.jpg,2006-04-10,[Animation],2424,Pinky Dinky Doo,['US'],en,Pinky Dinky Doo,Pinky Dinky Doo dances her way into her Story ...,3.158,/aphrMB7w0O2WDIGFWQsMcLyW17S.jpg,6.7,3,TV Show
1,/pOU3IrRJ1lAruhYxmFf1lg1P8zE.jpg,2013-10-14,"[Action & Adventure, Comedy, Family, Sci-Fi & ...",58981,The Thundermans,['US'],en,The Thundermans,"Meet The Thundermans, a typical suburban famil...",53.042,/mjbjpF2vIfunNd0JPE1UCvB7akr.jpg,8.1,446,TV Show


In [26]:
# Align the TV schema with the movie schema. Only two columns differ, so they
# are renamed by name; assigning a full positional list of 14 names would fail
# or mislabel data if the CSV ever had a different column count or order.
tv_df = tv_df.rename(columns={
    'first_air_date': 'release_date',
    'original_name': 'original_title',
})

In [27]:
# Columns kept from both the movie and the TV frame, in output order.
column_needed = [
    'id',
    'original_title',
    'release_date',
    'program_type',
    'genre_ids',
    'overview',
    'popularity',
    'vote_average',
    'vote_count',
]

In [28]:
# Keep only the shared columns, in the same order, for both frames.
movie_df, tv_df = (frame[column_needed] for frame in (movie_df, tv_df))

In [29]:
# Stack movies on top of TV shows; reset_index() moves the old per-frame row
# labels into an 'index' column and renumbers the rows from 0.
dfs = [movie_df, tv_df]
df = pd.concat(dfs).reset_index()

In [30]:
# Discard the 'index' column left behind by reset_index().
df = df.drop(columns='index')

In [31]:
# The column now holds genre names rather than ids, so call it 'genres'.
df = df.rename(columns={'genre_ids': 'genres'})

In [32]:
# Display the combined movie + TV show frame.
df

Unnamed: 0,id,original_title,release_date,program_type,genres,overview,popularity,vote_average,vote_count
0,59962,This Means War,2012-02-14,Movie,"[Comedy, Action, Romance]",Two top CIA operatives wage an epic battle aga...,18.739,6.1,2854
1,73937,The Big Year,2011-10-13,Movie,[Comedy],Three fanatical bird-watchers spend an entire ...,11.271,6.0,612
2,387889,Buddymoon,2016-07-01,Movie,[Comedy],When David is left by his fiancé just days bef...,3.944,6.1,28
3,200,Star Trek: Insurrection,1998-12-11,Movie,"[Action, Adventure, Science Fiction, Thriller]",When an alien race and factions within Starfle...,15.670,6.4,898
4,9991,Mean Machine,2001-12-26,Movie,"[Comedy, Drama]","Disgraced ex-England football captain, Danny '...",12.028,6.2,445
...,...,...,...,...,...,...,...,...,...
1459,46860,Mike the Knight,2011-10-31,TV Show,"[Action & Adventure, Animation]",Mike the Knight is a Canadian/British animated...,8.130,6.5,4
1460,7869,The Penguins of Madagascar,2008-11-28,TV Show,"[Animation, Comedy]",The Penguins of Madagascar is an American CGI ...,79.081,7.2,153
1461,61550,Marvel's Agent Carter,2015-01-06,TV Show,"[Drama, Sci-Fi & Fantasy]","It's 1946, and peace has dealt Peggy Carter a ...",60.535,7.6,1329
1462,61986,Bloodline,2015-03-20,TV Show,[Drama],A dramatic thriller that explores the demons l...,22.422,7.3,230


In [33]:
# Keep only the first four characters of each date's text form (the year)
df['release_date'] = df['release_date'].map(lambda stamp: str(stamp)[:4])

In [41]:
# Titles for which TMDB details were found; used to filter the STC log
items = df['original_title'].tolist()

In [42]:
# Split the raw log by whether the title has TMDB details; the membership mask
# is computed once and reused for both halves.
has_details = stc['original_name'].isin(items)
stc1 = stc[has_details]
stc_working = stc[~has_details]

In [45]:
# Row counts: full log, rows with TMDB details, rows without.
print(
    "Total records in STC JAWWY {}".format(stc.shape[0]),
    "Total records in after getting full items info  {}".format(stc1.shape[0]),
    "Total records that we did not get their record  {}".format(stc_working.shape[0]),
    sep="\n",
)

Total records in STC JAWWY 3598607
Total records in after getting full items info  974007
Total records that we did not get their record  2624600


### Sadly, we lose a lot of data by keeping only the records for which we have full movie and TV show information. We still have almost 1 million records, which cover a large share of content from different items.

# content-based recommendation system