In [7]:
from collections import defaultdict

# Parse 'tmdb_not_found.csv' (header row + comma-separated rows) into:
#   * d               - column-oriented table with keys movieId / tmdbId / type
#   * cant_remap_tmdb - first field of every row that has only one field
# Row shapes handled after the header line:
#   "<movieId>"                   -> appended to cant_remap_tmdb
#   "<movieId>,<tmdbId>"          -> type 'movie'
#   "<movieId>,<tmdbId>,<extra>"  -> any third field marks the row as type 'tv'
d = defaultdict(list)
cant_remap_tmdb = []
with open('tmdb_not_found.csv') as f:
    next(f, None)  # skip the header row; tolerates a completely empty file
    for raw_line in f:
        raw_line = raw_line.strip()
        if not raw_line:
            # A blank line would otherwise split to [''] and be recorded in
            # cant_remap_tmdb as an empty movieId.
            continue
        fields = raw_line.split(',')
        if len(fields) == 1:
            cant_remap_tmdb.append(fields[0])
            continue
        d['movieId'].append(fields[0])
        d['tmdbId'].append(fields[1])
        d['type'].append('tv' if len(fields) > 2 else 'movie')

In [8]:
import pandas as pd

# Turn the column-oriented remap table `d` (movieId / tmdbId / type lists)
# into a DataFrame and persist it without the pandas row index.
df = pd.DataFrame(data=d)
df.to_csv(path_or_buf='tmdb_remap.csv', index=False)

In [14]:
import json

# Persist the ids collected in `cant_remap_tmdb` as a one-column CSV with a
# 'movieId' header.
with open('cant_remap_tmdb.csv', 'w') as f:
    f.write('movieId\n')
    # Use a real loop-variable name: at notebook top level, assigning to `_`
    # shadows IPython's "last output" variable for the rest of the session.
    for movie_id in cant_remap_tmdb:
        f.write(movie_id + '\n')

## Get all sampled items

In [17]:
def read_jsonl(fpath: str):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        fpath: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    res = []
    with open(fpath, 'r', encoding="utf-8") as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. a stray empty line at the
            # end of the file) are not records; json.loads would raise on them.
            if not line.strip():
                continue
            res.append(json.loads(line))
    return res

In [18]:
# Load every record of tmdb.jsonl (one JSON document per line) into memory.
tmdb = read_jsonl(fpath="tmdb.jsonl")

In [44]:
# (output key, property_dict key) pairs copied verbatim when present.
_TMDB_PROPERTY_FIELDS = (
    ('language', 'Original Language'),
    ('status', 'Status'),
    ('budget', 'Budget'),
    ('revenue', 'Revenue'),
)


def _parse_tmdb_runtime(text):
    """Convert a runtime string to whole minutes.

    Accepted shapes are '<M>m', '<H>h' and '<H>h <M>m'. Any other shape, or a
    non-numeric hour/minute part, yields 0 (the value the summary cell uses to
    skip the duration sentence).
    """
    parts = text.split(' ')
    try:
        if len(parts) == 1:
            token = parts[0]
            if token.endswith('m'):
                return int(token.strip('m'))
            if token.endswith('h'):
                return 60 * int(token.strip('h'))
        elif len(parts) == 2 and parts[0].endswith('h') and parts[1].endswith('m'):
            return 60 * int(parts[0].strip('h')) + int(parts[1].strip('m'))
    except ValueError:
        # Non-numeric hour/minute part: treat as unknown.
        pass
    return 0


def _tmdb_record_to_item(record):
    """Map one raw record from `tmdb` to the flat item dict written later.

    Keys are inserted in a fixed order so the serialised JSON stays stable.
    """
    item = {'movieId': record['movieID']}
    if 'tmdID' in record:
        item['tmdbId'] = record['tmdID']
    item['url'] = record['url']
    item['title'] = record['movie_title']

    # De-duplicate while keeping first-seen order and drop empty strings.
    # (The previous `set(...) - set("")` subtracted an empty set, so "" was
    # never removed, and the resulting order depended on string hashing.)
    keywords = [kw for kw in dict.fromkeys(record['keyword_list']) if kw != ""]
    item['keywords'] = keywords
    # NOTE(review): genres is filled from 'keyword_list', exactly like
    # keywords. This looks like a copy-paste slip, but the record's genre key
    # is not visible here - confirm against the scraper output.
    item['genres'] = list(keywords)
    item['overview'] = record['overview']

    # The guard used to test the misspelt key 'averge_rating', so the rating
    # was never copied. Unparseable ratings are skipped rather than aborting
    # the whole conversion.
    if 'average_rating' in record:
        try:
            item['average_rating'] = float(record['average_rating'])
        except (TypeError, ValueError):
            pass

    item['duration'] = _parse_tmdb_runtime(record['runtime'])
    item['release_year'] = record['release_year']

    # people_dict keys may hold several ', '-separated labels; every
    # lower-cased label becomes an item key that collects the matching values.
    # presumably labels are crew roles and values are person names - TODO confirm
    grouped = defaultdict(list)
    for labels, value in record['people_dict'].items():
        for label in labels.split(', '):
            grouped[label.lower()].append(value)
    item.update(grouped)

    props = record.get('property_dict')
    if isinstance(props, dict):
        for target_key, source_key in _TMDB_PROPERTY_FIELDS:
            if source_key in props:
                item[target_key] = props[source_key]
    return item


# A comprehension keeps the loop variable local, so the module-level `d`
# (the remap table consumed by the tmdb_remap.csv cell) is no longer clobbered.
all_tmdb_items = [_tmdb_record_to_item(record) for record in tmdb]
  

In [45]:
movielens = read_jsonl("movielens.jsonl")

_MINUTES_SUFFIX = ' minutes'
# Bare numeric runtimes are only accepted above this value and are then
# recorded as "unknown" (0).
# presumably these are mis-scraped values such as years - TODO confirm
_BARE_RUNTIME_MIN = 1000


def _parse_movielens_runtime(text):
    """Convert a runtime string to whole minutes.

    '<N> minutes' -> N; '' -> 0; a bare integer above _BARE_RUNTIME_MIN -> 0.

    Raises:
        ValueError: For any other value. This replaces the former `assert`,
            which disappears under `python -O`.
    """
    if text.endswith(_MINUTES_SUFFIX):
        # Slice the suffix off; str.strip(' minutes') strips a *character set*
        # from both ends and only worked because digits are not in that set.
        return int(text[:-len(_MINUTES_SUFFIX)])
    if text == '':
        return 0
    if int(text) <= _BARE_RUNTIME_MIN:
        raise ValueError(f"unexpected runtime value: {text!r}")
    return 0


def _movielens_record_to_item(record):
    """Map one raw record from `movielens` to the flat item dict written later.

    Keys are inserted in a fixed order so the serialised JSON stays stable.
    """
    item = {'movieId': record['movieID']}
    if 'tmdID' in record:
        item['tmdbId'] = record['tmdID']
    item['url'] = record['url']
    item['overview'] = record['overview']
    item['title'] = record['movie_title']

    # De-duplicate while keeping first-seen order and drop empty strings.
    # (The previous `set(...) - set("")` subtracted an empty set, so "" was
    # never removed, and the resulting order depended on string hashing.)
    keywords = [kw for kw in dict.fromkeys(record['keyword_list']) if kw != ""]
    item['keywords'] = keywords
    # NOTE(review): genres is filled from 'keyword_list', exactly like
    # keywords. This looks like a copy-paste slip, but the record's genre key
    # is not visible here - confirm against the scraper output.
    item['genres'] = list(keywords)

    # The guard used to test the misspelt key 'averge_rating', so the rating
    # was never copied. Unparseable ratings are skipped rather than aborting
    # the whole conversion.
    if 'average_rating' in record:
        try:
            item['average_rating'] = float(record['average_rating'])
        except (TypeError, ValueError):
            pass

    item['duration'] = _parse_movielens_runtime(record['runtime'])
    item['release_year'] = record['release_year']

    # Keep the director list only when it has a non-empty first entry; an
    # empty list used to raise IndexError on the `[0]` lookup.
    directors = record['people_dict'].get('Director')
    if directors and directors[0] != "":
        item['director'] = directors

    props = record.get('property_dict')
    if isinstance(props, dict):
        language = props.get('original_language', 'No Language')
        if language != 'No Language':
            item['language'] = language
    return item


# A comprehension keeps the loop variable local, so the module-level `d`
# (the remap table consumed by the tmdb_remap.csv cell) is no longer clobbered.
all_movielens_items = [_movielens_record_to_item(record) for record in movielens]
  

In [None]:
# Write every converted item (tmdb first, then movielens) as one JSON document
# per line. The encoding is stated explicitly to match read_jsonl, which
# reads with utf-8.
with open('sampled_items.jsonl', 'w', encoding='utf-8') as f:
    # Use a real loop-variable name: at notebook top level, assigning to `_`
    # shadows IPython's "last output" variable for the rest of the session.
    for sampled_item in all_tmdb_items + all_movielens_items:
        json.dump(sampled_item, f)
        f.write('\n')

## Generate a text summary for each sampled item

In [1]:
def read_jsonl(fpath: str):
    """Read a JSON Lines file and return its records as a list.

    Args:
        fpath: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(fpath, 'r', encoding="utf-8") as f:
        # Whitespace-only lines carry no record and would make json.loads
        # raise, so they are filtered out before decoding.
        return [json.loads(line) for line in f if line.strip()]

In [4]:
import json
# Reload the items written to sampled_items.jsonl (one JSON document per line).
items = read_jsonl(fpath='sampled_items.jsonl')

In [6]:
def _summarize_item(item):
    """Build the one-paragraph text summary for a single sampled item.

    Sentences are emitted in this order, each only when its data is present:
    topics (needs a non-empty title), release year (always), duration
    (non-zero), director clause, overview clause.
    """
    parts = []

    # Merge keywords and genres, dropping empty strings and repeats so that
    # the sentence never reads "about , , x, x".
    topics = [t for t in dict.fromkeys(item['keywords'] + item['genres']) if t != '']
    if item['title'] != '' and topics:
        parts.append(f"{item['title']} is a movie about {', '.join(topics)}. ")

    parts.append(f"It's released in {item['release_year']}. ")

    if item['duration'] != 0:
        parts.append(f"It's {item['duration']} minutes long. ")

    director_clause = ''
    if 'director' in item:
        director_clause = f"Directed by {' '.join(item['director'])}"

    # The former check `'overview' != ''` compared two string literals and was
    # always true, so an empty overview produced a dangling "... story about ".
    overview = item['overview']
    if overview:
        if director_clause:
            parts.append(director_clause + ", ")
        # NOTE(review): wording kept byte-for-byte in case the generated text
        # is already consumed elsewhere; "it tell" is ungrammatical - confirm
        # before changing.
        parts.append(f"it tell a story about {overview}")
    elif director_clause:
        # No overview to attach the clause to: close it as its own sentence.
        parts.append(director_clause + ".")

    return ''.join(parts)


# movieId -> summary text. The comprehension keeps its loop variable local
# instead of rebinding `_`, which IPython uses for the last cell output.
summaries = {item['movieId']: _summarize_item(item) for item in items}


In [8]:
# Serialise the whole movieId -> summary mapping as a single JSON object.
with open('sampled_summaries.json', 'w') as out_file:
    out_file.write(json.dumps(summaries))