In [3]:
import openai
from pathlib import Path
from string import Template
from random import shuffle
import json
import pandas as pd
from tqdm.auto import tqdm

client = openai.Client()

## Tags Movies (V1 style)

This initial template may be refined, to have more of a controlled vocabulary, after analysis. For now, a lot of the ball is in the interpretation by GPT.

In [55]:
prompt_template = Template(Path('tags-v1-template.txt').read_text().strip())
movies = Path("data/movie-list.csv").read_text().split("\n")

shuffle(movies)
print(movies[0])

Get Out (2017)


In [5]:
def tag_movie(name):
    prompt = prompt_template.substitute(movie_name=name)
    msgs = [
        {'role':'system', 'content': 'You are a cataloguer at a movie streaming service. You are tasked with adding tags to movies to improve the recommendation system. You are given a movie title and must add tags to the movie.'},
        {'role':'user', 'content': prompt}
    ]
    response = client.chat.completions.create(
        model='gpt-3.5-turbo-1106',
        response_format={ "type": "json_object" },
        messages=msgs,
        )
    completion = response.choices[0].message.content
    tags_dict = json.loads(completion)
    return tags_dict

In [56]:
outdir = Path('data/v1-tags/')
# write
pbar = tqdm(movies)
for movie in pbar:
    pbar.set_description(movie)
    outpath = outdir / f'{movie}.json'
    if outpath.exists():
        continue
    tags = tag_movie(movie)
    outpath.write_text(json.dumps(tags, indent=2))

  0%|          | 0/1066 [00:00<?, ?it/s]

### Flatten tags

Make tags into single row == tag.

['movie_id', 'title', 'year', 'category', 'value']

In [105]:
def flatten_tags(tags):
    '''
    Flatten the tag dictionary into a list of category: value pairs
    '''
    flat_tags = []
    for category, values in tags.items():
        if category == 'relatedFilms':
            for film in values:
                try:
                    flat_tags.append((f"related ({film['relationship']})", f"{film['name']} ({film['year']})"))
                except:
                    print("Related film error: the format wasn't correct. Skipping.")
                    print("Recommendation: fix or delete the file for ", tags['title'])
        elif type(values) == list:
            for value in values:
                flat_tags.append((category, value))
        elif type(values) == int:
            flat_tags.append((category, values))
        elif type(values) == str:
            flat_tags.append((category, values))
    return flat_tags

all_flat_tags = []
for movie in tqdm(outdir.glob('*.json')):
    tags = json.loads(movie.read_text())
    flat_tags = flatten_tags(tags)
    title = movie.stem[:-7]
    year = movie.stem[-5:-1]
    flat_tags = [(movie.stem, title, year, category, value) for category, value in flat_tags]
    all_flat_tags.extend(flat_tags)

df = pd.DataFrame(all_flat_tags, columns=['movie_id', 'title', 'year', 'category', 'value'])
df.to_csv('data/v1-tags-flat.csv', index=False)

0it [00:00, ?it/s]

In [130]:
just_titles = df.movie_id.drop_duplicates().sort_values().to_list()
Path('../../src/lib/movietags/movies.json').write_text(json.dumps(just_titles))

29941

Bad pipe message: %s [b'\xbe.\xcc\xd8\x1c\x1dUS;\xe6y\x13\xab\x8e\x07\xcf\x01# ?\xc6\xdd\xa3\xf3\xf6\x8ag\xe2s\x94\x08\x87")>h^Y\xb6a\x8c5\x19\xb9\x0e7\xb9My\xbe7\x00 \xba\xba\x13\x01\x13\x02\x13\x03\xc0+\xc0/\xc0,\xc00\xcc\xa9\xcc\xa8\xc0\x13\xc0\x14\x00\x9c\x00\x9d\x00/\x005\x01\x00\x01\xcd\x9a\x9a\x00\x00\x003\x00+\x00)\xfa\xfa\x00\x01\x00\x00\x1d\x00 \xa6\xad\xd6\xa6[\xc8yl\x85\x92\xa18\x00{\xc6\xeb\xca$o\xdaOi\xec\xa3]\xf4T\xd5\x7f\xd7\xf1\x1e\x00#\x00\x00\xff\x01\x00\x01\x00\x00\x17\x00\x00\x00-\x00\x02\x01\x01']
Bad pipe message: %s [b'J\xb4\xab4sP\xbd\x899\x06\x9e\xc9\xa5\x11\x0e\xe8\x10\xf3 \xb69\x9c\x82$\x85!f\x08\xae\x86P#\x82@\x19\x89\x95^6\x8f\x8c*z\xb1z\xec\xa3\x89\x10\xefO\x00 z']


## Analyze Tags

In [106]:
# count all related ({film['relationship']}) relationships
df = pd.read_csv('data/v1-tags-flat.csv')
df[df.category.str.startswith('related')].category.value_counts()

category
related (sequel)                             521
related (prequel)                            254
related (spin-off)                            20
related (original)                            17
related (related)                             10
related (remake)                               9
related (crossover)                            5
related (reboot)                               4
related (spiritual successor)                  3
related (preceding film)                       3
related (similar theme)                        3
related (live-action remake)                   2
related (Crossover)                            2
related (companion film)                       2
related (shared universe)                      2
related (similar style)                        2
related (saga)                                 2
related (Other Film by Quentin Tarantino)      2
related (Sequel)                               1
related (alternate adaptation)                 1
related (sp

In [117]:
df[df.category == 'plotTags'].value.value_counts().head(20)

value
Rescue Mission             39
Friendship                 34
Talking Animals            27
Epic Battle                26
Betrayal                   26
Quest                      26
Adventure                  26
Superhero                  24
Redemption                 23
Mystery                    22
Violence                   18
Underdog Story             18
Rebellion                  18
Coming of Age              18
Forbidden Love             17
Survival                   17
Journey                    16
Musical                    15
Father-Son Relationship    15
Rivalry                    14
Name: count, dtype: int64

In [123]:
shortlist = df.copy()
for plotTag in ["Talking Animals", "Alternate Reality"]:
    filter_ids = shortlist[shortlist.value == plotTag].movie_id.unique()
    shortlist = shortlist[shortlist.movie_id.isin(filter_ids)]
display(shortlist.query('category == "plotTags"').drop_duplicates(subset='movie_id').value.value_counts().head(20))
shortlist.drop_duplicates(subset='movie_id')

value
Alternate Reality    1
Name: count, dtype: int64

Unnamed: 0,movie_id,title,year,category,value
24535,Alice in Wonderland (2010),Alice in Wonderland,2010,genres,Adventure


In [126]:
shortlist = df.copy()
for plotTags in ['Talking Animals', 'Musical']:
    filter_ids = shortlist[shortlist.value == plotTag].movie_id
    shortlist = shortlist[shortlist.movie_id.isin(filter_ids)]
shortlist = shortlist.drop_duplicates(subset='movie_id')
shortlist

Unnamed: 0,movie_id,title,year,category,value
4115,Watchmen (2009),Watchmen,2009,genres,Action
4146,Donnie Darko (2001),Donnie Darko,2001,genres,Drama
10559,Shrek Forever After (2010),Shrek Forever After,2010,genres,Animation
12524,Inception (2010),Inception,2010,genres,Action
17360,Sucker Punch (2011),Sucker Punch,2011,genres,Action
21983,Tomorrowland (2015),Tomorrowland,2015,genres,Science Fiction
24535,Alice in Wonderland (2010),Alice in Wonderland,2010,genres,Adventure
25341,The Matrix Resurrections (2021),The Matrix Resurrections,2021,genres,Science Fiction


In [124]:
df[df.category == 'genericHintTags'].value.value_counts().head(20)

value
Cult Classic             152
Heartwarming             130
Feel-Good                122
Suspenseful              116
Action-Packed             90
Blockbuster               79
Character-Driven          76
Classic                   76
Intense                   70
Emotional                 63
Sequel                    58
Epic                      58
Family-Friendly           57
Humorous                  47
Violent                   37
Iconic Characters         36
Musical                   33
Special Effects           33
Adaptation                32
Comic Book Adaptation     32
Name: count, dtype: int64

In [125]:
df[df.category == 'genericHintTags'].value.value_counts().head(40)

value
Cult Classic                 152
Heartwarming                 130
Feel-Good                    122
Suspenseful                  116
Action-Packed                 90
Blockbuster                   79
Character-Driven              76
Classic                       76
Intense                       70
Emotional                     63
Sequel                        58
Epic                          58
Family-Friendly               57
Humorous                      47
Violent                       37
Iconic Characters             36
Musical                       33
Special Effects               33
Adaptation                    32
Comic Book Adaptation         32
Ensemble Cast                 30
Historical                    29
Visually Stunning             29
High-Octane                   28
Iconic Soundtrack             27
Animated                      27
Inspiring                     25
Satirical                     25
Oscar Winner                  24
Inspirational                 24
Thri

In [12]:
# value counts on genres
df[df.category == 'genres'].value.value_counts()

value
Action             74
Adventure          73
Comedy             54
Drama              53
Fantasy            43
Thriller           32
Animation          30
Horror             23
Romance            20
Science Fiction    18
Family             18
Crime              16
Sci-Fi             13
Biography          11
Musical            11
Mystery             8
History             6
War                 5
Western             5
Sport               4
Sports              2
Music               2
Suspense            1
Neo-noir            1
Film Noir           1
Superhero           1
Dystopian           1
Slasher             1
Political           1
Romantic Comedy     1
Spy                 1
Zombie              1
Name: count, dtype: int64

In [13]:
df[df.category == 'productionCountries'].value.value_counts()

value
United States           178
United Kingdom           30
Canada                   13
Australia                12
Germany                  11
New Zealand               8
Japan                     4
France                    4
Brazil                    3
China                     3
Taiwan                    2
Denmark                   1
Hong Kong                 1
Serbia                    1
Czech Republic            1
United Arab Emirates      1
South Africa              1
Mexico                    1
UK                        1
Norway                    1
Sweden                    1
Italy                     1
Name: count, dtype: int64

In [14]:
df[df.category == 'peopleInvolved'].value.value_counts()

value
Tom Cruise          6
Brad Pitt           5
Morgan Freeman      5
Steven Spielberg    4
Nicole Kidman       4
                   ..
Chevy Chase         1
Lily Tomlin         1
Ned Beatty          1
Keith Carradine     1
Rachel Weisz        1
Name: count, Length: 633, dtype: int64

## Example format

In [39]:
movie = df.sample(1).iloc[0]['movie_id']
print(movie)

Elysium (2013)


Things to tag:
- make related movies into sensible types (e.g. `has 'x' sequels`, `is a prequel`, `reboot`)
- have a sorting order. This will be a big consideration, don't get too bogged down early in dev
- focus just on high popularity for now

In [54]:
tags_df = df[df.movie_id == movie].sample(frac=1)
tags_df = tags_df[tags_df.category.isin(['title', 'popularity'])]
[(x.category, x.value) for x in tags_df.itertuples()]

[('genres', 'Science Fiction'),
 ('timePeriods', '2154'),
 ('peopleInvolved', 'Matt Damon'),
 ('genericHintTags', 'Sci-Fi Action'),
 ('popularity', '3'),
 ('settings', 'Space'),
 ('themes', 'Class Warfare'),
 ('plotTags', 'Rebel Hero'),
 ('settings', 'Futuristic Earth'),
 ('settings', 'Orbital Space Station'),
 ('plotTags', 'Cybernetic Enhancements'),
 ('releaseYear', '2013'),
 ('genres', 'Dystopian'),
 ('productionCountries', 'United States'),
 ('filmDecade', '2010s'),
 ('themes', 'Technological Advancement'),
 ('plotTags', 'Class Disparity'),
 ('genericHintTags', 'Visually Stunning'),
 ('genres', 'Action'),
 ('peopleInvolved', 'Neill Blomkamp'),
 ('peopleInvolved', 'Jodie Foster'),
 ('genericHintTags', 'Social Commentary'),
 ('genericHintTags', 'Futuristic Technology'),
 ('peopleInvolved', 'Sharlto Copley'),
 ('notableAwards', 'Saturn Award for Best Visual Effects (won)'),
 ('themes', 'Social Injustice'),
 ('plotTags', 'Political Corruption'),
 ('themes', 'Healthcare Inequality')]