-
Notifications
You must be signed in to change notification settings - Fork 7
/
movie_people.sql
116 lines (105 loc) · 3.04 KB
/
movie_people.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
with movies_src as (
select
imdb_id,
release_year,
director,
writer,
actors
from
{{ref('all_movies_combined_columns')}} -- could also use all_movies_combined_columns but don't need to combine columns for director, writer, actors
),
all_movies_and_people as (
SELECT
imdb_id,
release_year,
trim(dir.value) AS person_name,
'DIRECTOR' AS person_role
FROM movies_src,
LATERAL SPLIT_TO_TABLE(director, ',') AS dir
UNION
SELECT
imdb_id,
release_year,
trim(wri.value) AS person_name,
'WRITER' AS person_role
FROM movies_src,
LATERAL SPLIT_TO_TABLE(writer, ',') AS wri
UNION
SELECT
imdb_id,
release_year,
trim(act.value) AS person_name,
'ACTOR' AS person_role
FROM movies_src,
LATERAL SPLIT_TO_TABLE(actors, ',') AS act
),
deduped as (
SELECT
imdb_id,
release_year,
person_name,
person_role,
FROM
all_movies_and_people
QUALIFY ROW_NUMBER() OVER (
PARTITION BY imdb_id, release_year, person_name, person_role
ORDER BY imdb_id -- this is arbitrary
) = 1
),
-- In some cases there are details about the role after the person's name, in parentheses
-- Eg. A director may appear as "Joe Bloggs (co-director)"
-- Extract this information and insert into PERSON_ROLE_DETAILS field
role_details as (
SELECT
*,
case
when person_role = 'DIRECTOR' OR person_role = 'WRITER'
then REGEXP_REPLACE(REGEXP_SUBSTR(PERSON_NAME, '\\(\.\*\\)$'), '\\(|\\)', '')
else
NULL
end as person_role_details
FROM
deduped
),
-- Update the PERSON_NAME value to remove any extra information about the role in parentheses
-- Will use PERSON_NAME to generate a unique identifier so only want each person appearing once.
-- "Joe Bloggs" is the same person as "Joe Bloggs (co-director)"
cleaned as (
select
imdb_id,
release_year,
case
when person_role_details is not null
then RTRIM(REGEXP_REPLACE(REPLACE(person_name, person_role_details, ''), '\\(|\\)', ''))
else
person_name
end as person_name,
person_role,
UPPER(person_role_details) as person_role_details
from
role_details
where
person_name != 'N/A'
),
-- Some people names appear in the movie dataset in a format that does not match names in wikidata.
-- Update these names to wikidata format to help with matching later on
names_fixed as (
select
imdb_id,
release_year,
IFNULL(names.to_name, person_name) as person_name,
person_role,
person_role_details
from
cleaned
left join {{ref('person_name_fixes')}} as names
on names.from_name = cleaned.person_name
)
select
imdb_id,
release_year,
person_name,
person_role,
person_role_details
from
names_fixed