## Data Enrichment with AI

### Setup

##### Create a dataset for storing the AI tables and another for storing the AI models

In [None]:
%%bigquery
-- Dataset that stores the AI prediction tables created later in this notebook.
-- IF NOT EXISTS keeps this setup cell re-runnable once the dataset is in place.
CREATE SCHEMA IF NOT EXISTS `daniel-daisy-cs329e`.country_stg_ai

Query is running:   0%|          |

In [None]:
%%bigquery
-- Dataset that stores the remote model definitions (e.g. remote_models.gemini_pro).
-- IF NOT EXISTS keeps this setup cell re-runnable once the dataset is in place.
CREATE SCHEMA IF NOT EXISTS `daniel-daisy-cs329e`.remote_models

Query is running:   0%|          |

##### Before running this cell, create the remote connection and assign the IAM role `Vertex AI User` to the service account associated with the connection.

In [None]:
%%bigquery
-- Define (or overwrite) the remote model used by the ML.generate_text calls below.
-- It reaches the 'gemini-pro' endpoint through the vertex_connection connection.
create or replace model remote_models.gemini_pro
remote with connection `projects/daniel-daisy-cs329e/locations/us/connections/vertex_connection`
options (
  endpoint = 'gemini-pro'
);

Query is running:   0%|          |

### Scenario: Predict the nationality of each movie's director

In [None]:
%%bigquery
-- Peek at five Film rows, leaving out the data_source and load_time columns.
select
  film.* except (data_source, load_time)
from country_stg.Film as film
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,rank,name,director,d_nationality
0,213,Ratatouille,Brad Bird,
1,245,The Iron Giant,Brad Bird,
2,227,The Incredibles,Brad Bird,
3,170,Fargo,Joel Coen,
4,206,The Big Lebowski,Joel Coen,


#### Test the generate_text function

In [None]:
%%bigquery
-- Trial run: send one prompt per film (first 10 films by name) to the remote model.
-- Each prompt is the instruction text followed directly by the film's name and
-- director serialized as JSON.
declare prompt_query string
  default "Suggest a nationality for each director based on the origin of their last name. Return the output as json, include the director in the output";

select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select
      prompt_query || to_json_string(json_object("name", name, "director", director)) as prompt
    from country_stg.Film
    order by name
    limit 10
  ),
  struct(
    true as flatten_json_output
  )
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n{\n ""director"": ""Sidney Lumet"",\n ""...",,,Suggest a nationality for each director based ...
1,"```json\n{\n ""director"": ""Steve McQueen"",\n ...",,,Suggest a nationality for each director based ...
2,"```json\n{\n ""director"": ""Sam Mendes"",\n ""na...",,,Suggest a nationality for each director based ...
3,"```json\n{\n ""director"": ""Stanley Kubrick"",\n...",,,Suggest a nationality for each director based ...
4,"```json\n{\n ""director"": ""Rajkumar Hirani"",\n...",,,Suggest a nationality for each director based ...
5,"```json\n[\n {\n ""director"": ""Ron Howard"",...",,,Suggest a nationality for each director based ...
6,"```json\n{\n ""director"": ""Stanley Kubrick"",\n...",,,Suggest a nationality for each director based ...
7,"```json\n[\n {\n ""director"": ""Asghar Farha...",,,Suggest a nationality for each director based ...
8,"```json\n{\n ""director"": ""Ron Clements"",\n ""...",,,Suggest a nationality for each director based ...
9,"```json\n{\n ""director"": ""Ridley Scott"",\n ""...",,,Suggest a nationality for each director based ...


#### Tweak the prompt and save the output
##### [More details](https://cloud.google.com/bigquery/docs/generate-text#generate_text_from_text_data_by_using_a_prompt_from_a_query) on `ML.generate_text` parameters

In [None]:
%%bigquery
-- Save the model's raw answers for a 10-film sample (first 10 films by name),
-- using the refined prompt and explicit generation settings.
declare prompt_query string
  default "For the given movie and its director, suggest a nationality based on the origin of the director's last name. Return the output as a JSON object, including only the movie's name, the director's name, and the nationality. If the nationality is indiscernable, write the nationality as 'Unknown'.";

create or replace table country_stg_ai.nationality_predictions_raw_10 as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select
      prompt_query || to_json_string(json_object("name", name, "director", director)) as prompt
    from country_stg.Film
    order by name
    limit 10
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

In [None]:
%%bigquery
-- Show each saved model answer next to the prompt that produced it.
select
  pred.ml_generate_text_llm_result,
  pred.prompt
from country_stg_ai.nationality_predictions_raw_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"{""name"": ""Alien"", ""director"": ""Ridley Scott"", ...","For the given movie and its director, suggest ..."
1,"```json\n{\n ""name"": ""2001: A Space Odyssey"",...","For the given movie and its director, suggest ..."
2,"{""name"": ""12 Years a Slave"", ""director"": ""Stev...","For the given movie and its director, suggest ..."
3,,"For the given movie and its director, suggest ..."
4,"```json\n{\n ""name"": ""A Separation"",\n ""dire...","For the given movie and its director, suggest ..."
5,"```json\n{\n ""name"": ""Aladdin"",\n ""director""...","For the given movie and its director, suggest ..."
6,"```json\n{\n ""name"": ""12 Angry Men"",\n ""dire...","For the given movie and its director, suggest ..."
7,"```json\n{\n ""name"": ""A Beautiful Mind"",\n ""...","For the given movie and its director, suggest ..."
8,"```json\n{\n ""name"": ""1917"",\n ""director"": ""...","For the given movie and its director, suggest ..."
9,"```json\n{\n ""name"": ""A Clockwork Orange"",\n ...","For the given movie and its director, suggest ..."




```
# This is formatted as code
```

#### Format the output to proper json

In [None]:
%%bigquery
-- Compare the raw answer with a cleaned version: the ```json / ``` fences and the
-- newlines are removed (in that order), then surrounding whitespace is trimmed.
select
  pred.ml_generate_text_llm_result,
  trim(
    replace(
      replace(
        replace(pred.ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as formated_result
from country_stg_ai.nationality_predictions_raw_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"{""name"": ""Alien"", ""director"": ""Ridley Scott"", ...","{""name"": ""Alien"", ""director"": ""Ridley Scott"", ..."
1,"```json\n{\n ""name"": ""2001: A Space Odyssey"",...","{ ""name"": ""2001: A Space Odyssey"", ""director..."
2,"{""name"": ""12 Years a Slave"", ""director"": ""Stev...","{""name"": ""12 Years a Slave"", ""director"": ""Stev..."
3,,
4,"```json\n{\n ""name"": ""A Separation"",\n ""dire...","{ ""name"": ""A Separation"", ""director"": ""Asgha..."
5,"```json\n{\n ""name"": ""Aladdin"",\n ""director""...","{ ""name"": ""Aladdin"", ""director"": ""Ron Clemen..."
6,"```json\n{\n ""name"": ""12 Angry Men"",\n ""dire...","{ ""name"": ""12 Angry Men"", ""director"": ""Sidne..."
7,"```json\n{\n ""name"": ""A Beautiful Mind"",\n ""...","{ ""name"": ""A Beautiful Mind"", ""director"": ""R..."
8,"```json\n{\n ""name"": ""1917"",\n ""director"": ""...","{ ""name"": ""1917"", ""director"": ""Sam Mendes"", ..."
9,"```json\n{\n ""name"": ""A Clockwork Orange"",\n ...","{ ""name"": ""A Clockwork Orange"", ""director"": ..."


In [None]:
%%bigquery
-- Persist the cleaned answers of the 10-film sample: code fences and newlines are
-- stripped and the result is trimmed. The column keeps its original name.
create or replace table country_stg_ai.nationality_predictions_formatted_10 as
  select
    trim(
      replace(
        replace(
          replace(pred.ml_generate_text_llm_result, '```json', ''),
          '```', ''
        ),
        '\n', ''
      )
    ) as ml_generate_text_llm_result
  from country_stg_ai.nationality_predictions_raw_10 as pred

Query is running:   0%|          |

In [None]:
%%bigquery
-- Pull the three fields of interest out of each cleaned JSON answer.
select
  json_value(pred.ml_generate_text_llm_result, '$.name') as name,
  json_value(pred.ml_generate_text_llm_result, '$.director') as director,
  json_value(pred.ml_generate_text_llm_result, '$.nationality') as d_nationality
from country_stg_ai.nationality_predictions_formatted_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,name,director,d_nationality
0,12 Years a Slave,Steve McQueen,Unknown
1,12 Angry Men,Sidney Lumet,American
2,Alien,Ridley Scott,British
3,Aladdin,Ron Clements,American
4,1917,Sam Mendes,British
5,A Separation,Asghar Farhadi,Iranian
6,,,
7,A Clockwork Orange,Stanley Kubrick,American
8,A Beautiful Mind,Ron Howard,American
9,2001: A Space Odyssey,Stanley Kubrick,Unknown


A new "d_nationality" column did not need to be added because it already existed in our staging and consumption layer tables.

#### Update the d_nationality records with the predicted nationality

In [None]:
%%bigquery
-- Copy each predicted nationality onto the Film row with the same movie name.
-- Joining through FROM touches only the Film rows that have a prediction; rows
-- without one keep their current d_nationality. (A correlated scalar subquery
-- combined with `where 1=1` would rewrite every row and null the unmatched ones.)
-- BigQuery raises an error if several prediction rows match the same Film row.
update country_stg.Film f
  set d_nationality = json_value(p.ml_generate_text_llm_result, '$.nationality')
  from country_stg_ai.nationality_predictions_formatted_10 p
  where f.name = json_value(p.ml_generate_text_llm_result, '$.name')

Query is running:   0%|          |

#### Inspect the output

In [None]:
%%bigquery
-- List the Film rows that now carry a nationality (data_source/load_time hidden).
select
  film.* except (data_source, load_time)
from country_stg.Film as film
where film.d_nationality is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,rank,name,director,d_nationality
0,123,1917,Sam Mendes,British
1,51,Alien,Ridley Scott,British
2,114,A Separation,Asghar Farhadi,Iranian
3,179,12 Years a Slave,Steve McQueen,Unknown
4,91,2001: A Space Odyssey,Stanley Kubrick,Unknown
5,143,A Beautiful Mind,Ron Howard,American
6,248,Aladdin,Ron Clements,American
7,5,12 Angry Men,Sidney Lumet,American
8,103,A Clockwork Orange,Stanley Kubrick,American


#### Apply at larger scale

Gemini-pro is slow: at a 1000 QPM limit, it would take about **3.8 hours** to process the 226780 rows in the Snack table (226780/1000 ≈ 227 min).

And the default quota is only 300 QPM in us-central1.

To process larger volumes, we can request a quota increase by following these steps:
- Go to the [Quota page](https://console.cloud.google.com/iam-admin/quotas)
- Click on "Generate content requests per minute per project per base model per minute per region per base_model" for us-central1 and gemini-pro
- Click the Edit Quotas button
- In the new value field, enter 1000
- In the justification field, enter "To process a 226780 row table"
- Click Next until you get to the last page
- Click Submit
- Wait for a few minutes; you should get an email once it's been approved. It usually takes ~5 minutes.

The country_stg.Film table has only 250 rows, so it's not necessary to change the default quota or create a smaller table.

In [None]:
%%bigquery
-- Full run: same prompt and generation settings as the 10-film sample, but with no
-- row limit, so every Film row is sent to the model.
declare prompt_query string
  default "For the given movie and its director, suggest a nationality based on the origin of the director's last name. Return the output as a JSON object, including only the movie's name, the director's name, and the nationality. If the nationality is indiscernable, write the nationality as 'Unknown'.";

create or replace table country_stg_ai.nationality_predictions_raw_full as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select
      prompt_query || to_json_string(json_object("name", name, "director", director)) as prompt
    from country_stg.Film
    order by name
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

In [None]:
%%bigquery
-- Look up the start time, end time and SQL text of one specific job in the US region.
-- NOTE(review): the job id is hard-coded and the recorded run of this cell returned
-- no rows — confirm the id belongs to the job you want to time.
select
  jobs.creation_time,
  jobs.end_time,
  jobs.query
from `region-us`.INFORMATION_SCHEMA.JOBS as jobs
where jobs.job_id = 'a441f6b3-d6b4-4b33-a8f0-30e044e7b326'

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,creation_time,end_time,query


In [None]:
%%bigquery
-- Persist the cleaned answers of the full run: ```json / ``` fences and newlines are
-- removed and the result is trimmed. The column keeps its original name.
create or replace table country_stg_ai.nationality_predictions_formatted_full as
  select
    trim(
      replace(
        replace(
          replace(pred.ml_generate_text_llm_result, '```json', ''),
          '```', ''
        ),
        '\n', ''
      )
    ) as ml_generate_text_llm_result
  from country_stg_ai.nationality_predictions_raw_full as pred;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Number of prediction rows produced by the full run.
select
  count(*) as nationality_count
from country_stg_ai.nationality_predictions_formatted_full as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,nationality_count
0,250


In [None]:
%%bigquery
-- Write the predicted nationality onto every Film row whose name matches the
-- name reported in a prediction; Film rows without a match are left alone.
update country_stg.Film as film
set
  d_nationality = json_value(pred.ml_generate_text_llm_result, '$.nationality')
from country_stg_ai.nationality_predictions_formatted_full as pred
where json_value(pred.ml_generate_text_llm_result, '$.name') = film.name;

Query is running:   0%|          |

In [None]:
%%bigquery
-- How many Film rows have a nationality filled in.
select
  countif(film.d_nationality is not null) as nationality_count
from country_stg.Film as film

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,nationality_count
0,244


In [None]:
%%bigquery
-- Films per (director, predicted nationality) pair, most frequent pairs first.
select
  film.director,
  film.d_nationality,
  count(*) as count
from country_stg.Film as film
where film.d_nationality is not null
group by
  film.director,
  film.d_nationality
order by count(*) desc

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,director,d_nationality,count
0,Christopher Nolan,British,7
1,Martin Scorsese,American,7
2,Akira Kurosawa,Japanese,7
3,Alfred Hitchcock,British,6
4,Stanley Kubrick,Unknown,6
...,...,...,...
150,Michael Curtiz,Hungarian,1
151,George Miller,Australian,1
152,Adam Elliot,Australian,1
153,Damián Szifron,Argentinian,1


In [None]:
%%bigquery
-- Tag every Film row that received a nationality with the 'kaggle_ai' data source.
update country_stg.Film as film
set data_source = 'kaggle_ai'
where film.d_nationality is not null

Query is running:   0%|          |

### Scenario: Detect the livability of cities!

#### Experiment with prompt

In [None]:
%%bigquery
-- Trial run of the livability prompt on the first 10 cities by city_id.
-- Each prompt is the instruction text followed directly by the city's attributes
-- serialized as JSON.
declare prompt_query string
  default "Write a review on how livable this given city is. Some factors to consider could be walkability, safety, nightlife, excursions, etc. Return the output as json, include the city id which is attribute 'city_id' in the output as well";

select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select
      prompt_query || to_json_string(
        json_object(
          "city_id", city_id,
          "name", name,
          "country_code", country_code,
          "district", district,
          "population", population
        )
      ) as prompt
    from country_stg.City
    order by city_id
    limit 10
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n{\n ""city_id"": 1,\n ""country_code"":...",,,Write a review on how livable this given city ...
1,"```json\n{\n ""city_id"": 2,\n ""country_code"":...",,,Write a review on how livable this given city ...
2,"```json\n{\n ""city_id"": 3,\n ""livability"": {...",,,Write a review on how livable this given city ...
3,"```json\n{\n ""city_id"": 4,\n ""livability"": {...",,,Write a review on how livable this given city ...
4,"```json\n{\n ""city_id"": 5,\n ""country_code"":...",,,Write a review on how livable this given city ...
5,"```json\n{\n ""city_id"": 6,\n ""livability"": {...",,,Write a review on how livable this given city ...
6,"```json\n{\n ""city_id"": 7,\n ""livability"": {...",,,Write a review on how livable this given city ...
7,"```json\n{\n ""city_id"": 8,\n ""livability"": {...",,,Write a review on how livable this given city ...
8,"```json\n{\n ""city_id"": 9,\n ""livability"": {...",,,Write a review on how livable this given city ...
9,"```json\n{\n ""city_id"": 10,\n ""livability"": ...",,,Write a review on how livable this given city ...


In [None]:
%%bigquery
-- Save the model's raw livability answers for the first 10 cities by city_id.
declare prompt_query string
  default "Write a review on how livable this given city is. Some factors to consider could be walkability, safety, nightlife, excursions, etc. Return the output as json, include the city id which is attribute 'city_id' in the output as well";

create or replace table country_stg_ai.city_predictions_raw_10 as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select
        prompt_query || to_json_string(
          json_object(
            "city_id", city_id,
            "name", name,
            "country_code", country_code,
            "district", district,
            "population", population
          )
        ) as prompt
      from country_stg.City
      order by city_id
      limit 10
    ),
    struct(
      0 as temperature,
      8192 as max_output_tokens,
      0.0 as top_p,
      1 as top_k,
      true as flatten_json_output
    )
  );

Query is running:   0%|          |

In [None]:
%%bigquery
-- Show each saved livability answer next to the prompt that produced it.
select
  pred.ml_generate_text_llm_result,
  pred.prompt
from country_stg_ai.city_predictions_raw_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{\n ""city_id"": 3,\n ""livability"": {...",Write a review on how livable this given city ...
1,"```json\n{\n ""city_id"": 7,\n ""livability"": {...",Write a review on how livable this given city ...
2,"```json\n{\n ""city_id"": 10,\n ""livability"": ...",Write a review on how livable this given city ...
3,"```json\n{\n ""city_id"": 9,\n ""livability"": {...",Write a review on how livable this given city ...
4,"```json\n{\n ""city_id"": 5,\n ""country_code"":...",Write a review on how livable this given city ...
5,"```json\n{\n ""city_id"": 8,\n ""livability"": {...",Write a review on how livable this given city ...
6,"```json\n{\n ""city_id"": 6,\n ""livability"": {...",Write a review on how livable this given city ...
7,"```json\n{\n ""city_id"": 2,\n ""country_code"":...",Write a review on how livable this given city ...
8,"```json\n{\n ""city_id"": 4,\n ""livability"": {...",Write a review on how livable this given city ...
9,"```json\n{\n ""city_id"": 1,\n ""country_code"":...",Write a review on how livable this given city ...


In [None]:
%%bigquery
-- Persist the cleaned answers of the 10-city sample: ```json / ``` fences and
-- newlines are removed and the result is trimmed.
create or replace table country_stg_ai.city_predictions_formatted_10 as
  select
    trim(
      replace(
        replace(
          replace(pred.ml_generate_text_llm_result, '```json', ''),
          '```', ''
        ),
        '\n', ''
      )
    ) as ml_generate_text_llm_result
  from country_stg_ai.city_predictions_raw_10 as pred

Query is running:   0%|          |

In [None]:
%%bigquery
-- Display the cleaned answers of the 10-city sample.
select
  pred.*
from country_stg_ai.city_predictions_formatted_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,"{ ""city_id"": 3, ""livability"": { ""walkabil..."
1,"{ ""city_id"": 4, ""livability"": { ""walkabil..."
2,"{ ""city_id"": 10, ""livability"": { ""walkabi..."
3,"{ ""city_id"": 6, ""livability"": { ""walkabil..."
4,"{ ""city_id"": 5, ""country_code"": ""NLD"", ""dis..."
5,"{ ""city_id"": 2, ""country_code"": ""AFG"", ""dis..."
6,"{ ""city_id"": 7, ""livability"": { ""walkabil..."
7,"{ ""city_id"": 1, ""country_code"": ""AFG"", ""dis..."
8,"{ ""city_id"": 8, ""livability"": { ""walkabil..."
9,"{ ""city_id"": 9, ""livability"": { ""walkabil..."


In [None]:
%%bigquery
-- Add the column that receives the AI-generated livability JSON.
-- `if not exists` lets the cell be re-run without the "Column already exists" error.
alter table country_stg.City add column if not exists livability string;

Executing query with job ID: 444fab9c-cedb-40c2-8014-da917428ff61
Query executing: 0.33s


ERROR:
 400 Column already exists: livability at [1:41]

Location: US
Job ID: 444fab9c-cedb-40c2-8014-da917428ff61



In [None]:
%%bigquery
-- Copy the `livability` object of each sample prediction onto the City row whose
-- city_id matches the city_id reported in the prediction.
-- Joining through FROM touches only cities that have a prediction; all other rows
-- keep their current livability. (A correlated scalar subquery combined with
-- `where 1=1` would rewrite every row and null the unmatched ones.)
-- safe_cast yields NULL, i.e. no match, when the model returned a non-numeric
-- city_id, so one bad answer does not abort the whole statement.
-- BigQuery raises an error if several prediction rows match the same City row.
update country_stg.City c
  set livability = json_query(p.ml_generate_text_llm_result, '$.livability')
  from country_stg_ai.city_predictions_formatted_10 p
  where c.city_id = safe_cast(json_value(p.ml_generate_text_llm_result, '$.city_id') as int64);

Query is running:   0%|          |

In [None]:
%%bigquery
-- List the City rows that now carry a livability value (data_source/load_time hidden).
select
  city.* except (data_source, load_time)
from country_stg.City as city
where city.livability is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,city_id,name,country_code,district,population,livability
0,4,Mazar-e-Sharif,AFG,Balkh,127800,"{""walkability"":3,""safety"":2,""nightlife"":1,""exc..."
1,3,Herat,AFG,Herat,186800,"{""walkability"":7,""safety"":6,""nightlife"":5,""exc..."
2,1,Kabul,AFG,Kabol,1780000,"{""walkability"":2,""safety"":1,""nightlife"":1,""exc..."
3,2,Qandahar,AFG,Qandahar,237500,"{""walkability"":3,""safety"":2,""nightlife"":1,""exc..."
4,8,Utrecht,NLD,Utrecht,234323,"{""walkability"":8.5,""safety"":8.0,""nightlife"":7...."
5,6,Rotterdam,NLD,Zuid-Holland,593321,"{""walkability"":8,""safety"":7,""nightlife"":9,""exc..."
6,7,Haag,NLD,Zuid-Holland,440900,"{""walkability"":8,""safety"":7,""nightlife"":9,""exc..."
7,9,Eindhoven,NLD,Noord-Brabant,201843,"{""walkability"":7.5,""safety"":8.0,""nightlife"":7...."
8,10,Tilburg,NLD,Noord-Brabant,193238,"{""walkability"":7.5,""safety"":8.0,""nightlife"":7...."
9,5,Amsterdam,NLD,Noord-Holland,731200,"{""walkability"":9,""safety"":8,""nightlife"":9,""exc..."


#### Apply at larger scale

In [None]:
%%bigquery
-- Full run: same livability prompt and generation settings as the 10-city sample,
-- but with no row limit, so every City row is sent to the model.
declare prompt_query string
  default "Write a review on how livable this given city is. Some factors to consider could be walkability, safety, nightlife, excursions, etc. Return the output as json, include the city id which is attribute 'city_id' in the output as well";

create or replace table country_stg_ai.city_predictions_full as
  select *
  from ML.generate_text(
    model remote_models.gemini_pro,
    (
      select
        prompt_query || to_json_string(
          json_object(
            "city_id", city_id,
            "name", name,
            "country_code", country_code,
            "district", district,
            "population", population
          )
        ) as prompt
      from country_stg.City
      order by city_id
    ),
    struct(
      0 as temperature,
      8192 as max_output_tokens,
      0.0 as top_p,
      1 as top_k,
      true as flatten_json_output
    )
  );

Query is running:   0%|          |

In [None]:
%%bigquery
-- Show each full-run livability answer next to the prompt that produced it.
select
  pred.ml_generate_text_llm_result,
  pred.prompt
from country_stg_ai.city_predictions_full as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{\n ""city_id"": 77,\n ""livability"": ...",Write a review on how livable this given city ...
1,"```json\n{\n ""city_id"": 151,\n ""livability"":...",Write a review on how livable this given city ...
2,"```json\n{\n ""city_id"": 171,\n ""country_code...",Write a review on how livable this given city ...
3,"```json\n{\n ""city_id"": 179,\n ""livability"":...",Write a review on how livable this given city ...
4,"```json\n{\n ""city_id"": 206,\n ""country_code...",Write a review on how livable this given city ...
...,...,...
4070,"```json\n{\n ""city_id"": 3837,\n ""livability""...",Write a review on how livable this given city ...
4071,"```json\n{\n ""city_id"": 3872,\n ""livability""...",Write a review on how livable this given city ...
4072,"```json\n{\n ""city_id"": 3886,\n ""livability""...",Write a review on how livable this given city ...
4073,"```json\n{\n ""city_id"": 3934,\n ""livability""...",Write a review on how livable this given city ...


In [None]:
%%bigquery
-- Preview the cleaned full-run answers (fences and newlines removed, then trimmed)
-- before they are written to a table.
select
  trim(
    replace(
      replace(
        replace(pred.ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as ml_generate_text_llm_result
from country_stg_ai.city_predictions_full as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,"{ ""city_id"": 77, ""livability"": { ""walkabi..."
1,"{ ""city_id"": 151, ""livability"": { ""walkab..."
2,"{ ""city_id"": 171, ""country_code"": ""BGD"", ""d..."
3,"{ ""city_id"": 179, ""livability"": { ""walkab..."
4,"{ ""city_id"": 206, ""country_code"": ""BRA"", ""d..."
...,...
4070,"{ ""city_id"": 3837, ""livability"": { ""walka..."
4071,"{ ""city_id"": 3872, ""livability"": { ""walka..."
4072,"{ ""city_id"": 3886, ""livability"": { ""walka..."
4073,"{ ""city_id"": 3934, ""livability"": { ""walka..."


In [None]:
%%bigquery
-- Persist the cleaned full-run livability answers: ```json / ``` fences and newlines
-- are removed and the result is trimmed.
create or replace table country_stg_ai.city_predictions_formatted_full as
  select
    trim(
      replace(
        replace(
          replace(pred.ml_generate_text_llm_result, '```json', ''),
          '```', ''
        ),
        '\n', ''
      )
    ) as ml_generate_text_llm_result
  from country_stg_ai.city_predictions_full as pred

Query is running:   0%|          |

In [None]:
%%bigquery
-- Display the cleaned full-run livability answers.
select
  pred.*
from country_stg_ai.city_predictions_formatted_full as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,
1,
2,
3,"{""city_id"": 2693,""livability"": {""walkability"":..."
4,"{ ""city_id"": 2328, ""livability"": { ""walkabili..."
...,...
4070,"{ ""city_id"": 698, ""country_code"": ""ESP"", ""d..."
4071,"{ ""city_id"": 936, ""country_code"": ""HKG"", ""d..."
4072,"{ ""city_id"": 3144, ""country_code"": ""DEU"", ""..."
4073,"{ ""city_id"": 696, ""country_code"": ""ESP"", ""d..."


In [None]:
%%bigquery
-- Make sure the livability column exists before the full-scale update.
-- `if not exists` avoids the "Column already exists" error when it was added earlier.
alter table country_stg.City add column if not exists livability string;

Executing query with job ID: 29aacc22-c187-4394-b1c9-7f1d44c9337a
Query executing: 0.42s


ERROR:
 400 Column already exists: livability at [1:41]

Location: US
Job ID: 29aacc22-c187-4394-b1c9-7f1d44c9337a



In [None]:
%%bigquery
-- Copy the `livability` object of each full-run prediction onto the City row whose
-- city_id matches the city_id reported in the prediction.
-- Joining through FROM touches only cities that have a prediction; all other rows
-- keep their current livability. (A correlated scalar subquery combined with
-- `where 1=1` would rewrite every row and null the unmatched ones.)
-- safe_cast yields NULL, i.e. no match, when the model returned a non-numeric
-- city_id, so one bad answer does not abort the whole statement.
-- BigQuery raises an error if several prediction rows match the same City row.
update country_stg.City c
  set livability = json_query(p.ml_generate_text_llm_result, '$.livability')
  from country_stg_ai.city_predictions_formatted_full p
  where c.city_id = safe_cast(json_value(p.ml_generate_text_llm_result, '$.city_id') as int64);

Query is running:   0%|          |

In [None]:
%%bigquery
-- List every City row that has a livability value after the full-scale update.
select
  city.* except (data_source, load_time)
from country_stg.City as city
where city.livability is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,city_id,name,country_code,district,population,livability
0,129,Oranjestad,ABW,–,29034,"{""walkability"":7.5,""safety"":8.0,""nightlife"":7...."
1,4,Mazar-e-Sharif,AFG,Balkh,127800,"{""walkability"":3,""safety"":2,""nightlife"":1,""exc..."
2,3,Herat,AFG,Herat,186800,"{""walkability"":7,""safety"":6,""nightlife"":5,""exc..."
3,1,Kabul,AFG,Kabol,1780000,"{""walkability"":2,""safety"":1,""nightlife"":1,""exc..."
4,2,Qandahar,AFG,Qandahar,237500,"{""walkability"":3,""safety"":2,""nightlife"":1,""exc..."
...,...,...,...,...,...,...
3982,4070,Chitungwiza,ZWE,Harare,274912,"{""walkability"":3,""safety"":2,""nightlife"":2,""exc..."
3983,4068,Harare,ZWE,Harare,1410000,"{""walkability"":3,""safety"":2,""nightlife"":3,""exc..."
3984,4069,Bulawayo,ZWE,Bulawayo,621742,"{""walkability"":3,""safety"":2,""nightlife"":3,""exc..."
3985,4073,Gweru,ZWE,Midlands,128037,"{""walkability"":7,""safety"":6,""nightlife"":5,""exc..."


In [None]:
%%bigquery
-- Tag every City row that received a livability value with the 'bird_ai' data source.
update country_stg.City as city
set data_source = 'bird_ai'
where city.livability is not null

Query is running:   0%|          |

### Scenario: Generate a review for each movie based on the movie rank, name, and director.

#### Explore the raw data

In [None]:
%%bigquery
-- Browse the columns that feed the review prompt, best-ranked film first.
select
  film.rank,
  film.name,
  film.director
from country_stg.Film as film
order by film.rank;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,rank,name,director
0,1,The Shawshank Redemption,Frank Darabont
1,2,The Godfather,Francis Ford Coppola
2,3,The Dark Knight,Christopher Nolan
3,4,The Godfather Part II,Francis Ford Coppola
4,5,12 Angry Men,Sidney Lumet
...,...,...,...
245,246,The Help,Tate Taylor
246,247,Dersu Uzala,Akira Kurosawa
247,248,Aladdin,Ron Clements
248,249,Gandhi,Richard Attenborough


#### Setup

In [None]:
%%bigquery
-- (Re)create the remote model used by the review scenario; it reaches the
-- 'gemini-pro' endpoint through the vertex_connection connection.
create or replace model remote_models.gemini_pro
remote with connection `projects/daniel-daisy-cs329e/locations/us/connections/vertex_connection`
options (
  endpoint = 'gemini-pro'
);

Query is running:   0%|          |

In [None]:
%%bigquery
-- Trial run of the review prompt on the 10 best-ranked films.
-- Each prompt is the instruction text followed directly by the film's rank, name
-- and director serialized as JSON.
declare prompt_query string
  default "Generate a brief movie review based on the given rank, movie name, and director. Return the output as a JSON object, including the movie rank and name.";

select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select
      prompt_query || to_json_string(
        json_object("rank", rank, "name", name, "director", director)
      ) as prompt
    from country_stg.Film
    order by rank
    limit 10
  ),
  struct(
    true as flatten_json_output
  )
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n{\n ""rank"": 1,\n ""name"": ""The Shaws...",,,Generate a brief movie review based on the giv...
1,"```json\n{\n ""rank"": 2,\n ""name"": ""The Godfa...",,,Generate a brief movie review based on the giv...
2,"```json\n{\n ""rank"": 3,\n ""name"": ""The Dark ...",,,Generate a brief movie review based on the giv...
3,"```json\n{\n ""rank"": 4,\n ""name"": ""The Godfa...",,,Generate a brief movie review based on the giv...
4,"```json\n{\n ""rank"": 5,\n ""name"": ""12 Angry ...",,,Generate a brief movie review based on the giv...
5,"```json\n{\n ""rank"": 6,\n ""name"": ""Schindler...",,,Generate a brief movie review based on the giv...
6,"```json\n{\n ""rank"": 7,\n ""name"": ""The Lord ...",,,Generate a brief movie review based on the giv...
7,"```json\n{\n ""rank"": 8,\n ""name"": ""Pulp Fict...",,,Generate a brief movie review based on the giv...
8,"```json\n{\n ""rank"": 9,\n ""name"": ""The Lord ...",,,Generate a brief movie review based on the giv...
9,"```json\n{\n ""rank"": 10,\n ""name"": ""The Good...",,,Generate a brief movie review based on the giv...


#### Tweak the prompt

In [None]:
%%bigquery
-- Save the model's raw reviews for the 10 best-ranked films, using the refined
-- prompt and explicit generation settings.
declare prompt_query string
  default "Generate a brief movie review based on the given rank, movie name, and director. The review should consider the movie's significance, its themes, or its cultural impact. Return the output as a JSON object, including the movie rank, name, director, and the generated review.";

create or replace table country_stg_ai.movie_review_predictions_raw_10 as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select
      prompt_query || to_json_string(
        json_object("rank", rank, "name", name, "director", director)
      ) as prompt
    from country_stg.Film
    order by rank
    limit 10
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

In [None]:
%%bigquery
-- Show each saved review answer next to the prompt that produced it.
select
  pred.ml_generate_text_llm_result,
  pred.prompt
from country_stg_ai.movie_review_predictions_raw_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{\n ""rank"": 2,\n ""name"": ""The Godfa...",Generate a brief movie review based on the giv...
1,"```json\n{\n ""rank"": 6,\n ""name"": ""Schindler...",Generate a brief movie review based on the giv...
2,"```json\n{\n ""rank"": 1,\n ""name"": ""The Shaws...",Generate a brief movie review based on the giv...
3,"```json\n{\n ""rank"": 9,\n ""name"": ""The Lord ...",Generate a brief movie review based on the giv...
4,"```json\n{\n ""rank"": 3,\n ""name"": ""The Dark ...",Generate a brief movie review based on the giv...
5,"```json\n{\n ""rank"": 7,\n ""name"": ""The Lord ...",Generate a brief movie review based on the giv...
6,"```json\n{\n ""rank"": 5,\n ""name"": ""12 Angry ...",Generate a brief movie review based on the giv...
7,"```json\n{\n ""rank"": 4,\n ""name"": ""The Godfa...",Generate a brief movie review based on the giv...
8,"```json\n{\n ""rank"": 10,\n ""name"": ""The Good...",Generate a brief movie review based on the giv...
9,"```json\n{\n ""rank"": 8,\n ""name"": ""Pulp Fict...",Generate a brief movie review based on the giv...


#### Format the json

In [None]:
%%bigquery
-- Compare the raw review answer with a cleaned version: the ```json / ``` fences and
-- the newlines are removed (in that order), then surrounding whitespace is trimmed.
select
  pred.ml_generate_text_llm_result,
  trim(
    replace(
      replace(
        replace(pred.ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as formated_result
from country_stg_ai.movie_review_predictions_raw_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"```json\n{\n ""rank"": 2,\n ""name"": ""The Godfa...","{ ""rank"": 2, ""name"": ""The Godfather"", ""dire..."
1,"```json\n{\n ""rank"": 6,\n ""name"": ""Schindler...","{ ""rank"": 6, ""name"": ""Schindler's List"", ""d..."
2,"```json\n{\n ""rank"": 1,\n ""name"": ""The Shaws...","{ ""rank"": 1, ""name"": ""The Shawshank Redempti..."
3,"```json\n{\n ""rank"": 9,\n ""name"": ""The Lord ...","{ ""rank"": 9, ""name"": ""The Lord of the Rings:..."
4,"```json\n{\n ""rank"": 3,\n ""name"": ""The Dark ...","{ ""rank"": 3, ""name"": ""The Dark Knight"", ""di..."
5,"```json\n{\n ""rank"": 7,\n ""name"": ""The Lord ...","{ ""rank"": 7, ""name"": ""The Lord of the Rings:..."
6,"```json\n{\n ""rank"": 5,\n ""name"": ""12 Angry ...","{ ""rank"": 5, ""name"": ""12 Angry Men"", ""direc..."
7,"```json\n{\n ""rank"": 4,\n ""name"": ""The Godfa...","{ ""rank"": 4, ""name"": ""The Godfather Part II""..."
8,"```json\n{\n ""rank"": 10,\n ""name"": ""The Good...","{ ""rank"": 10, ""name"": ""The Good, the Bad and..."
9,"```json\n{\n ""rank"": 8,\n ""name"": ""Pulp Fict...","{ ""rank"": 8, ""name"": ""Pulp Fiction"", ""direc..."


In [None]:
%%bigquery
-- Persist the cleaned review answers: ```json / ``` fences and newlines are removed
-- and the result is trimmed. The column keeps its original name.
create or replace table country_stg_ai.movie_review_predictions_formatted_10 as
  select
    trim(
      replace(
        replace(
          replace(pred.ml_generate_text_llm_result, '```json', ''),
          '```', ''
        ),
        '\n', ''
      )
    ) as ml_generate_text_llm_result
  from country_stg_ai.movie_review_predictions_raw_10 as pred

Query is running:   0%|          |

In [None]:
%%bigquery
-- Pull the movie name and the generated review out of each cleaned JSON answer.
select
  json_value(pred.ml_generate_text_llm_result, '$.name') as movie_name,
  json_value(pred.ml_generate_text_llm_result, '$.review') as review
from country_stg_ai.movie_review_predictions_formatted_10 as pred

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,movie_name,review
0,The Godfather Part II,The Godfather Part II is a cinematic masterpie...
1,Pulp Fiction,Pulp Fiction is a cinematic masterpiece that r...
2,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King ...
3,Schindler's List,Schindler's List is a powerful and moving film...
4,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Fellowship of the R...
5,The Shawshank Redemption,The Shawshank Redemption is a cinematic master...
6,12 Angry Men,12 Angry Men is a powerful and thought-provoki...
7,The Godfather,The Godfather is a cinematic masterpiece that ...
8,The Dark Knight,The Dark Knight is a cinematic masterpiece tha...
9,"The Good, the Bad and the Ugly","Sergio Leone's epic masterpiece, 'The Good, th..."


#### Add a new column "review" to Film to hold the AI-generated reviews.

In [None]:
%%bigquery
-- Add the review column that will hold the AI-generated text.
-- `if not exists` keeps the cell re-runnable: without it a second run fails
-- with "Column already exists: review".
alter table country_stg.Film
  add column if not exists review string;

Executing query with job ID: 1330cb92-798c-44f0-98d4-edb0404f2b4b
Query executing: 0.43s


ERROR:
 400 Column already exists: review at [1:41]

Location: US
Job ID: 1330cb92-798c-44f0-98d4-edb0404f2b4b



#### Update country_stg.Film table to include the generated movie reviews

In [None]:
%%bigquery
-- Copy each generated review onto the Film row whose name equals the name
-- found in the prediction JSON. Every Film row is updated (the filter is
-- always true), so a film without a matching prediction gets review = NULL.
update country_stg.Film as f
set review = (
  select json_value(p.ml_generate_text_llm_result, '$.review')
  from country_stg_ai.movie_review_predictions_formatted_10 as p
  where f.name = json_value(p.ml_generate_text_llm_result, '$.name')
)
where true

Query is running:   0%|          |

In [None]:
%%bigquery
-- List the films that now carry a review, ordered by rank.
select
  f.rank,
  f.name,
  f.director,
  f.review
from country_stg.Film as f
where f.review is not null
order by f.rank;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,rank,name,director,review
0,1,The Shawshank Redemption,Frank Darabont,The Shawshank Redemption is a cinematic master...
1,2,The Godfather,Francis Ford Coppola,The Godfather is a cinematic masterpiece that ...
2,3,The Dark Knight,Christopher Nolan,The Dark Knight is a cinematic masterpiece tha...
3,4,The Godfather Part II,Francis Ford Coppola,The Godfather Part II is a cinematic masterpie...
4,5,12 Angry Men,Sidney Lumet,12 Angry Men is a powerful and thought-provoki...
5,6,Schindler's List,Steven Spielberg,Schindler's List is a powerful and moving film...
6,7,The Lord of the Rings: The Return of the King,Peter Jackson,The Lord of the Rings: The Return of the King ...
7,8,Pulp Fiction,Quentin Tarantino,Pulp Fiction is a cinematic masterpiece that r...
8,9,The Lord of the Rings: The Fellowship of the Ring,Peter Jackson,The Lord of the Rings: The Fellowship of the R...
9,10,"The Good, the Bad and the Ugly",Sergio Leone,"Sergio Leone's epic masterpiece, 'The Good, th..."


#### Apply at larger scale

In [None]:
%%bigquery
-- Generate a review for every film in country_stg.Film and store the raw
-- model responses. Each prompt is the instruction text followed by a JSON
-- object holding the film's rank, name and director.
declare review_prompt string default "Generate a brief movie review based on the given rank, movie name, and director. The review should consider the movie's significance, its themes, or its cultural impact. Return the output as a JSON object, including the movie rank, name, director, and the generated review.";

create or replace table country_stg_ai.movie_review_predictions_raw_full as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    select
      concat(
        review_prompt,
        to_json_string(json_object("rank", rank, "name", name, "director", director))
      ) as prompt
    from country_stg.Film
    order by rank
  ),
  -- NOTE(review): temperature 0, top_p 0.0 and top_k 1 are presumably chosen
  -- to make the output as repeatable as possible; confirm against the
  -- ML.GENERATE_TEXT documentation.
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    TRUE as flatten_json_output
  )
);

Query is running:   0%|          |

In [None]:
%%bigquery
-- Look up the creation time, end time and SQL text of one job by its id.
-- NOTE(review): the job id is hard-coded; replace it with the id of the job
-- you want to inspect.
select
  jobs.creation_time,
  jobs.end_time,
  jobs.query
from `region-us`.INFORMATION_SCHEMA.JOBS as jobs
where jobs.job_id = 'a441f6b3-d6b4-4b33-a8f0-30e044e7b326'

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,creation_time,end_time,query


#### Format the JSON

In [None]:
%%bigquery
-- Persist the cleaned full-scale model output: strip the ```json / ``` fence
-- markers and newline characters from each raw response, then trim it. The
-- output column keeps the original name so downstream cells can read it.
create or replace table country_stg_ai.movie_review_predictions_formatted_full as
with raw_predictions as (
  select ml_generate_text_llm_result as raw_text
  from country_stg_ai.movie_review_predictions_raw_full
)
select
  trim(
    replace(
      replace(
        replace(raw_text, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as ml_generate_text_llm_result
from raw_predictions;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Number of cleaned model responses produced by the full-scale run.
select
  count(*) as movie_review_count
from country_stg_ai.movie_review_predictions_formatted_full as p

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,movie_review_count
0,250


In [None]:
%%bigquery
-- Copy each generated review onto the Film row whose name equals the name
-- found in the prediction JSON. Films without a matching prediction are not
-- touched by this statement.
update country_stg.Film as film
set review = json_value(pred.ml_generate_text_llm_result, '$.review')
from country_stg_ai.movie_review_predictions_formatted_full as pred
where film.name = json_value(pred.ml_generate_text_llm_result, '$.name');

Query is running:   0%|          |

In [None]:
%%bigquery
-- Number of Film rows that ended up with a review.
select
  count(*) as movie_review_count
from country_stg.Film as f
where f.review is not null;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,movie_review_count
0,249


In [None]:
%%bigquery
-- Count how often each (name, review) pair occurs among the reviewed films;
-- sorting by the count in descending order puts any repeated pair first.
select
  f.name,
  f.review,
  count(*) as count
from country_stg.Film as f
where f.review is not null
group by f.name, f.review
order by count(*) desc

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,name,review,count
0,Pan's Labyrinth,Guillermo del Toro's Pan's Labyrinth is a haun...,1
1,3 Idiots,3 Idiots is a 2009 Indian comedy-drama film di...,1
2,Pather Panchali,Satyajit Ray's Pather Panchali is a cinematic ...,1
3,The Lives of Others,Florian Henckel von Donnersmarck's 'The Lives ...,1
4,Amores Perros,Amores Perros is a powerful and complex film t...,1
...,...,...,...
244,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King ...,1
245,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Fellowship of the R...,1
246,Oldboy,Park Chan-wook's 'Oldboy' is a masterpiece of ...,1
247,Parasite,Parasite is a darkly comedic and thought-provo...,1


In [None]:
%%bigquery
-- Tag every film that carries a review with the 'kaggle_ai' data source.
update country_stg.Film as f
set data_source = 'kaggle_ai'
where f.review is not null

Query is running:   0%|          |

### Part 4: Merge changes into target table

#### Film

In [None]:
%%bigquery
-- Add the d_nationality column to the target Film table.
-- `if not exists` keeps the cell re-runnable instead of failing with
-- "Column already exists" on a second run.
alter table country_csp.Film
  add column if not exists d_nationality string;

In [None]:
%%bigquery
-- Add the review column to the target Film table.
-- `if not exists` keeps the cell re-runnable: without it a second run fails
-- with "Column already exists: review".
alter table country_csp.Film
  add column if not exists review string;

Executing query with job ID: 719a0799-b9c8-4a1c-9121-130e7c2f5430
Query executing: 0.39s


ERROR:
 400 Column already exists: review at [2:14]

Location: US
Job ID: 719a0799-b9c8-4a1c-9121-130e7c2f5430



In [None]:
%%bigquery
-- Total number of rows currently stored in country_csp.Film.
select
  count(*) as num_records
from country_csp.Film as target;

In [None]:
%%bigquery
-- Count the current target rows (status_flag = true) for which staging has a
-- review or a d_nationality that the target row is still missing.
select
  count(*) as num_updates
from country_csp.Film as tgt
inner join country_stg.Film as stg
  on tgt.rank = stg.rank
where tgt.status_flag = true
  and (
    (stg.review is not null and tgt.review is null)
    or (stg.d_nationality is not null and tgt.d_nationality is null)
  );

In [None]:
%%bigquery
-- Merge enriched Film rows from staging into the target table while keeping
-- history: the current target row is closed out and a new current row with
-- the enriched values is inserted.
declare current_ts timestamp default current_timestamp();

-- Staging rows whose review or d_nationality is filled in while the current
-- target row (status_flag = true) still has NULL for that column.
-- Only the columns the insert below needs are kept.
create temp table updates as
  select
    s.rank,
    s.name,
    s.director,
    s.review,
    s.d_nationality,
    s.data_source,
    s.load_time
  from country_csp.Film as t
  inner join country_stg.Film as s
    on t.rank = s.rank
  where t.status_flag = true
    and (
      (s.review is not null and t.review is null)
      or (s.d_nationality is not null and t.d_nationality is null)
    );

-- Close out the current version of each affected film. The status_flag filter
-- restricts this to current rows, so rows that were already discontinued keep
-- their original discontinue_time.
update country_csp.Film
set discontinue_time = timestamp_sub(current_ts, interval 1 second),
    status_flag = false
where status_flag = true
  and rank in (select rank from updates);

-- Insert the enriched rows as the new current versions.
-- NOTE(review): the target column is listed as `directors` while staging
-- exposes `director`; confirm this matches the country_csp.Film schema.
insert into country_csp.Film
  (rank, name, directors, review, d_nationality, data_source, load_time, effective_time, status_flag)
select
  rank,
  name,
  director,
  review,
  d_nationality,
  data_source,
  load_time,
  current_ts,
  true
from updates;

In [None]:
%%bigquery
-- Total number of rows currently stored in country_csp.Film.
select
  count(*) as num_records
from country_csp.Film as target;

#### City

In [None]:
%%bigquery
-- Add the livability column to the target City table.
-- `if not exists` keeps the cell re-runnable instead of failing with
-- "Column already exists" on a second run.
alter table country_csp.City
  add column if not exists livability string;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Total number of rows currently stored in country_csp.City.
select
  count(*) as num_records
from country_csp.City as target;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,4085


In [None]:
%%bigquery
-- Count the current target rows (status_flag = true) for which staging has a
-- livability value that the target row is still missing.
select
  count(*) as num_updates
from country_csp.City as tgt
inner join country_stg.City as stg
  on tgt.city_id = stg.city_id
where tgt.status_flag = true
  and stg.livability is not null
  and tgt.livability is null;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_updates
0,3987


In [None]:
%%bigquery
-- Merge enriched City rows from staging into the target table while keeping
-- history: the current target row is closed out and a new current row with
-- the livability value is inserted.
declare current_ts timestamp default current_timestamp();

-- Staging rows that have a livability value while the current target row
-- (status_flag = true) still has NULL. Only the columns the insert below
-- needs are kept.
create temp table updates as
  select
    s.city_id,
    s.name,
    s.country_code,
    s.district,
    s.population,
    s.data_source,
    s.load_time,
    s.livability
  from country_csp.City as t
  inner join country_stg.City as s
    on t.city_id = s.city_id
  where t.status_flag = true
    and s.livability is not null
    and t.livability is null;

-- Close out the current version of each affected city. The status_flag filter
-- restricts this to current rows, so rows that were already discontinued keep
-- their original discontinue_time.
update country_csp.City
set discontinue_time = timestamp_sub(current_ts, interval 1 second),
    status_flag = false
where status_flag = true
  and city_id in (select city_id from updates);

-- Insert the enriched rows as the new current versions.
insert into country_csp.City
  (city_id, name, country_code, district, population, data_source, load_time, effective_time, status_flag, livability)
select
  city_id,
  name,
  country_code,
  district,
  population,
  data_source,
  load_time,
  current_ts,
  true,
  livability
from updates;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Total number of rows currently stored in country_csp.City.
select
  count(*) as num_records
from country_csp.City as target;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,8072
