# Creating Importation Diagrams

In [44]:
from google.cloud import storage
import pandas as pd
import io
import os
import gzip
import plotly.express as px

In [3]:
service_account_id = 'elijahsandler@net-data-viz-handbook.iam.gserviceaccount.com'

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\\Users\\elija\\Documents\\24f-coop\\net-data-viz-handbook-fe2c5531555d.json'

In [5]:
## Import data from Google Cloud

# Initialize a GCS client
client = storage.Client()

# Specify your bucket name and the specific .csv.gz file you want
bucket_name = 'gs_net-data-viz-handbook'
file_name = 'sample/importations/sample_SIR_importations_1.csv.gz'  # Update this to the specific file name

# Get the bucket and blob
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_name)

# Download the .csv.gz file as bytes
compressed_content = blob.download_as_bytes()

# Decompress the .csv.gz content
with gzip.GzipFile(fileobj=io.BytesIO(compressed_content)) as gz:
    # Read the decompressed content into a pandas DataFrame
    df = pd.read_csv(gz)

In [14]:
# this is already formatted correctly! yay.
df

Unnamed: 0,source_basin,target_basin,compartment,importations,date,run_id
0,1024,1361,Infectious,1,2009-04-09,1
1,1024,2217,Infectious,1,2009-04-29,1
2,1024,1034,Infectious,1,2009-05-10,1
3,1024,1018,Infectious,1,2009-05-13,1
4,1024,2125,Infectious,1,2009-05-14,1
...,...,...,...,...,...,...
2174114,1928,3233,Infectious,1,2010-02-17,1
2174115,1890,3234,Infectious,1,2010-02-17,1
2174116,1462,3250,Infectious,1,2010-02-17,1
2174117,3244,3250,Infectious,1,2010-02-17,1


## Fetching ID Map

In [21]:
url = '../data/gleam_geo_map.csv'
geo_map = pd.read_csv(url)
geo_map.head()

Unnamed: 0,basin_id,basin_label,airport_code,latitude,longitude,country_id,country_name,country_iso3,region_id,region_label,continent_id,continent_label,hemisphere_id,hemisphere_label
0,0,Menongue,SPP,-14.6586,17.7214,2,Angola,AGO,9,Middle Africa,0,Africa,1,Tropical hemisphere
1,1,Huambo,NOV,-12.8092,15.7608,2,Angola,AGO,9,Middle Africa,0,Africa,1,Tropical hemisphere
2,2,M'BanzaCongo,SSY,-6.2697,14.2472,2,Angola,AGO,9,Middle Africa,0,Africa,1,Tropical hemisphere
3,3,Soyo,SZA,-6.1411,12.3725,2,Angola,AGO,9,Middle Africa,0,Africa,1,Tropical hemisphere
4,4,Lubango,SDD,-14.9247,13.575,2,Angola,AGO,9,Middle Africa,0,Africa,1,Tropical hemisphere


In [22]:
def map_basins(geography):
    m = dict(zip(geo_map['basin_id'], geo_map[geography]))
    return m

In [24]:
df['source_basin'].map(map_basins('country_name'))

0                      Mexico
1                      Mexico
2                      Mexico
3                      Mexico
4                      Mexico
                  ...        
2174114    Russian Federation
2174115    Russian Federation
2174116               Ecuador
2174117                  Cuba
2174118                  Cuba
Name: source_basin, Length: 2174119, dtype: object

## Creating graphing dataframe by country

In [66]:
# get only columns we care about
df_data = df[['source_basin', 'target_basin', 'importations', 'date']]

# get basin names
df_data['source_basin_name'] = df_data['source_basin'].map(map_basins('basin_label'))
df_data['target_basin_name'] = df_data['target_basin'].map(map_basins('basin_label'))

# map basin to country
df_data['source_country_name'] = df_data['source_basin'].map(map_basins('country_name'))
df_data['target_country_name'] = df_data['target_basin'].map(map_basins('country_name'))

# map basin to continent
df_data['source_continent_name'] = df_data['source_basin'].map(map_basins('continent_label'))
df_data['target_continent_name'] = df_data['target_basin'].map(map_basins('continent_label'))


df_data.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,source_basin,target_basin,importations,date,source_basin_name,target_basin_name,source_country_name,target_country_name,source_continent_name,target_continent_name
0,1024,1361,1,2009-04-09,MexicoCity,Cartagena,Mexico,Colombia,North America,South America
1,1024,2217,1,2009-04-29,MexicoCity,Chicago,Mexico,United States of America,North America,North America
2,1024,1034,1,2009-05-10,MexicoCity,Tijuana,Mexico,Mexico,North America,North America
3,1024,1018,1,2009-05-13,MexicoCity,Leon(MX),Mexico,Mexico,North America,North America
4,1024,2125,1,2009-05-14,MexicoCity,Pittsburgh,Mexico,United States of America,North America,North America


## Seeing where cases are imported to/exported from in a country

In [77]:
df_data['source_country_name'].unique()

array(['Mexico', 'United States of America', 'Cuba', 'Spain', 'Sweden',
       'France', 'Canada', 'Kenya', 'Italy', 'Portugal', 'Puerto Rico',
       'Colombia', 'Israel', 'Virgin Islands (U.S.)', 'United Kingdom',
       'Korea, Rep.', 'Peru', 'Dominican Republic', 'Greece', 'Japan',
       'Germany', 'United Arab Emirates', 'Brazil', 'Luxembourg', 'China',
       'Ireland', 'India', 'Russian Federation', 'Argentina', 'Qatar',
       'Lithuania', 'Costa Rica', 'Belgium', 'Bahamas', 'Iceland',
       'Singapore', 'Chile', 'Trinidad and Tobago', 'Saudi Arabia',
       'Switzerland', 'Croatia', 'Cayman Islands', 'Ecuador', 'Australia',
       'Taiwan', 'Panama', 'Netherlands', 'Latvia', 'Austria', 'Jamaica',
       'Norway', 'Belize', 'Thailand', 'Cambodia', 'Hungary', 'Poland',
       'Turks and Caicos Islands', 'Denmark', 'Guatemala', 'South Africa',
       'Finland', 'Turkey', 'Guam', 'Macedonia', 'Maldives', 'Malaysia',
       'Ukraine', 'Aruba', 'Philippines', 'Uruguay', 'Tunisia',

In [86]:
country='Korea, Rep.'

# getting only rows where country is the origin
country_export = df_data[df_data['source_country_name'] == country]

# group by date and origin city
df_country_exports = country_export[['source_basin_name', 'date', 'importations']]\
.groupby(['source_basin_name', 'date']).sum().reset_index()

# Create the area plot
fig = px.area(df_country_exports, x='date', y='importations', title=f'Exportations from {country}', 
              color='source_basin_name',
              labels={
                     "importations": "Cases Exported",
                     "date": "Date",
                     "source_basin_name": "Source"
                 },)
fig.show()

Note that this also works for seeing where cases are being imported from: just filter from `target_country_name`, and change the relevant column names in the following lines. We can also filter out cases sent to the country itself so that we only see international imports, maybe even group by continent to make it a little easier to look at, or make any other filters we'd like before graphing. Plotly is really nice because it has a GUI, so we can select a date range to look at and deselect countries after making the plot if we'd like to. 

In [85]:
country='United States of America'

# getting only rows where country is the target
country_export = df_data[df_data['target_country_name'] == country]

# exclude cases where source and target are the same
country_export = country_export[country_export['source_country_name'] != country]

# group by date and source continent (the world is a big place and that's a lot of basins)
df_country_exports = country_export[['source_continent_name', 'date', 'importations']]\
.groupby(['source_continent_name', 'date']).sum().reset_index()

# Create the area plot
fig = px.area(df_country_exports, x='date', y='importations', title=f'International Importations to {country}', 
              color='source_continent_name',
              labels={
                     "importations": "Cases Imported",
                     "date": "Date",
                     "source_continent_name": "Source"
                 },)
fig.show()