# SI649 FINAL
## Overview 

In [37]:
import pandas as pd
import altair as alt
from vega_datasets import data
import json

alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [38]:
# load the data
pop = pd.read_csv('population.csv')
co_emission = pd.read_csv("co-emissions-per-capita.csv")
month_temp = pd.read_csv('average-monthly-surface-temperature.csv')
year_temp = pd.read_csv("annual-temperature-anomalies.csv")


In [39]:
year_temp.head()
avg_temp = year_temp.pivot_table(index='Year', values='Temperature anomaly', aggfunc='mean')
avg_temp = avg_temp.reset_index()

## Visualization 0: Population change around the world (1950-2023), Relative to 1950

In [40]:
pop.head()

Unnamed: 0,Entity,Code,Year,Population - Sex: all - Age: all - Variant: estimates
0,Afghanistan,AFG,1950,7776182
1,Afghanistan,AFG,1951,7879343
2,Afghanistan,AFG,1952,7987783
3,Afghanistan,AFG,1953,8096703
4,Afghanistan,AFG,1954,8207953


In [41]:
pop = pop.rename(columns={"Entity": "country", "Code": "abv", 'Year': 'year', 'Population - Sex: all - Age: all - Variant: estimates': 'population'})

In [42]:
pop_nan = pop[pop['abv'].isna()]
pop_nan.country.unique()



array(['Africa (UN)', 'Americas (UN)', 'Asia (UN)', 'Europe (UN)',
       'High-income countries', 'Land-locked developing countries (LLDC)',
       'Latin America and the Caribbean (UN)',
       'Least developed countries', 'Less developed regions',
       'Less developed regions, excluding China',
       'Less developed regions, excluding least developed countries',
       'Low-income countries', 'Lower-middle-income countries',
       'More developed regions', 'Northern America (UN)', 'Oceania (UN)',
       'Small island developing states (SIDS)',
       'Upper-middle-income countries'], dtype=object)

In [43]:
# only keep countries with code
pop = pop[pop['abv'].notna()]
pop.year.unique()
pop.abv


0        AFG
1        AFG
2        AFG
3        AFG
4        AFG
        ... 
18939    ZWE
18940    ZWE
18941    ZWE
18942    ZWE
18943    ZWE
Name: abv, Length: 17612, dtype: object

In [44]:
# Get the population of the year 1950 for each country
pop_1950 = pop[pop['year'] == 1950]
pop_1950 = pop_1950[['country', 'population']]
pop_1950 = pop_1950.rename(columns={'population': 'population_1950'})
pop_total = pop.merge(pop_1950, on='country')
pop_total['pop_diff']= pop_total['population'] - pop_total['population_1950']


In [45]:
country_code = pd.read_csv('country_code.csv')
country_code = country_code.rename(columns={'Country': 'country', 'alpha_2': 'abv'})
world_map = pd.read_csv('world_country.csv')
world_map = world_map.rename(columns={'country_code': 'abv'})
world_map = world_map.merge(country_code, on='abv')

In [46]:
world_map = world_map.rename(columns={'abv': 'country_abv'})
world_map = world_map.rename(columns={'alpha_3': 'abv'})
world_map = world_map[['abv', 'country_abv', 'code','latitude', 'longitude']]

In [63]:
world_map

Unnamed: 0,abv,country_abv,code,latitude,longitude
0,AND,AD,20,42.546245,1.601554
1,ARE,AE,784,23.424076,53.847818
2,AFG,AF,4,33.939110,67.709953
3,ATG,AG,28,17.060816,-61.796428
4,AIA,AI,660,18.220554,-63.068615
...,...,...,...,...,...
237,YEM,YE,887,15.552727,48.516388
238,MYT,YT,175,-12.827500,45.166244
239,ZAF,ZA,710,-30.559482,22.937506
240,ZMB,ZM,894,-13.133897,27.849332


In [48]:
pop_total = pop_total.merge(world_map, on='abv')
pop_total

Unnamed: 0,country,abv,year,population,population_1950,pop_diff,country_abv,code,latitude,longitude
0,Afghanistan,AFG,1950,7776182,7776182,0,AF,4,33.939110,67.709953
1,Afghanistan,AFG,1951,7879343,7776182,103161,AF,4,33.939110,67.709953
2,Afghanistan,AFG,1952,7987783,7776182,211601,AF,4,33.939110,67.709953
3,Afghanistan,AFG,1953,8096703,7776182,320521,AF,4,33.939110,67.709953
4,Afghanistan,AFG,1954,8207953,7776182,431771,AF,4,33.939110,67.709953
...,...,...,...,...,...,...,...,...,...,...
17015,Zimbabwe,ZWE,2019,15271377,2790907,12480470,ZW,716,-19.015438,29.154857
17016,Zimbabwe,ZWE,2020,15526888,2790907,12735981,ZW,716,-19.015438,29.154857
17017,Zimbabwe,ZWE,2021,15797220,2790907,13006313,ZW,716,-19.015438,29.154857
17018,Zimbabwe,ZWE,2022,16069061,2790907,13278154,ZW,716,-19.015438,29.154857


In [64]:
#check if code is none
pop_total[pop_total['code'].isna()]
pop_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17020 entries, 0 to 17019
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          17020 non-null  object 
 1   abv              17020 non-null  object 
 2   year             17020 non-null  int64  
 3   population       17020 non-null  int64  
 4   population_1950  17020 non-null  int64  
 5   pop_diff         17020 non-null  int64  
 6   country_abv      16946 non-null  object 
 7   code             17020 non-null  int64  
 8   latitude         17020 non-null  float64
 9   longitude        17020 non-null  float64
dtypes: float64(2), int64(5), object(3)
memory usage: 1.3+ MB


In [65]:
pop_total.pop_diff.describe()

count    1.702000e+04
mean     1.133315e+07
std      5.758495e+07
min     -9.393160e+05
25%      7.076200e+04
50%      9.211740e+05
75%      5.351420e+06
max      1.091791e+09
Name: pop_diff, dtype: float64

In [1]:
pop_total.to_csv('pop_total.csv', index=False)

NameError: name 'pop_total' is not defined

In [75]:
# Convert year to integer if necessary

slider = alt.binding_range(min=1950, max=2023, step=1)
select_year = alt.param(name='select_year', bind=slider, value=1950)

world = data.world_110m.url
base = alt.Chart(alt.topo_feature(world, 'countries')).mark_geoshape(
    fill='lightgray',
    stroke='black'
).transform_filter(
    "datum.id != 10"  # Exclude Antarctica (id=10 in world-110m.json)
).properties(
    width=800,
    height=600
    
)


In [82]:
points = alt.Chart(pop_total).mark_circle(
    size=2000, color='red', opacity=0.7
).transform_calculate(
    pop_increase="(datum.pop_diff) / 1000000",
).encode(
    longitude='longitude:Q',    # Longitude for positioning
    latitude='latitude:Q',     # Latitude for positioning
    size=alt.Size('pop_increase:Q', scale=alt.Scale(range=[10, 2000], zero=False),  title='Population Increase (Million)'),
    tooltip=[
        alt.Tooltip('country:N', title='Country Code'),
        alt.Tooltip('pop_increase:Q', title='Population Increase (Million)', format=',.2f'),
        alt.Tooltip('year:Q', title='Year')
    ]
).transform_filter(
    (alt.datum.year == select_year) & (alt.datum.year != 1950)  # Dynamic filter based on slider, exclude 1950
).add_params(
    select_year  # Add slider interaction
)
    



# Append the chart to the list
combined = (base + points).properties(
    title='Population Increase by Country (1950-2023), Relative to 1950',
)

combined



In [398]:
avg_temp= avg_temp.rename(columns={"Year": "year", "Temperature anomaly": "temp"})
avg_temp

Unnamed: 0,year,temp
0,1940,-1.302720
1,1941,-0.988722
2,1942,-1.146610
3,1943,-1.072770
4,1944,-0.932494
...,...,...
80,2020,0.570034
81,2021,0.409635
82,2022,0.437112
83,2023,0.851806


In [399]:
# calculate the average temperature from 1981 to 2000
avg_temp_1981_2000 = avg_temp[(avg_temp['year'] >= 1981) & (avg_temp['year'] <= 2000)]['temp'].mean()
avg_temp_1981_2000 # -0.450

-0.4496527827691845

## Visualization 1: Yearly Temperature Anomalies from 1940 to 2024. °C
### Relative to the 1981 to 2000 Average

In [None]:
# Define the reference line (relative temp of -0.462)
reference_temp = -0.450

# Create a selection object for interactivity
click = alt.selection_single(fields=['year'], empty='none')

# Define color logic in separate steps
# base_color = alt.Color(
#     'temp:Q',
#     scale=alt.Scale(domain=[reference_temp, avg_temp['temp'].max()], range=['blue', 'red']),
#     legend=alt.Legend(title="Temperature Anomaly (°C)", values=[-1.5, -1, -0.5, 0, 0.5, 1], orient='top') 
# )
base_color = alt.Color(
    'temp:Q',
    scale=alt.Scale(scheme='redblue', domainMid=0, reverse=True),
    legend=alt.Legend(title="Relative Temp (°C)")
)

# Define the bar chart
# Create the main bar chart
chart = alt.Chart(avg_temp).mark_bar().encode(
    x=alt.X('year:O',
            title='Year',
            axis=alt.Axis(labelExpr="datum.value % 10 == 0 ? datum.value : ''", 
                          labelAngle=0, tickCount=10, ticks=False, domain=False,grid=False)
                
    ),
    y=alt.Y('temp:Q', title='Temperature Anomaly (°C)', axis=alt.Axis(ticks=False, domain=False,grid=False)),
    color=alt.condition(
        click, alt.value('orange'), base_color  # Use base_color with legend here
    ),
    opacity=alt.condition(
        click, alt.value(1.0), alt.value(0.6)
    ),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('temp:Q', title='Relative Temp(°C)', format='.2f'),  # Format to 2 decimals
    ]
).add_selection(
    click
).properties(
    title="Yearly Temperature Anomalies from 1940 to 2024, Relative to the 1981 to 2000 Average"
)

# Add a reference line at the relative temperature
ref_line = alt.Chart(pd.DataFrame({'y': [reference_temp]})).mark_rule(
    color='gray', strokeDash=[4, 4]
).encode(y='y:Q')

# Combine the bar chart and reference line
final_chart = chart + ref_line

# Display the chart
final_chart


  click = alt.selection_single(fields=['year'], empty='none')
  ).add_selection(


## Visualization 2: Circular heatmaps representing the monthly global temperature anomalies from 1970 to 2015

### Relative to the 1981 to 2000 Average


In [401]:
month_temp.head()
month_avg_temp = month_temp.pivot_table(index='Day', values='Average monthly temperature', aggfunc='mean')
month_avg_temp = month_avg_temp.reset_index()

In [402]:
month_avg_temp.head()

Unnamed: 0,Day,Average monthly temperature
0,1940-01-15,11.458131
1,1940-02-15,12.956817
2,1940-03-15,14.661579
3,1940-04-15,17.548147
4,1940-05-15,19.489901


In [403]:
month_avg_temp

Unnamed: 0,Day,Average monthly temperature
0,1940-01-15,11.458131
1,1940-02-15,12.956817
2,1940-03-15,14.661579
3,1940-04-15,17.548147
4,1940-05-15,19.489901
...,...,...
1013,2024-06-15,22.999161
1014,2024-07-15,23.448781
1015,2024-08-15,23.351773
1016,2024-09-15,22.001160


In [404]:
month_avg_temp= month_avg_temp.rename(columns={"Day": "day", "Average monthly temperature": "temp"})
month_avg_temp['day'] =  pd.to_datetime(month_avg_temp['day'])
month_avg_temp['month'] = month_avg_temp['day'].dt.month
month_avg_temp['year'] = month_avg_temp['day'].dt.year

In [405]:
month_avg_temp.head()

Unnamed: 0,day,temp,month,year
0,1940-01-15,11.458131,1,1940
1,1940-02-15,12.956817,2,1940
2,1940-03-15,14.661579,3,1940
3,1940-04-15,17.548147,4,1940
4,1940-05-15,19.489901,5,1940


In [406]:

# Add a period column for grouping
def classify_period(year):
    if 1981 <= year <= 1990:
        return '1981 - 1990'
    elif 1991 <= year <= 2000:
        return '1991 - 2000'
    elif 2001 <= year <= 2010:
        return '2001 - 2010'
    elif 2011 <= year <= 2020:
        return '2011 - 2020'
    elif 2021 <= year <= 2024:
        return '2021 - 2024'

In [407]:
# Add period column
month_avg_temp['period'] = month_avg_temp['year'].apply(classify_period)

# drop if period is not 1981 - 2024
month_avg_temp = month_avg_temp[month_avg_temp['period'].notna()]



In [408]:
month_avg_temp.head()

Unnamed: 0,day,temp,month,year,period
492,1981-01-15,13.002741,1,1981,1981 - 1990
493,1981-02-15,14.11627,2,1981,1981 - 1990
494,1981-03-15,16.517679,3,1981,1981 - 1990
495,1981-04-15,18.134086,4,1981,1981 - 1990
496,1981-05-15,19.893929,5,1981,1981 - 1990


In [409]:
# Create average monthly temperature of 1981 - 2000
month_avg_temp_1981_2000 = month_avg_temp[(month_avg_temp['year'] >= 1981) & (month_avg_temp['year'] <= 2000)]
month_avg_temp_1981_2000 = month_avg_temp_1981_2000.groupby('month')['temp'].mean().reset_index()
month_avg_temp_1981_2000 = month_avg_temp_1981_2000.rename(columns={'temp': 'avg_temp'})

In [410]:
month_avg_temp_1981_2000 
month_avg_temp = month_avg_temp.merge(month_avg_temp_1981_2000, on='month', how='left')

In [411]:
month_avg_temp['temp_diff'] = month_avg_temp['temp'] - month_avg_temp['avg_temp']

In [412]:
# Calculate the average temperature difference per month for each period
month_avg_temp = month_avg_temp.groupby(['period', 'month'])['temp_diff'].mean().reset_index()

In [413]:
# Add angles for the pie chart

month_avg_temp['angle'] = month_avg_temp['month'] * 360 /12

In [414]:

# Base pie chart for each period
base = alt.Chart(month_avg_temp).mark_arc(innerRadius=50).encode(
    theta=alt.Theta('temp_diff:Q', stack=True),  # Each month is a slice
    color=alt.Color('temp_diff:Q',
                    scale=alt.Scale(scheme='redblue', domainMid=0),  # Diverging colors
                    legend=alt.Legend(title="Temperature Anomaly (°C)")
                    ),
    order=alt.Order('angle:Q'),  # Ensure correct month order
    tooltip=['month:N', 'temp_diff:Q']
)


In [415]:
# Base chart with a filter for a specific period
def create_period_chart(period):
    return alt.Chart(month_avg_temp).transform_filter(
        alt.datum.period == period  # Filter for one specific period
    ).mark_arc(innerRadius=50).encode(
        theta=alt.Theta('temp_diff:Q', stack=True),  # Each month's anomaly
        color=alt.Color('temp_diff:Q',
                        scale=alt.Scale(scheme='redblue', domainMid=0, reverse=True),  # Diverging colors
                        legend=alt.Legend(title="Relative Temp(°C)")
                        ),
        order=alt.Order('angle:Q'),  # Correct month order
        tooltip=[
            alt.Tooltip('month:N', title='Month'),
            alt.Tooltip('temp_diff:Q', title='Relative Temp(°C)', format='.2f')
        ]
    ).properties(
        title=period, width=150, height=150
    )

# Generate a list of charts, one for each period
chart = [create_period_chart(period) for period in month_avg_temp['period'].unique()]

# Concatenate all charts
final_chart = alt.concat(*chart, columns=5).properties(
    title="Circular Heatmaps Representing Monthly Global Temperature Anomalies (1981 - 2024), \nRelative to the 1981 to 2000 Average",
    
)

final_chart


## Visualization 3 Annual CO2emissions per capital and top 5 emitters.

In [416]:
co_emission = pd.read_csv("co-emissions-per-capita.csv")

In [417]:
co_emission.head()
co_emission = co_emission.rename(columns={"Entity": "country", "Code": "code", 'Year': 'year', 'Annual CO₂ emissions (per capita)': 'emission'})

In [418]:
co_emission.code.describe()

count     22895
unique      215
top         NOR
freq        229
Name: code, dtype: object

In [419]:
# # Get ISO 3166-1 numeric country codes for co_emission
# country_code = pd.read_csv('country_code.csv')

# # Merge the country code with co_emission
# co_emission = co_emission.merge(country_code, on='country', how='left')

NameError: name 'co' is not defined

In [None]:
# flag the year that code is NaN
co_emission['flag'] = co_emission['code'].isna()
co_emission['flag'].value_counts()
co_emission[co_emission['flag'] == True].country.unique()




array(['Africa', 'Asia', 'Asia (excl. China and India)', 'Europe',
       'Europe (excl. EU-27)', 'Europe (excl. EU-28)',
       'European Union (27)', 'European Union (28)',
       'High-income countries', 'Low-income countries',
       'Lower-middle-income countries', 'North America',
       'North America (excl. USA)', 'Oceania', 'South America',
       'Upper-middle-income countries'], dtype=object)

In [178]:
# All the missing code of countries that actually not countries, they are continents or regions
regions = co_emission[co_emission['flag'] == True]
regions = regions[~regions['country'].isin(['Europe (excl. EU-27)', 'Europe (excl. EU-28)',
       'European Union (27)', 'European Union (28)',
       'High-income countries', 'Low-income countries',
       'Lower-middle-income countries', 'North America (excl. USA)','Upper-middle-income countries', 'Asia (excl. China and India)'])]

In [194]:
regions.year.value_counts()
regions.to_csv('year_count.csv', index=False)
regions.country.unique()

array(['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
       'South America'], dtype=object)

In [188]:
min_year = co_emission['year'].min()
max_year = co_emission['year'].max()

In [295]:
# Define a custom color scale
color_scale = alt.Scale(scheme='redblue', domainMid=0)



# Bar chart for albums with the most views
slider = alt.binding_range(min=min_year, max=max_year, step=1, name="By ")
selector = alt.selection_point(fields=['year'], bind=slider, value=max_year)

chart = alt.Chart(regions,title=alt.Title(text="CO2 Emissions per Capita by Regions (1750-2024)", fontSize=18)
).mark_bar().encode(
    x=alt.X('emission:Q', title='Emission per Capita'),
    y=alt.Y('country:O', sort='-x', title='Region'),
    color=alt.Color('country:O', title='Region', scale=color_scale),
    tooltip=[
        alt.Tooltip('country:O', title='Region'),
        alt.Tooltip('emission:Q', title='Emission per Capita', format='.2f'),  # Format to 2 decimals
        alt.Tooltip('year:O', title='Year')
    ]
).add_params(
    selector
).transform_filter(
    selector
).properties(
    width=500,
    height=500,
).configure_view(strokeWidth=0)

chart

## Visualization 4: World trend of CO2 Emission ()

In [255]:
world = co_emission[co_emission['country'] == 'World']
world['first_indust_rev'] = 0
world.loc[(world['year'] >= 1760) & (world['year'] <= 1840), 'first_indust_rev'] = 1
world['second_indust_rev'] = 0
world.loc[(world['year'] >= 1870) & (world['year'] <= 1914), 'second_indust_rev'] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  world['first_indust_rev'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  world['second_indust_rev'] = 0


In [260]:
# Create a DataFrame for the shaded regions
shaded_regions_data = pd.DataFrame({
    'start': [1760, 1870],
    'end': [1840, 1914],
    'label': ['First Industrial Revolution', 'Second Industrial Revolution']
})
shaded_regions_data.start = pd.to_datetime(shaded_regions_data.start, format='%Y')
shaded_regions_data.end = pd.to_datetime(shaded_regions_data.end, format='%Y')

In [261]:
world.year=pd.to_datetime(world.year, format='%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  world.year=pd.to_datetime(world.year, format='%Y')


In [307]:
# Create a zoomable selection
zoom = alt.selection_interval(bind='scales', encodings=['x'])
click = alt.selection_multi(fields=['year'], empty='none')



# Define the shaded regions for the industrial revolutions
shaded_regions = alt.Chart(shaded_regions_data).mark_rect(opacity=0.3, color='gray').encode(
    x='start:T',
    x2='end:T'
).add_params(
    zoom
).add_params(
    click
)


# Define the line chart for emissions
line_chart = alt.Chart(world).mark_line().encode(
    x=alt.X('year:T', title='Year', axis=alt.Axis(labelAngle=-45, tickCount=10, ticks=False, domain=False,grid=False)),
    y=alt.Y('emission:Q', title='Emission per Capita'),
    tooltip=[
        alt.Tooltip('year:T', title='Year',format='%Y'),
        alt.Tooltip('emission:Q', title='Emission')
    ]
).add_params(
    zoom
).add_params(
    click
).properties(
    width=800,
    height=400,
    title='Worldwide Average Emission Per Capita Over Time (1750 - 2024)'
)

points = alt.Chart(world).mark_point(size=20, opacity=0.8).encode(
    x=alt.X('year:T', title='Year'),
    y=alt.Y('emission:Q'),
    tooltip=[
        alt.Tooltip('year:T', title='Year', format='%Y'),
        alt.Tooltip('emission:Q', title='Emission per Capita', format='.2f'),
    ]
)


# Combine the shaded regions and the line chart
final_chart = shaded_regions + line_chart + points
final_chart


  click = alt.selection_multi(fields=['year'], empty='none')


In [298]:
co_emission_1980 = co_emission[(co_emission['year'] >= 1980)]
# find top 5 countries with highest emission each year
top_5 = co_emission_1980.groupby('year').apply(lambda x: x.nlargest(5, 'emission')).reset_index(drop=True)

  top_5 = co_emission_1980.groupby('year').apply(lambda x: x.nlargest(5, 'emission')).reset_index(drop=True)
