In [1]:
# import dependencies
import pandas as pd
!pip install pycountry
import pycountry
from pathlib import Path
import sqlite3
!pip install pandas psycopg2
!pip install plotly
import plotly.express as px
import psycopg2
from sqlalchemy import create_engine
from keys import post_username, post_password



Our data has been stored in a cloud server managed by Amazon RDS. The dataset was relatively small, so a simple SQL table schema was created in pgAdmin to help with the importing. The next step is to connect to the database.

In [2]:
# Set up database connection parameters
conn = psycopg2.connect(
    dbname = "team6_project3_db",
    user = post_username,
    password = post_password,
    host = "database-1.croamw4iqxpi.us-east-2.rds.amazonaws.com",
    port = "5432" 
)

query = "SELECT * FROM climate_impact_agriculture;"

In [3]:
# Create the pandas database
third_climate_df = pd.read_sql_query(query, conn)
third_climate_df.head()

  third_climate_df = pd.read_sql_query(query, conn)


Unnamed: 0,year,country,region,crop_type,average_temperature_c,total_precipitation_mm,co2_emissions_mt,crop_yield_mt_per_ha,extreme_weather_events,Irrigation_Access_%,pesticide_use_kg_per_ha,fertilizer_use_kg_per_ha,soil_health_index,adaptation_strategies,economic_impact_million_usd
0,2001,India,West Bengal,Corn,1.55,447.06,15.22,1.737,8,14.54,10.08,14.78,83.25,Water Management,808.13
1,2024,China,North,Corn,3.23,2913.57,29.82,1.737,8,11.05,33.06,23.25,54.02,Crop Rotation,616.22
2,2001,France,Ile-de-France,Wheat,21.11,1301.74,25.75,1.719,5,84.42,27.41,65.53,67.78,Water Management,796.96
3,2001,Canada,Prairies,Coffee,27.85,1154.36,13.91,3.89,5,94.06,14.38,87.58,91.39,No Adaptation,790.32
4,1998,India,Tamil Nadu,Sugarcane,2.19,1627.48,11.81,1.08,9,95.75,44.35,88.08,49.61,Crop Rotation,401.72


Now we've connected to the database and created a pandas DataFrame with the data. Our next step investigate and transform the data. While storing the database and loading the database the column headers have been changed to lowercase. We will change the columns back to their original names to help avoid confusion (some queries have already been done elsewhere using old column names).

In [4]:
# Rename the headers back to their original state, with capitalization
third_climate_df.rename(columns={
    'year': 'Year',
    'country': 'Country',
    'region': 'Region',
    'crop_type': 'Crop_Type',
    'average_temperature_c': 'Average_Temperature_C',
    'total_precipitation_mm': 'Total_Precipitation_mm',
    'co2_emissions_mt': 'CO2_Emissions_MT',
    'crop_yield_mt_per_ha': 'Crop_Yield_MT_per_HA',
    'extreme_weather_events': 'Extreme_Weather_Events',
    'pesticide_use_kg_per_ha': 'Pesticide_Use_KG_per_HA',
    'fertilizer_use_kg_per_ha': 'Fertilizer_Use_KG_per_HA',
    'soil_health_index': 'Soil_Health_Index',
    'adaptation_strategies': 'Adaptation_Strategies',
    'economic_impact_million_usd': 'Economic_Impact_Million_USD'
}, inplace=True)

# Show renamed headers
third_climate_df.head()

Unnamed: 0,Year,Country,Region,Crop_Type,Average_Temperature_C,Total_Precipitation_mm,CO2_Emissions_MT,Crop_Yield_MT_per_HA,Extreme_Weather_Events,Irrigation_Access_%,Pesticide_Use_KG_per_HA,Fertilizer_Use_KG_per_HA,Soil_Health_Index,Adaptation_Strategies,Economic_Impact_Million_USD
0,2001,India,West Bengal,Corn,1.55,447.06,15.22,1.737,8,14.54,10.08,14.78,83.25,Water Management,808.13
1,2024,China,North,Corn,3.23,2913.57,29.82,1.737,8,11.05,33.06,23.25,54.02,Crop Rotation,616.22
2,2001,France,Ile-de-France,Wheat,21.11,1301.74,25.75,1.719,5,84.42,27.41,65.53,67.78,Water Management,796.96
3,2001,Canada,Prairies,Coffee,27.85,1154.36,13.91,3.89,5,94.06,14.38,87.58,91.39,No Adaptation,790.32
4,1998,India,Tamil Nadu,Sugarcane,2.19,1627.48,11.81,1.08,9,95.75,44.35,88.08,49.61,Crop Rotation,401.72


Now that the DataFrame is adequately transformed we can now investigate it to see what we're dealing with.

In [5]:
# Create variables for unique countries, regions, and their lengths. 
unique_regions = third_climate_df['Region'].unique()
length_unique_regions = len(third_climate_df['Region'].unique())
unique_countries = third_climate_df['Country'].unique()
length_unique_countries = len(third_climate_df['Country'].unique())

# Print results
print(f"This data {length_unique_countries} unique countries, and {length_unique_regions} unique regions")
print(f"These are the unique countries: {unique_countries}")
print(f"These are the unique regions: {unique_regions}")

This data 10 unique countries, and 34 unique regions
These are the unique countries: ['India' 'China' 'France' 'Canada' 'USA' 'Argentina' 'Australia' 'Nigeria'
 'Russia' 'Brazil']
These are the unique regions: ['West Bengal' 'North' 'Ile-de-France' 'Prairies' 'Tamil Nadu' 'Midwest'
 'Northeast' 'New South Wales' 'Punjab' 'North West' 'South East'
 'Grand Est' 'Northwestern' 'Siberian' 'Northwest' 'Victoria'
 'Nouvelle-Aquitaine' 'South' 'Quebec' 'Southeast' 'Ontario' 'East'
 'Pampas' 'Western Australia' 'Volga' 'Maharashtra'
 'Provence-Alpes-Cote d’Azur' 'West' 'Central' 'North Central' 'Patagonia'
 'Queensland' 'South West' 'British Columbia']


Let's find out how many regions are represented for each country in the dataset.

In [6]:
# Group the df by Country and count the number of regions for each
group_df = third_climate_df.groupby('Country')['Region'].nunique().reset_index()
group_df

Unnamed: 0,Country,Region
0,Argentina,4
1,Australia,4
2,Brazil,4
3,Canada,4
4,China,4
5,France,4
6,India,4
7,Nigeria,4
8,Russia,4
9,USA,4


In [7]:
# Group by country again, but this time show the regions for each instead of counting them.
grouped_df = third_climate_df.groupby("Country")["Region"].unique().reset_index()

# Change to column width of the new df to avoid results being truncated.
pd.set_option('display.max_colwidth', None)
grouped_df

Unnamed: 0,Country,Region
0,Argentina,"[Northeast, Northwest, Pampas, Patagonia]"
1,Australia,"[New South Wales, Victoria, Western Australia, Queensland]"
2,Brazil,"[North, Northeast, Southeast, South]"
3,Canada,"[Prairies, Quebec, Ontario, British Columbia]"
4,China,"[North, East, South, Central]"
5,France,"[Ile-de-France, Grand Est, Nouvelle-Aquitaine, Provence-Alpes-Cote d’Azur]"
6,India,"[West Bengal, Tamil Nadu, Punjab, Maharashtra]"
7,Nigeria,"[North West, South East, North Central, South West]"
8,Russia,"[Northwestern, Siberian, Volga, Central]"
9,USA,"[Midwest, Northeast, South, West]"


Each country has 4 regions, however there are only 34 unique regions (instead of 40). There is some overlap between regions (e.g. 'South' China and 'South' USA). Now let's find out how many crops are in this dataset.

In [8]:
# Find unique crops
unique_crops = third_climate_df['Crop_Type'].unique()
length_unique_crops = len(unique_crops)

print(f"There are {length_unique_crops} unique crops in this dataset. They are: {unique_crops}")

There are 10 unique crops in this dataset. They are: ['Corn' 'Wheat' 'Coffee' 'Sugarcane' 'Fruits' 'Rice' 'Barley' 'Vegetables'
 'Soybeans' 'Cotton']


We're trying to find a primary key to the data, or at least some more information about what each column represents. It's possible that there is only one crop, per year, per country's region. This would be a unique entry and qualify as a composite key. A new grouped df might help us get a better understanding.

In [9]:
# Break it down by crop type, using size to count the indexes and unstack to reshape the data
crop_type_breakdown = third_climate_df.groupby(['Country', 'Region', 'Year', 'Crop_Type']).size().unstack(fill_value=0)
crop_type_breakdown

Unnamed: 0_level_0,Unnamed: 1_level_0,Crop_Type,Barley,Coffee,Corn,Cotton,Fruits,Rice,Soybeans,Sugarcane,Vegetables,Wheat
Country,Region,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Argentina,Northeast,1990,1,1,3,1,2,1,0,1,1,0
Argentina,Northeast,1991,0,2,1,1,2,3,2,1,2,0
Argentina,Northeast,1992,1,0,1,0,2,1,2,0,0,1
Argentina,Northeast,1993,0,1,0,2,0,0,0,0,0,1
Argentina,Northeast,1994,2,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
USA,West,2020,0,0,2,1,1,0,0,1,0,0
USA,West,2021,2,0,1,2,3,1,1,1,2,0
USA,West,2022,0,0,0,0,0,2,0,0,0,0
USA,West,2023,0,0,2,1,1,2,0,2,1,1


This pivot table is heavily trunacted here (it's a large table), however we can see even from this that there are multiple entries for crop per year per region (e.g. Corn appears 3 times in Northeast Argentina in 1990, among other such examples). 
Therefore this dataset does not have a clear primary or composite key. It's not clear exactly what each row represents, aside from perhaps a log entry of harvests. The metadata does not elaborate on this except to say: 

"To assemble this dataset, data was collected from multiple reputable sources, focusing on recent and historical records of temperature, precipitation, and crop yields. The process involved extracting data from agricultural reports, climate monitoring stations, and regional agricultural surveys. Each data point was meticulously reviewed to ensure accuracy and consistency. The dataset was then cleaned and organized to provide a clear and accessible overview of how varying climate conditions influence agricultural productivity across different regions and crop types."

                                                            --- Nick's section begins ----
Create a chloropleth animation of average CO2 emissions over time by country.

In [10]:
# Calculate average CO2 emissions per country per year
climate_impact_df_avg = third_climate_df.groupby(['Country', 'Year'])['CO2_Emissions_MT'].mean().reset_index()

# Sort the DataFrame by Year to ensure sequential display
climate_impact_df_avg = climate_impact_df_avg.sort_values('Year')

# Calculate the overall min and max values for CO2 emissions
min_emissions = climate_impact_df_avg['CO2_Emissions_MT'].min()
max_emissions = climate_impact_df_avg['CO2_Emissions_MT'].max()

# Create a choropleth plot with time series and fixed color scale range
fig = px.choropleth(
    climate_impact_df_avg,
    locations='Country', 
    locationmode='country names',
    color='CO2_Emissions_MT',
    hover_name='Country',
    animation_frame='Year',
    color_continuous_scale='Viridis_r',
    range_color=[min_emissions, max_emissions],  # Set fixed range for color scale
    projection='natural earth',
    title='Average CO2 Emissions Over Time by Country'
)

# Update layout for better appearance
fig.update_layout(
    width=1200,
    height=800,
    coloraxis_colorbar=dict(
        title="Average CO2 Emissions (Mt)",
    ),
    updatemenus=[{
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 500, 'redraw': True}, 'fromcurrent': True}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': True}, 'mode': 'immediate', 'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }]
)

# Add slider
fig.update_layout(
    sliders=[{
        'active': 0,
        'yanchor': 'top',
        'xanchor': 'left',
        'currentvalue': {
            'font': {'size': 20},
            'prefix': 'Year:',
            'visible': True,
            'xanchor': 'right'
        },
        'transition': {'duration': 300, 'easing': 'cubic-in-out'},
        'pad': {'b': 10, 't': 50},
        'len': 0.9,
        'x': 0.1,
        'y': 0,
        'steps': [
            {
                'args': [[f'{year}'], {
                    'frame': {'duration': 300, 'redraw': True},
                    'mode': 'immediate',
                    'transition': {'duration': 300}
                }],
                'label': f'{year}',
                'method': 'animate'
            } for year in climate_impact_df_avg['Year'].unique()
        ]
    }]
)

# Show the plot
fig.show()

The animation is working properly, hoewever there are no clear trends. Maybe looking a a line graph could help explain the visuals.

In [11]:
# Calculate average CO2 emissions per country per year
climate_impact_df_avg = third_climate_df.groupby(['Country', 'Year'])['CO2_Emissions_MT'].mean().reset_index()

# Sort the DataFrame by Year to ensure sequential display
climate_impact_df_avg = climate_impact_df_avg.sort_values('Year')

# Create a line chart with CO2 emissions over time
fig = px.line(
    climate_impact_df_avg,
    x='Year', 
    y='CO2_Emissions_MT',
    color='Country',  # Line color by country
    title='Average CO2 Emissions by Country Over Time',
    labels={'CO2_Emissions_MT': 'CO2 Emissions (Mt)', 'Year': 'Year'}
)

# Add a dropdown menu to filter by country
fig.update_layout(
    updatemenus=[
        {
            'buttons': [
                {
                    'args': [{'visible': [country == selected_country for country in climate_impact_df_avg['Country'].unique()]}],
                    'label': selected_country,
                    'method': 'update'
                }
                for selected_country in climate_impact_df_avg['Country'].unique()
            ],
            'direction': 'down',
            'showactive': True,
            'x': 0.1,
            'xanchor': 'left',
            'y': 1.15,
            'yanchor': 'top'
        }
    ]
)

# Show the plot
fig.show()


We can tell from a visual inspection that there is at least no obvious trend in the data. I would like to try comparing this to a more robust dataset. The Our World in Data CO2 emissions dataset (https://ourworldindata.org/co2-emissions) appears to have more data points and more detailed metadata.

We can import the CSV file, create a pandas DataFrame, then use that df to create import the CSV to our database.

In [12]:
# Create a df using pandas to read the CSV
owid_df = pd.read_csv('Resources\owid-co2-data.csv')

Use the pandas DataFrame to help importing the dataset into our database.

In [13]:
# Create variables for the engine connection
dbname = "team6_project3_db"
host = "database-1.croamw4iqxpi.us-east-2.rds.amazonaws.com"
port = "5432" 

# Create a connection to the database
engine = create_engine(f'postgresql://{post_username}:{post_password}@{host}:{port}/{dbname}')

# Import the DataFrame to the PostgreSQL database using the established connection
owid_df.to_sql('owid_co2_data', engine, if_exists='append', index=False)

PendingRollbackError: Can't reconnect until invalid transaction is rolled back.  Please rollback() fully before proceeding (Background on this error at: https://sqlalche.me/e/20/8s2b)

The number '333' above signifies how many rows were successfully imported to the database. However the number of rows in the data is much larger (47,416). Upon inspection all the rows appear to be available in the database in pgAdmin. The number '333' may be referrring to the last chunk of data that pandas processed. `pandas.to_sql` will sometimes break large DataFrames into smaller chunks for efficiency during this process. This appears to be what happened here.

In [14]:
# Quick view of owid_df for reference
owid_df.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,,,,,,,
1,Afghanistan,1851,AFG,3767956.0,,,,,,,...,,0.157,0.0,0.0,0.0,0.0,,,,
2,Afghanistan,1852,AFG,3783940.0,,,,,,,...,,0.156,0.0,0.0,0.0,0.0,,,,
3,Afghanistan,1853,AFG,3800954.0,,,,,,,...,,0.156,0.0,0.0,0.0,0.0,,,,
4,Afghanistan,1854,AFG,3818038.0,,,,,,,...,,0.155,0.0,0.0,0.0,0.0,,,,


Now that the data is stored in the database we can start using it. The DataFrame has many columns and many null values. For the purpose of comparing C02 emissions we're only interested in year, country, and C02 emissions columns. We can create a reduced df and try to eliminate null values.

In [15]:
# Create a new df with only the relevant columns
reduced_df = owid_df[['country', 'year', 'co2']]
reduced_df.head()

Unnamed: 0,country,year,co2
0,Afghanistan,1850,
1,Afghanistan,1851,
2,Afghanistan,1852,
3,Afghanistan,1853,
4,Afghanistan,1854,


In [16]:
# There are a lot of NaN values, I could try to eliminate those
reduced_df = reduced_df.dropna(subset=['co2'])

reduced_df['country'].unique()

array(['Afghanistan', 'Africa', 'Africa (GCP)', 'Albania', 'Algeria',
       'Andorra', 'Angola', 'Anguilla', 'Antarctica',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Asia',
       'Asia (GCP)', 'Asia (excl. China and India)', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Republic', 'Central America (GCP)', 'Chad',
       'Chile', 'China', 'Christmas Island', 'Colombia', 'Comoros',
       'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Domini

All of the 'countries' in the country column are not strictly countries; there are groups of countries and continents in the country column that could skew the data. Using pycountry we can filter out the non-countries to hopefully help.

In [17]:
# Create a set of valid countries from the pycountry library
valid_countries = {country.name for country in pycountry.countries}

cleaned_df = reduced_df[reduced_df['country'].isin(valid_countries)].copy()

cleaned_df['country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China',
       'Christmas Island', 'Colombia', 'Comoros', 'Congo', 'Cook Islands',
       'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Faroe Islands', 'Fiji', 'Finland',
       'France', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana', 'Greece', 'Greenland', 'Grenada', '

Try the animations now that we have a cleaned_df with valid countries.

In [21]:
# Sort the DataFrame by 'year' to ensure proper animation order
cleaned_df = cleaned_df.sort_values('year')

# Create the choropleth map with animation
fig = px.choropleth(
    cleaned_df,
    locations='country',
    locationmode='country names',
    color='co2',
    hover_name='country',
    animation_frame='year',
    color_continuous_scale='Viridis_r',
    projection='natural earth',
    title='CO2 Emissions by Country Over Time (OWID Data)',
)

# Update layout for better appearance and adjust animation speed
fig.update_layout(
    width=1000,
    height=600,
    coloraxis_colorbar=dict(
        title="CO2 Emissions",
        ticksuffix=" Mt",  # Correct unit based on your context
        tickformat=",.0f"
    ),
    updatemenus=[
        {
            'type': 'buttons',
            'buttons': [
                {
                    'label': 'Play',
                    'method': 'animate',
                    'args': [None, {
                        'frame': {'duration': 100},  # Adjust duration here (in milliseconds)
                        'mode': 'immediate',
                        'transition': {'duration': 300}
                    }]
                },
                {
                    'label': 'Pause',
                    'method': 'animate',
                    'args': [[None], {
                        'frame': {'duration': 0, 'redraw': False},
                        'mode': 'immediate',
                        'transition': {'duration': 0}
                    }]
                }
            ],
            'showactive': False,
            'x': 0.1,
            'y': 1.1
        }
    ]
)

# Show the figure
fig.show()


This plot essentially shows mostly the US and China pushing the global limits of C02 emissions. Other countries can not keep up! It's a good visualization to view this almost competition between the two, but not so much for analyzing individual data. 

A line chart for each individual country can show their respective CO2 levels.

In [22]:
# Create a line chart with CO2 emissions over time
fig = px.line(
    cleaned_df,
    x='year', 
    y='co2',
    color='country',  # Line color by country
    title='Average CO2 Emissions by Country Over Time',
    labels={'co2': 'CO2 Emissions (Mt)', 'year': 'Year'}
)

# Show the plot
fig.show()
