In [None]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
import matplotlib.pyplot as plt
import seaborn as sns

def download_and_unzip_kaggle_dataset(api, dataset, download_path):
    """Download a Kaggle dataset through ``api`` and extract it in place.

    Args:
        api: Object exposing ``dataset_download_files`` (``main`` passes an
            authenticated ``KaggleApi`` instance).
        dataset: Dataset identifier forwarded to the API; ``main`` uses
            ``'owner/dataset-name'`` slugs.
        download_path: Target directory, forwarded as ``path``.

    Returns:
        Whatever ``api.dataset_download_files`` returns, passed through
        unchanged.
    """
    # unzip=True asks the API to extract the archive after downloading it.
    result = api.dataset_download_files(dataset, unzip=True, path=download_path)
    print(f"Downloaded and unzipped dataset {dataset}")
    return result

def read_csv(csv_file_path):
    """Load a CSV into a DataFrame using pandas' default parsing options.

    Args:
        csv_file_path: Anything ``pd.read_csv`` accepts as its first argument
            (``main`` passes filesystem paths).

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(csv_file_path)


def main():
    """Make sure both Kaggle datasets are on disk, then load and show them.

    The network is only touched when one of the expected CSV files is
    missing from the data directory, so re-running the notebook does not
    re-download the archives or require Kaggle credentials once the data
    is cached.

    Side effects:
        Sets the module-level names ``emission`` and ``temperature`` to the
        loaded DataFrames; the cells below read those names.

    Returns:
        tuple: ``(emission, temperature)`` -- the same two DataFrames, for
        callers that prefer not to rely on the globals.
    """
    global emission, temperature

    # Datasets to fetch and the directory they are extracted into
    datasets = [
        'alessandrolobello/agri-food-co2-emission-dataset-forecasting-ml',
        'berkeleyearth/climate-change-earth-surface-temperature-data'
    ]
    data_dir = '../data'

    # CSV files this notebook actually reads from the extracted archives
    csv_files = {
        'agri': os.path.join(data_dir, 'Agrofood_co2_emission.csv'),
        'temp': os.path.join(data_dir, 'GlobalLandTemperaturesByCountry.csv')
    }

    # Authenticate and download only when at least one expected file is absent.
    if not all(os.path.exists(path) for path in csv_files.values()):
        api = KaggleApi()
        api.authenticate()
        for dataset in datasets:
            download_and_unzip_kaggle_dataset(api, dataset, data_dir)

    emission = read_csv(csv_files['agri'])
    temperature = read_csv(csv_files['temp'])

    # `display` exists without an import only under IPython/Jupyter; fall back
    # to print so the __main__ guard below also works as a plain script.
    try:
        show = display
    except NameError:
        show = print

    show(emission)
    show(temperature)

    return emission, temperature

if __name__ == "__main__":
    main()

In [None]:
# Build merged_df: one row per (Year, Country) with the mean temperature and
# the summed agricultural emissions for the analysis window.
YEAR_FIRST, YEAR_LAST = '2000', '2010'  # inclusive bounds, as year strings

# Derive new frames instead of editing the loaded ones in place, so the frames
# produced by main() stay as loaded and this cell can be re-run safely.
# 'Year' is stored as a string in both frames so it can act as the merge key.
co2_emission_df = emission.assign(Year=emission['Year'].astype(str))

# Parse 'dt' as datetime, then extract the calendar year as a string
temperature_df = temperature.assign(
    dt=lambda frame: pd.to_datetime(frame['dt']),
    Year=lambda frame: frame['dt'].dt.year.astype(str),
)

# Both frames are filtered with the same bare-year bounds. 'Year' holds
# strings, so the comparison is lexicographic: a bound written as a full date
# such as '2000-01-01' sorts AFTER '2000' and would silently drop year 2000.
co2_emission_df_filtered = co2_emission_df[
    co2_emission_df['Year'].between(YEAR_FIRST, YEAR_LAST)
]
temperature_df_filtered = temperature_df[
    temperature_df['Year'].between(YEAR_FIRST, YEAR_LAST)
]

# Aggregate the temperature readings to a mean per year and country
temperature_agg = (
    temperature_df_filtered
    .groupby(['Year', 'Country'])['AverageTemperature']
    .mean()
    .reset_index()
)

# Aggregate the emission data to a total per year and area
co2_emission_agg = (
    co2_emission_df_filtered
    .groupby(['Year', 'Area'])['total_emission']
    .sum()
    .reset_index()
)

# Inner join: only rows whose 'Country' matches an 'Area' exactly (for the same
# year) survive. Resulting columns: Year, Country, AverageTemperature, Area,
# total_emission.
merged_df = pd.merge(
    temperature_agg,
    co2_emission_agg,
    left_on=['Year', 'Country'],
    right_on=['Year', 'Area'],
    how='inner',
)

# Scatter of emissions against temperature, one point per row of merged_df
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=merged_df, x='total_emission', y='AverageTemperature', ax=ax)
ax.set_title('Correlation between Agricultural CO2 Emissions and Global Temperature (2000-2010)')
# NOTE(review): confirm the unit of 'total_emission' against the dataset's documentation.
ax.set_xlabel('CO2 Emission (in metric tons)')
ax.set_ylabel('Average Temperature (°C)')
plt.show()

# Correlation coefficient between the two columns, read from the 2x2
# correlation matrix by label rather than by position
corr_matrix = merged_df[['total_emission', 'AverageTemperature']].corr()
correlation = corr_matrix.loc['total_emission', 'AverageTemperature']
print(f"Correlation coefficient between CO2 Emission and Average Temperature: {correlation:.2f}")

In [None]:
def plot_metric_by_country(data, column, title, ylabel):
    """Draw one line per country of ``column`` against 'Year' and show it.

    Args:
        data: DataFrame with 'Year', 'Country' and ``column`` columns.
        column: Name of the column plotted on the y-axis.
        title: Figure title.
        ylabel: Label for the y-axis.
    """
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.lineplot(data=data, x='Year', y=column, hue='Country', marker='o', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Year')
    ax.set_ylabel(ylabel)
    # Anchor the legend outside the axes, to the right of the plot area
    ax.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
    fig.tight_layout()
    plt.show()


# Plotting CO2 Emissions over time
plot_metric_by_country(
    merged_df,
    'total_emission',
    'Agricultural CO2 Emissions Over Time (2000-2010)',
    'CO2 Emission (in metric tons)',
)

# Plotting Average Temperature over time
plot_metric_by_country(
    merged_df,
    'AverageTemperature',
    'Average Temperature Over Time (2000-2010)',
    'Average Temperature (°C)',
)

# Display the correlation coefficient
correlation = merged_df[['total_emission', 'AverageTemperature']].corr().iloc[0, 1]
print(f"Correlation coefficient between CO2 Emission and Average Temperature: {correlation:.2f}")

In [None]:
# Dual-axis view of emissions and temperature per year.
# merged_df holds one row per (Year, Country); drawn directly as a single line
# it would join every country's value within a year into one zig-zag. Collapse
# it to one row per year first: emissions summed, temperature averaged.
# NOTE(review): the temperature is an unweighted mean of the per-country means
# over the countries present in merged_df -- confirm that is the intended
# "global" figure.
yearly_df = (
    merged_df
    .groupby('Year', as_index=False)
    .agg({'total_emission': 'sum', 'AverageTemperature': 'mean'})
)

fig, ax1 = plt.subplots(figsize=(14, 7))

color = 'tab:red'
ax1.set_xlabel('Year')
ax1.set_ylabel('CO2 Emission (in metric tons)', color=color)
ax1.plot(yearly_df['Year'], yearly_df['total_emission'], color=color, marker='o', label='CO2 Emission')
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
color = 'tab:blue'
ax2.set_ylabel('Average Temperature (°C)', color=color)  # x-label is already set on ax1
ax2.plot(yearly_df['Year'], yearly_df['AverageTemperature'], color=color, marker='o', label='Average Temperature')
ax2.tick_params(axis='y', labelcolor=color)

# Set the title and legend before tight_layout so the layout accounts for them;
# tight_layout also keeps the right-hand y-label from being clipped.
ax1.set_title('Agricultural CO2 Emissions and Global Temperature (2000-2010)')
fig.legend(loc="upper left", bbox_to_anchor=(0.1, 0.9))
fig.tight_layout()
plt.show()

# Correlation coefficient, computed on the per-country rows of merged_df
correlation = merged_df[['total_emission', 'AverageTemperature']].corr().iloc[0, 1]
print(f"Correlation coefficient between CO2 Emission and Average Temperature: {correlation:.2f}")