# Global Climate Trends

## Analysis


In [12]:
# Dependencies and Setup
import pandas as pd
import os
import csv
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

In [19]:
# Folder paths, relative to the current working directory:
# CSV inputs are read from the first, results are written to the second.
input_folder, output_folder = os.path.join('input'), os.path.join('output')

In [20]:
def process_file(file_name, column_name):
    """Reshape one wide CSV from ``input_folder`` into long format.

    The file must contain a 'Country' column; every column after the first
    is treated as one period whose header becomes the 'Year' value.

    Args:
        file_name: Name of the CSV file inside ``input_folder``.
        column_name: Name given to the value column in the result.

    Returns:
        A DataFrame with columns 'Year', 'Country' and ``column_name``,
        holding one row per (period column, input row) pair.
    """
    wide_df = pd.read_csv(os.path.join(input_folder, file_name))

    long_pieces = []
    for period in wide_df.columns[1:]:
        # Two-column slice: the country plus this period's values
        piece = wide_df.loc[:, ['Country', period]].copy()

        # The header itself is the period label; place it as the first column
        piece.insert(0, 'Year', period)

        # Give the value column its final name and keep the piece
        long_pieces.append(piece.rename(columns={period: column_name}))

    return pd.concat(long_pieces, ignore_index=True)

In [21]:
def merge_dfs(dfs, column_list):
    """Outer-merge a sequence of DataFrames on shared key columns.

    Args:
        dfs: Non-empty sequence of DataFrames; the first one seeds the result.
        column_list: Column name(s) used as the merge key for every step.

    Returns:
        The first DataFrame successively outer-merged with each remaining
        one (the first DataFrame itself when ``dfs`` has a single element).
    """
    combined = dfs[0]

    # Fold every remaining frame into the running result, keeping all keys
    for other in dfs[1:]:
        combined = combined.merge(other, on=column_list, how='outer')

    return combined

In [22]:
# Reshape each wide metric file to long format; every pair is
# (CSV file name, name of the resulting value column).
dfs = [
    process_file(csv_name, value_label)
    for csv_name, value_label in (
        ('primary_energy_consumption.csv', 'Primary Energy Consumption'),
        ('energy_consumption_per_capita.csv', 'Energy Consumption per Capita'),
        ('co2_emissions.csv', 'CO2 Emissions'),
    )
]

# Combine the three metrics into one frame keyed by country and year
merged_dfs = merge_dfs(dfs, ['Country', 'Year'])
merged_dfs

FileNotFoundError: [Errno 2] No such file or directory: 'input/primary_energy_consumption.csv'

In [None]:
# Load the per-source energy consumption table from the input folder
energy_source_df = pd.read_csv(
    filepath_or_buffer=os.path.join(input_folder, 'energy_consumption_by_source.csv'),
)
energy_source_df

In [None]:
# Reshape the per-source table to long format: one frame per product, each
# with columns 'Year', 'Country' and 'Energy Source <product>', then merge
# all of them on country and year.
df_energy_source_array = []

# Loop over unique values in the column "Product"
for product in energy_source_df['Product'].unique():

    # Rows belonging to this product. The filter does not depend on the
    # year column, so it is computed once per product rather than once
    # per (product, year) pair.
    filtered_df = energy_source_df[energy_source_df['Product'] == product]

    # Temporary per-year frames for this product
    df_array = []

    # Every column after the first three is treated as one period whose
    # header becomes the 'Year' value
    for column in energy_source_df.columns[3:]:

        # Country plus this period's values
        temp_df = filtered_df[['Country', column]].copy()

        # Add the period label as the first column
        temp_df.insert(0, 'Year', column)

        # Rename the value column after the product
        temp_df.rename(columns={column: f'Energy Source {product}'}, inplace=True)

        df_array.append(temp_df)

    # Stack the periods of this product into one long frame
    df_merged = pd.concat(df_array, ignore_index=True)

    df_energy_source_array.append(df_merged)

# Merge the per-product frames into one frame keyed by country and year
merged_energy_source_df = merge_dfs(df_energy_source_array, ['Country', 'Year'])
merged_energy_source_df


In [None]:
# Find the values of a column that are present in every given dataframe
def find_overlap(dfs, column):
    """Return the set of values of ``column`` shared by all DataFrames.

    Args:
        dfs: Non-empty sequence of DataFrames that all contain ``column``.
        column: Name of the column whose values are compared.

    Returns:
        A set with the values that occur in ``column`` of every DataFrame.
    """
    common = set(dfs[0][column].unique())

    # Narrow the candidates down with each further dataframe
    for frame in dfs[1:]:
        common &= set(frame[column].unique())

    return common


In [None]:
# Keep only the rows whose value in a column belongs to a given collection
def filter_df(df, column, values):
    """Return the rows of ``df`` whose ``column`` value is in ``values``.

    Args:
        df: DataFrame to filter.
        column: Name of the column to test.
        values: Collection of accepted values.

    Returns:
        The matching rows, with their original index labels.
    """
    keep_mask = df[column].isin(values)
    return df.loc[keep_mask]

In [18]:
# Years and countries that are present in both dataframes
years = find_overlap([merged_dfs, merged_energy_source_df], 'Year')
countries = find_overlap([merged_dfs, merged_energy_source_df], 'Country')

# Restrict both dataframes to the shared years and countries
filtered_df = filter_df(filter_df(merged_dfs, 'Year', years), 'Country', countries)
filtered_energy_source_df = filter_df(filter_df(merged_energy_source_df, 'Year', years), 'Country', countries)

# Merge the dataframes based on year and country
final_df = pd.merge(filtered_df, filtered_energy_source_df, on=['Country', 'Year'], how='outer')

# to_csv does not create missing directories, so make sure the output
# folder exists before writing
os.makedirs(output_folder, exist_ok=True)

# Write the final dataframe to a csv file
final_df.to_csv(os.path.join(output_folder, 'final_data.csv'), index=False)

final_df


NameError: name 'find_overlap' is not defined

## What are the projected carbon footprints of countries in the next 10-20 years, considering population growth and the distribution of energy sources?

In [None]:
# Number of rows per country (value_counts leaves out missing countries)
country_counts = final_df['Country'].value_counts()

# Print each unique country; only the labels are needed, not the counts
for country in country_counts.index:
    print(country)

# Report how many distinct countries there are
total_countries = len(country_counts.index)
print(f'Total number of unique countries: {total_countries}')

In [None]:
# Average (mean across countries) energy consumption per capita for each year.
# The values are coerced to numbers first, as a later cell in this notebook
# does for the same column; entries that cannot be parsed become NaN and are
# skipped by mean(). final_df itself is not modified here.
average_energy_consumption_per_capita = (
    pd.to_numeric(final_df['Energy Consumption per Capita'], errors='coerce')
    .groupby(final_df['Year'])
    .mean()
)

# Print a header followed by the per-year averages
# (the previous version interpolated an undefined name and raised NameError)
print('Year | Average Energy Consumption per Capita')
print(average_energy_consumption_per_capita)

In [None]:
# Year-over-year percentage change of the average energy use per capita:
# (current - previous) / previous * 100, where "previous" is the series
# shifted down by one row. The first row has no predecessor and yields NaN.
average_energy_consumption_per_capita_shifted = average_energy_consumption_per_capita.shift(1)
percentage_change = (
    average_energy_consumption_per_capita
    .sub(average_energy_consumption_per_capita_shifted)
    .div(average_energy_consumption_per_capita_shifted)
    .mul(100)
)

# One-column DataFrame holding the change formatted as a percentage string
average_energy_df = percentage_change.map("{:.2f}%".format).to_frame(
    'Average Annual Percentage Change of Energy Use per Capita'
)

# Print the average energy consumption per capita percentage change over time
print(average_energy_df)

In [None]:
# Correlation between CO2 emissions and primary energy consumption

# Coerce both columns to numbers in place; unparseable entries become NaN
for column_to_convert in ('CO2 Emissions', 'Primary Energy Consumption'):
    final_df[column_to_convert] = pd.to_numeric(final_df[column_to_convert], errors='coerce')

# Series used for the comparison (also reused by the plotting cell)
co2_emissions, energy_consumption = final_df['CO2 Emissions'], final_df['Primary Energy Consumption']

# Correlation as computed by Series.corr with its default settings
consumption_emissions_corr = co2_emissions.corr(energy_consumption)
print(f"Correlation between CO2 emissions and energy consumption: {consumption_emissions_corr}")

CO2 emissions and energy consumption have a very strong positive correlation. This indicates that there is a strong linear relationship between CO2 emissions and energy consumption. In practical terms, this means that as energy consumption increases, CO2 emissions tend to increase as well.

In [None]:
# Scatter plot with a fitted regression line for energy consumption vs CO2 emissions
consumption_emissions_corr = co2_emissions.corr(energy_consumption)

# linregress does not skip missing values: a single NaN in either input makes
# every fitted parameter NaN. Keep only the rows where both values are present.
consumption_emissions_pairs = pd.concat(
    [energy_consumption, co2_emissions], axis=1, keys=['x', 'y']
).dropna()

slope, intercept, rvalue, pvalue, stderr = linregress(
    consumption_emissions_pairs['x'], consumption_emissions_pairs['y']
)

plt.scatter(consumption_emissions_pairs['x'], consumption_emissions_pairs['y'], s=2)
plt.plot(
    consumption_emissions_pairs['x'],
    slope * consumption_emissions_pairs['x'] + intercept,
    color='red',
)

plt.xlabel('Energy Consumption (exajoules)')
plt.ylabel('CO2 Emissions (million tonnes of carbon)')
plt.grid(True)

print(f"Correlation between CO2 emissions and energy consumption: {consumption_emissions_corr}")
plt.show()

In [None]:
# Correlation between CO2 emissions and energy consumption per capita

# Coerce both columns to numbers in place; unparseable entries become NaN
for column_to_convert in ('CO2 Emissions', 'Energy Consumption per Capita'):
    final_df[column_to_convert] = pd.to_numeric(final_df[column_to_convert], errors='coerce')

# Series used for the comparison (also reused by the plotting cell)
co2_emissions, energy_consumption_per_capita = (
    final_df['CO2 Emissions'],
    final_df['Energy Consumption per Capita'],
)

# Correlation as computed by Series.corr with its default settings
capita_emissions_corr = co2_emissions.corr(energy_consumption_per_capita)
print(f"Correlation between CO2 emissions and energy consumption per capita: {capita_emissions_corr}")

CO2 emissions and energy consumption per capita have an extremely weak positive correlation.

In [None]:
# Scatter plot with a fitted regression line for energy consumption per capita vs CO2 emissions
capita_emissions_corr = co2_emissions.corr(energy_consumption_per_capita)

# linregress does not skip missing values: a single NaN in either input makes
# every fitted parameter NaN. Keep only the rows where both values are present.
capita_emissions_pairs = pd.concat(
    [energy_consumption_per_capita, co2_emissions], axis=1, keys=['x', 'y']
).dropna()

slope, intercept, rvalue, pvalue, stderr = linregress(
    capita_emissions_pairs['x'], capita_emissions_pairs['y']
)

plt.scatter(capita_emissions_pairs['x'], capita_emissions_pairs['y'], s=1)
plt.plot(
    capita_emissions_pairs['x'],
    slope * capita_emissions_pairs['x'] + intercept,
    color='red',
)

plt.xlabel('Energy Consumption per Capita (gigajoules)')
plt.ylabel('CO2 Emissions (million tonnes of carbon)')
plt.grid(True)

print(f"Correlation between CO2 emissions and energy consumption per capita: {capita_emissions_corr}")
plt.show()

In [None]:
# Mean consumption of each listed energy source for every year
average_energy_per_source_by_year = final_df.groupby('Year').agg(
    dict.fromkeys(
        [
            'Energy Source Nuclear',
            'Energy Source Coal, peat and oil shale',
            'Energy Source Oil products',
            'Energy Source Electricity',
            'Energy Source Natural gas',
        ],
        'mean',
    )
)

# For every year, the source column holding the largest mean value
most_common_energy_source_per_year = average_energy_per_source_by_year.idxmax(axis='columns')

# Show the yearly means followed by the leading source of each year
print("Average Energy Consumption per Each Energy Source by Year:", average_energy_per_source_by_year, sep='\n')
print("\nMost Common Energy Source for Each Year:", most_common_energy_source_per_year, sep='\n')

In [None]:
# Line chart of the yearly mean consumption, one line per energy source
average_energy_per_source_by_year.plot.line(
    marker='o',
    figsize=(10, 6),
    title='Average Energy Consumption by Source Type Over 30 Years',
    grid=True,
)
plt.xlabel('Year')
plt.ylabel('Average Energy Consumption (Gigajoules)')
plt.legend(title='Energy Source')
plt.tight_layout()
plt.show()

In [None]:
# Set the figure size
plt.figure(figsize=(12, 8))

# Stack one bar segment per energy source for every year. Drawing every
# series from zero at the same x position (as before) lets later bars paint
# over earlier ones, so each segment now starts where the previous one ended.
bar_bottom = np.zeros(len(average_energy_per_source_by_year))
for col in average_energy_per_source_by_year.columns:
    # Missing values are drawn as zero-height segments so that they do not
    # turn the running bottom (and all following segments) into NaN
    bar_heights = average_energy_per_source_by_year[col].fillna(0).to_numpy()
    plt.bar(average_energy_per_source_by_year.index, bar_heights, bottom=bar_bottom, label=col)
    bar_bottom = bar_bottom + bar_heights

# Add labels and title
plt.xlabel('Year')
plt.ylabel('Total Energy Consumed (Gigajoules)')
plt.title('Total Energy Consumption by Energy Source per Year')
plt.xticks(average_energy_per_source_by_year.index, fontsize=6)
plt.legend()

# Display the plot
plt.show()

## Does an increase in population impact the distribution of energy sources among specific countries?


In [None]:
# We can answer this question by calculating the percentage of growth in population for each country
# over the years and then analyzing the distribution of energy sources among those countries.

# No population column is loaded in the visible part of this notebook, so the
# population is derived as total consumption / consumption per capita.
# NOTE(review): the two columns may use different units (axis labels elsewhere
# in this notebook say exajoules and gigajoules). That only scales the derived
# population by a constant, which cancels in a percentage change.
# The previous version took the percentage change of the per-capita energy
# column itself, which measures energy-use growth, not population growth.

# Per-country frames, each extended with a 'Population Growth' column
population_df = []

# sort=False keeps the countries in order of first appearance
for _, country_rows in final_df.groupby('Country', sort=False):

    # Order the rows chronologically so that each row is compared with the previous year
    country_df = country_rows.sort_values('Year').copy()

    # Coerce locally; unparseable entries become NaN
    total_consumption = pd.to_numeric(country_df['Primary Energy Consumption'], errors='coerce')
    consumption_per_capita = pd.to_numeric(country_df['Energy Consumption per Capita'], errors='coerce')
    implied_population = total_consumption / consumption_per_capita

    # Year-over-year change in percent, stored as a two-decimal string as before
    country_df['Population Growth'] = (implied_population.pct_change() * 100).map('{:.2f}'.format)

    population_df.append(country_df)

# Concatenate the per-country frames
df_pop_merged = pd.concat(population_df, ignore_index=True)

In [None]:
# Analyse the growth in energy source per country and year
# Columns whose row-to-row growth is calculated (a list of names, despite the "_df" suffix)
energy_sources_df = ['Energy Source Nuclear', 'Energy Source Coal, peat and oil shale', 'Energy Source Oil products', 'Energy Source Electricity', 'Energy Source Natural gas']
for source in energy_sources_df:
    # Compute the change within each country. A plain pct_change over the
    # concatenated frame would compare the first row of every country with
    # the last row of the country before it.
    source_growth = df_pop_merged.groupby('Country')[source].pct_change() * 100

    # Store as a two-decimal string, as before
    df_pop_merged[f'{source} Growth %'] = source_growth.map('{:.2f}'.format)

# Display the DataFrame with the growth percentage for each energy source
df_pop_merged

In [None]:
# Top 10 countries by cumulative primary energy consumption

# Coerce both columns to numbers in place; unparseable entries become NaN
for column_to_convert in ('Primary Energy Consumption', 'Population Growth'):
    df_pop_merged[column_to_convert] = pd.to_numeric(df_pop_merged[column_to_convert], errors='coerce')

# Per-country sums over all rows: total energy consumption and the sum of
# the per-row population growth percentages
total_energy_and_population = df_pop_merged.groupby('Country').agg(
    dict.fromkeys(['Primary Energy Consumption', 'Population Growth'], 'sum')
)

# The ten largest consumption totals, indexed by country
highest_energy_consumption = total_energy_and_population['Primary Energy Consumption'].nlargest(n=10)

# Summed growth of those ten countries, formatted with a trailing '%'
# NOTE(review): the column label says 1990 - 2021; confirm the actual year range of the data
top_population_growth = (
    total_energy_and_population['Population Growth']
    .loc[highest_energy_consumption.index]
    .map('{:.2f}%'.format)
)

# Table of the top 10 energy consumers
highest_energy_consumption_df = pd.DataFrame({
    'Country': highest_energy_consumption.index,
    'Total Energy Consumption': highest_energy_consumption.to_numpy(),
    'Total Population Growth from 1990 - 2021': top_population_growth.to_numpy(),
})

# Display DataFrame
highest_energy_consumption_df

In [None]:
# Correlation between CO2 emissions and population growth

# Coerce both columns to numbers in place; unparseable entries become NaN
for column_to_convert in ('CO2 Emissions', 'Population Growth'):
    df_pop_merged[column_to_convert] = pd.to_numeric(df_pop_merged[column_to_convert], errors='coerce')

# Series used for the comparison (also reused by the plotting cell)
co2_emissions, pop_growth = df_pop_merged['CO2 Emissions'], df_pop_merged['Population Growth']

# Correlation as computed by Series.corr with its default settings
population_emissions_corr = co2_emissions.corr(pop_growth)
print(f"Correlation between CO2 emissions and population growth: {population_emissions_corr}")

CO2 emissions and population growth have a weak positive correlation. While there is a slight tendency for CO2 emissions and population growth to increase together, the relationship is not very strong. 

In [None]:
# Scatter plot with a fitted regression line for population growth vs CO2 emissions
population_emissions_corr = co2_emissions.corr(pop_growth)

# linregress does not skip missing values, and a percentage change can be
# infinite when the previous value is zero. Keep only the rows where both
# values are present and finite, otherwise every fitted parameter is NaN.
population_emissions_pairs = (
    pd.concat([pop_growth, co2_emissions], axis=1, keys=['x', 'y'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

slope, intercept, rvalue, pvalue, stderr = linregress(
    population_emissions_pairs['x'], population_emissions_pairs['y']
)

plt.scatter(population_emissions_pairs['x'], population_emissions_pairs['y'], s=1)
plt.plot(
    population_emissions_pairs['x'],
    slope * population_emissions_pairs['x'] + intercept,
    color='red',
)

# The growth values are percentages (the stray ')' in the old label is removed)
plt.xlabel('Population Growth (%)')
plt.ylabel('CO2 Emissions (million tonnes of carbon)')
plt.grid(True)

print(f"Correlation between CO2 emissions and global population growth: {population_emissions_corr}")
plt.show()

## What is the most common energy source? Has it changed over time?

In [None]:
# Create a stacked bar chart of all energy sources per year

# Keep only the energy source columns (plus 'Year' for grouping): drop the
# aggregate metrics and also 'Country', whose text values have no meaningful
# sum and would otherwise be carried into the yearly totals.
# NOTE: this rebinds filtered_df, which an earlier cell also assigns.
columns_to_exclude = ['Country', 'Primary Energy Consumption', 'Energy Consumption per Capita', 'CO2 Emissions']
filtered_df = final_df.drop(columns=columns_to_exclude)

# Group the data by year and calculate total energy consumption for each energy source
grouped = filtered_df.groupby('Year').sum()

# Create a bar chart to visualize the trends over time
grouped.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.title('Energy Consumption by Source Over Time')
plt.xlabel('Year')
plt.ylabel('Total Energy Consumption')
plt.legend(title='Energy Source')
plt.show()
