Area vs Total

In [16]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt

# Area vs Total: load the emissions file with Spark, total the `emissions`
# column per `area`, and chart the totals from largest to smallest.

# Reuse the running Spark session when there is one; otherwise create it.
spark = SparkSession.builder.appName("CO2 Emissions Analysis").getOrCreate()

# Column layout applied to the file, in file order. Every column is nullable.
schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("area", T.StringType()),
        ("sector", T.StringType()),
        ("category1", T.StringType()),
        ("category2", T.StringType()),
        ("unit", T.StringType()),
        ("unit2", T.StringType()),
        ("measure", T.StringType()),
        ("year", T.IntegerType()),
        ("emissions", T.FloatType()),
    ]
])

# NOTE(review): header=False makes Spark treat the first line of the file as
# data, and the file is parsed as tab-separated despite its .csv name --
# confirm both settings match the actual file.
df_spark = spark.read.csv(
    "greenhouse-gas-emissions-by-region-industry-and-household-year-ended-2022.csv",
    sep="\t",
    header=False,
    schema=schema,
)

# One row per area holding the sum of `emissions` over every input row for
# that area.
# NOTE(review): only `area` is grouped on, so the sum also spans all sectors,
# categories and measures, not just years -- confirm nothing is double-counted.
emissions_sum = F.sum("emissions").alias("total_emissions")
total_emissions_spark = df_spark.groupBy("area").agg(emissions_sum)

# The aggregate is collected to the driver as a pandas DataFrame for plotting.
total_emissions_pd = total_emissions_spark.toPandas()

# Series indexed by area, ordered from the largest total to the smallest.
totals_by_area = total_emissions_pd.set_index("area")["total_emissions"]
totals_by_area = totals_by_area.sort_values(ascending=False)

# Bar chart drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
totals_by_area.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Total CO2 Emissions by Area')
ax.set_xlabel('Area')
ax.set_ylabel('Total CO2 Emissions (Kilotonnes)')
plt.xticks(rotation=45, ha='right')  # slanted labels, right-aligned to their tick
fig.tight_layout()
plt.show()



Year vs Total

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt

# Year vs Total: load the emissions file with Spark, total the `emissions`
# column per `year`, and chart the totals in chronological order.

# Reuse the running Spark session when there is one; otherwise create it.
spark = SparkSession.builder.appName("CO2 Emissions Analysis").getOrCreate()

# Column layout applied to the file, in file order. Every column is nullable.
schema = T.StructType([
    T.StructField("area", T.StringType(), True),
    T.StructField("sector", T.StringType(), True),
    T.StructField("category1", T.StringType(), True),
    T.StructField("category2", T.StringType(), True),
    T.StructField("unit", T.StringType(), True),
    T.StructField("unit2", T.StringType(), True),
    T.StructField("measure", T.StringType(), True),
    T.StructField("year", T.IntegerType(), True),
    T.StructField("emissions", T.FloatType(), True)
])

# NOTE(review): header=False makes Spark treat the first line of the file as
# data, and the file is parsed as tab-separated despite its .csv name --
# confirm both settings match the actual file.
df_spark = spark.read.csv("greenhouse-gas-emissions-by-region-industry-and-household-year-ended-2022.csv", sep="\t", header=False, schema=schema)

# One row per year holding the sum of `emissions` over every input row for
# that year.
# NOTE(review): only `year` is grouped on, so the sum also spans all areas,
# sectors, categories and measures -- confirm nothing is double-counted.
total_emissions_spark = df_spark.groupBy("year").agg(F.sum("emissions").alias("total_emissions"))

# The aggregate is collected to the driver as a pandas DataFrame for plotting.
total_emissions_pd = total_emissions_spark.toPandas()

# Plot the emissions for each year.
# groupBy() gives no guarantee about row order, so the series is sorted by its
# year index first; otherwise the bars can appear out of chronological order.
plt.figure(figsize=(10, 6))
total_emissions_pd.set_index("year")["total_emissions"].sort_index().plot(kind='bar', color='skyblue')
plt.title('Total CO2 Emissions by Year')
plt.xlabel('Year')
plt.ylabel('Total CO2 Emissions (Kilotonnes)')
plt.xticks(rotation=45, ha='right')  # slanted labels, right-aligned to their tick
plt.tight_layout()
plt.show()
