## project 2

In [None]:
import pandas as pd
import plotly.express as px
##Loads the two main libraries used in this project:pandas for data cleaning, manipulation, and merging 
##plotly.express for interactive visualizations
##These libraries provide all core functionalities required for reading datasets, transforming them, and producing analytical graphics.

In [11]:
gdp = pd.read_csv("/Users/minxunxie/Desktop/data/gdp-per-capita-maddison-project-database.csv")
co2 = pd.read_csv("/Users/minxunxie/Desktop/data/co-emissions-per-capita.csv")

gdp.head(), co2.head()

##Reads the two raw CSV files into pandas DataFrames.Displays the first few rows of each dataset to confirm successful loading.
##Before cleaning or merging, it is essential to inspect the raw data structure, verify column names, and understand the format.

(        Entity Code  Year  GDP per capita 900793-annotations
 0  Afghanistan  AFG  1950          1156.0                NaN
 1  Afghanistan  AFG  1951          1170.0                NaN
 2  Afghanistan  AFG  1952          1189.0                NaN
 3  Afghanistan  AFG  1953          1240.0                NaN
 4  Afghanistan  AFG  1954          1245.0                NaN,
         Entity Code  Year  Annual CO₂ emissions (per capita)
 0  Afghanistan  AFG  1949                           0.001992
 1  Afghanistan  AFG  1950                           0.010837
 2  Afghanistan  AFG  1951                           0.011625
 3  Afghanistan  AFG  1952                           0.011468
 4  Afghanistan  AFG  1953                           0.013123)

In [12]:
gdp.columns
co2.columns
##Lists all column names of both datasets.


Index(['Entity', 'Code', 'Year', 'Annual CO₂ emissions (per capita)'], dtype='object')

In [13]:
for col in gdp.columns:
    print(repr(col))
##Accurate column names are required for renaming and merging.
##Many columns contain spaces or long names, so inspecting them prevents errors during data cleaning.

'Entity'
'Code'
'Year'
'GDP per capita'
'900793-annotations'


In [None]:
gdp_clean = gdp.rename(columns={
    "Entity": "country",
    "Code": "code",
    "Year": "year",
    "GDP per capita": "gdp_pc"
})[["country", "code", "year", "gdp_pc"]]


co2_clean = co2.rename(columns={
    "Entity": "country",
    "Code": "code",
    "Year": "year",
    "Annual CO₂ emissions (per capita)": "co2_pc"
})[["country", "code", "year", "co2_pc"]]

##Standardizes variable names using short, consistent labels. Selects only the variables needed for the analysis
##Standardized names simplify downstream operations.
##Both datasets must share the same column names (country, code, year) for merging.
##Restricting to relevant columns reduces noise and avoids unnecessary memory usage.

In [None]:
merged = pd.merge(
    gdp_clean,
    co2_clean,
    on=["country", "code", "year"],
    how="inner"
)

##Combines GDP and CO₂ data into a single dataset by matching each country–year observation.
##Uses an inner join to retain only the years where both GDP and CO₂ data are available.

In [15]:

df_2019 = merged[merged["year"] == 2019]
##Extracts data for one representative year (2019).

fig = px.scatter(
    df_2019,
    x="gdp_pc",
    y="co2_pc",
    log_x=True,
    color="country",
    hover_name="country",
    title="GDP per capita vs CO₂ emissions per capita (2019, log scale)"
)

fig.show()
##Creates a scatter plot for 2019.
##Applies a log scale on GDP to spread out low-income countries.
##GDP per capita ranges from a few hundred to over $100,000; using log scale prevents low-income countries from being compressed.

##insight/conclusion:Overall, GDP per capita and CO₂ emissions per capita are positively correlated, indicating that economic development is generally associated with higher energy consumption and carbon intensity. However, the dispersion among high-income countries reveals that policy choices and energy structures can decouple economic growth from emissions. Middle-income countries show the steepest increases in emissions, while low-income countries remain at very low emission levels. The wide variation among countries with similar income levels suggests that GDP alone cannot fully explain carbon emission patterns.