In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

In [2]:
# Load the raw population-by-country CSV; the path is relative to the
# directory the notebook kernel was started in.
population_file = "population_by_country_2020.csv"
population_df = pd.read_csv(filepath_or_buffer=population_file)

# Preview the first five rows to check the columns parsed as expected
population_df.head(n=5)

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,China,1438207241,0.39%,5540090,153,9388211,-348399.0,1.7,38,61%,18.47%
1,India,1377233523,0.99%,13586631,464,2973190,-532687.0,2.2,28,35%,17.70%
2,United States,330610570,0.59%,1937734,36,9147420,954806.0,1.8,38,83%,4.25%
3,Indonesia,272931713,1.07%,2898047,151,1811570,-98955.0,2.3,30,56%,3.51%
4,Pakistan,219992900,2.00%,4327022,287,770880,-233379.0,3.6,23,35%,2.83%


In [3]:
# Map each source column we keep to its snake_case replacement header.
# The mapping is the single source of truth for both the selection and the rename.
column_renames = {
    "Country (or dependency)": "country",
    "Population (2020)": "population",
    "Density (P/Km²)": "density_kma_squared",
    "Urban Pop %": "urban_population",
}

# Columns to keep, in the same order as the mapping above
population_cols = list(column_renames)

# Take an independent copy of just those columns, then apply the new headers
population_transformed = (
    population_df[population_cols]
    .copy()
    .rename(columns=column_renames)
)
                                                        
                                                        
                                                           



In [4]:
# Preview the first rows of the filtered, renamed frame
population_transformed.head()

Unnamed: 0,country,population,density_kma_squared,urban_population
0,China,1438207241,153,61%
1,India,1377233523,464,35%
2,United States,330610570,36,83%
3,Indonesia,272931713,151,56%
4,Pakistan,219992900,287,35%


In [5]:
# Clean the data: keep the first row seen for each country, then use the
# country name as the index. Written as a chain that rebinds the name rather
# than mutating the frame in place.
population_transformed = (
    population_transformed
    .drop_duplicates(subset="country")
    .set_index("country")
)

population_transformed.head()

Unnamed: 0_level_0,population,density_kma_squared,urban_population
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,1438207241,153,61%
India,1377233523,464,35%
United States,330610570,36,83%
Indonesia,272931713,151,56%
Pakistan,219992900,287,35%


In [6]:
# Inspect column dtypes before loading into the database
population_transformed.dtypes

population              int64
density_kma_squared     int64
urban_population       object
dtype: object

In [7]:
# Create database connection
# The password is read from the PGPASSWORD environment variable so a real
# credential never has to be typed into (or committed with) the notebook.
# The "xxx" fallback is the original placeholder, kept only so the cell
# behaves as before when the variable is unset.
db_password = os.environ.get("PGPASSWORD", "xxx")

# URL-encode the password: characters such as "@" or "/" would otherwise be
# parsed as part of the URL structure and break the connection string.
connection_string = f"postgres:{quote_plus(db_password)}@localhost:5432/country_db"
engine = create_engine(f'postgresql://{connection_string}')

In [8]:
# Confirm tables
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API lists the table names on both 1.x and 2.x.
inspect(engine).get_table_names()

['country_population']

In [9]:
# Load the cleaned frame into the existing country_population table, writing
# the country index as a column.
# NOTE(review): if_exists='append' inserts into the existing table, so
# re-running this cell inserts the same rows again (or fails if the table
# enforces a key on country — the table schema is not visible here).
population_transformed.to_sql(
    name='country_population',
    con=engine,
    if_exists='append',
    index=True,
)