In [10]:
import pandas as pd

In [11]:
# Raw crop-yield table; the CSV was downloaded from https://ourworldindata.org/crop-yields
df_yield_base = pd.read_csv(filepath_or_buffer="crop_yields.csv")

# Preview five randomly chosen rows
df_yield_base.sample(n=5)

Unnamed: 0,country,year,abaca__manila_hemp__raw_yield,agave_fibres__raw__n_e_c__yield,almond_yield,apples_yield,apricots_yield,areca_nuts_yield,artichokes_yield,asparagus_yield,...,potato_yield_gap,rapeseed_yield_gap,rice_yield_gap,rye_yield_gap,sorghum_yield_gap,soybean_yield_gap,sugarbeet_yield_gap,sugarcane_yield_gap,sunflower_yield_gap,wheat_yield_gap
5210,Guatemala,1966,,1.0,,5.3103,,,,,...,19.37,,3.745,,2.0668,,,23.5783,,3.392
4344,European Union (27) (FAO),1979,,,0.4469,45.755398,8.0131,,9.0365,3.268,...,,,,,,,,,,
8554,Mozambique,2021,,,,,,,,,...,12.549299,,4.0993,,2.34,0.3691,,27.800507,1.07,3.0362
7071,Liberia,1990,,,,,,,,,...,,,,,,,,,,
2060,Cape Verde,1962,,,,,,,,,...,,,,,,,,,,


In [12]:
# Country column on its own, with every apostrophe doubled so the names can be
# embedded in single-quoted SQL string literals
df_countries = df_yield_base["country"].str.replace("'", "''")
df_countries

0        Afghanistan
1        Afghanistan
2        Afghanistan
3        Afghanistan
4        Afghanistan
            ...     
14572       Zimbabwe
14573       Zimbabwe
14574       Zimbabwe
14575       Zimbabwe
14576       Zimbabwe
Name: country, Length: 14577, dtype: object

In [13]:
# Year column on its own, so it can be re-attached after the crop columns are filtered
df_years = df_yield_base.loc[:, "year"]
df_years

0        1961
1        1962
2        1963
3        1964
4        1965
         ... 
14572    2018
14573    2019
14574    2020
14575    2021
14576    2022
Name: year, Length: 14577, dtype: int64

In [14]:
# Columns whose name ends with "yield"
df_crops = df_yield_base.filter(regex="(yield)$")
# Of those, the raw / n.e.c. / attainable variants are not wanted
df_crops_nonyield = df_crops.filter(regex="(raw_yield)|(n_e_c)|(attainable_yield)")

# Drop the unwanted variants, then re-attach the year and country columns
df_crops = (
    df_crops
    .drop(columns=df_crops_nonyield.columns)
    .assign(year=df_years, country=df_countries)
)

df_crops

Unnamed: 0,almond_yield,apples_yield,apricots_yield,areca_nuts_yield,artichokes_yield,asparagus_yield,avocados_yield,bambara_beans__dry_yield,banana_yield,barley_yield,...,tung_nuts_yield,vegetables_yield,vetches_yield,walnuts_yield,watermelons_yield,wheat_yield,yams_yield,yautia_yield,year,country
0,,6.8018,6.6390,,,,,,,1.0800,...,,4.2402,,,4.8462,1.0220,,,1961,Afghanistan
1,,6.8018,6.6390,,,,,,,1.0800,...,,4.4585,,,4.8462,0.9735,,,1962,Afghanistan
2,,6.8018,6.6390,,,,,,,1.0800,...,,4.7249,,,5.1385,0.8317,,,1963,Afghanistan
3,,7.8298,7.6863,,,,,,,1.0857,...,,4.6526,,,5.4308,0.9510,,,1964,Afghanistan
4,,8.2258,8.0819,,,,,,,1.0857,...,,4.4856,,,5.5769,0.9723,,,1965,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14572,,8.9243,1.9777,,13.023999,4.0152,5.7944,0.2440,7.528800,5.4602,...,,6.6518,,,,2.7178,,,2018,Zimbabwe
14573,,8.9199,1.9832,,12.873700,4.1016,5.8191,0.2271,7.873500,5.4715,...,,6.4830,,,,3.9149,,,2019,Zimbabwe
14574,,8.9063,1.9938,,12.456100,4.0474,5.8321,0.2300,8.086699,5.4709,...,,6.5628,,,,4.7796,,,2020,Zimbabwe
14575,,8.8972,1.9892,,12.780200,4.0843,5.8240,0.2250,8.290000,5.4767,...,,6.6126,,,,5.0759,,,2021,Zimbabwe


In [15]:
def strip_crop_name(name: str) -> str:
    """Turn a yield column name into a bare crop name.

    Args:
        name: Column name, normally ending in "_yield" (e.g. "almond_yield").

    Returns:
        The name without the "_yield" suffix and without any trailing
        underscores. A name that does not carry the suffix only has its
        trailing underscores removed.
    """
    suffix = "_yield"
    # Cut the suffix only when it is really there; slicing blindly would eat
    # the last six characters of any other name.
    if name.endswith(suffix):
        name = name[: -len(suffix)]
    # Some columns leave a dangling "_" once the suffix is gone
    return name.rstrip("_")

In [16]:
# Switches: write the melted frame to disk / keep only rows that have a yield
saveCsv = False
notNull = True

# Wide -> long: one row per (country, year, crop) with its yield
df_crops_melted = pd.melt(
    df_crops,
    id_vars=["country", "year"],
    var_name="crop",
    value_name="yield",
)
# Column names become plain crop names
df_crops_melted["crop"] = df_crops_melted["crop"].map(strip_crop_name)

if notNull:
    # Discard combinations for which no yield was recorded
    df_crops_melted = df_crops_melted.dropna(subset=["yield"])

if saveCsv:
    df_crops_melted.to_csv("../crop_yields_melted.csv")

df_crops_melted

Unnamed: 0,country,year,crop,yield
15,Afghanistan,1976,almond,1.661000
16,Afghanistan,1977,almond,1.500000
17,Afghanistan,1978,almond,2.000000
18,Afghanistan,1979,almond,1.750000
19,Afghanistan,1980,almond,1.706900
...,...,...,...,...
2171751,World,2018,yautia,12.365100
2171752,World,2019,yautia,12.109799
2171753,World,2020,yautia,12.451699
2171754,World,2021,yautia,12.125700


# CSV file per table

# SQL Insert statements
Rows in the melted frame with NaN yields kept: 2 171 973
Rows in the melted frame with NaN yields dropped: 785 534

In [18]:
def _insert_statement(header: str, rows: list) -> list:
    """Build the lines of one multi-row INSERT statement.

    Args:
        header: The "INSERT INTO ... VALUES" text that opens the statement.
        rows: One parenthesised value tuple per row, without any separator.

    Returns:
        The header followed by the rows, comma-separated and closed with ";".
        An empty list when there are no rows, because an INSERT with an empty
        VALUES list is not valid SQL.
    """
    if not rows:
        return []
    return [header, *(f"{row}," for row in rows[:-1]), f"{rows[-1]};"]


sql_statements = []

# Unique countries (apostrophes were already doubled when df_countries was built)
unique_countries = df_crops_melted["country"].unique()
sql_statements += _insert_statement(
    "\nINSERT INTO country (name) VALUES",
    [f"('{name}')" for name in unique_countries],
)

# Unique years
unique_years = df_crops_melted["year"].unique()
sql_statements += _insert_statement(
    "\nINSERT INTO year (value) VALUES",
    [f"({value})" for value in unique_years],
)

# Unique crops
unique_crops = pd.Series(df_crops_melted["crop"].unique())
sql_statements += _insert_statement(
    "\nINSERT INTO crop (name) VALUES",
    [f"('{name}')" for name in unique_crops],
)

# Crop yields: the foreign keys are resolved with sub-selects on the lookup tables
yield_limit = 10  # number of rows to export; None exports every row
yield_rows = []
for _, values in df_crops_melted[:yield_limit].iterrows():
    country_id = f"(SELECT id FROM country WHERE name = '{values['country']}')"
    crop_id = f"(SELECT id FROM crop WHERE name = '{values['crop']}')"
    year_id = f"(SELECT id FROM year WHERE value = {values['year']})"
    value = "NULL" if pd.isnull(values["yield"]) else values["yield"]
    yield_rows.append(f"({country_id},\n{crop_id},\n{year_id},\n{value})")

# The last row is identified by its position among the exported rows, not by
# its index label: after the NaN filter the labels are no longer contiguous,
# and with a row limit the last exported row is not the last row of the frame.
# NOTE: the statement is closed with ";"; a RETURNING clause would have to
# replace that terminator rather than follow it.
sql_statements += _insert_statement(
    "\nINSERT INTO crop_yield (country_id, crop_id, year_id, value)\nVALUES",
    yield_rows,
)

INSERT INTO crop_yield (country_id, crop_id, year_id, value) 
VALUES
(
    (SELECT id FROM country WHERE name = 'Afghanistan'), 
    (SELECT id FROM crop WHERE name = 'almond'), 
    (SELECT id FROM year WHERE value = 1964), 
    20
),
(
    (SELECT id FROM country WHERE name = 'Afghanistan'), 
    (SELECT id FROM crop WHERE name = 'almond'), 
    (SELECT id FROM year WHERE value = 1965), 
    5
)
returning country_id, crop_id, year_id;

# Write inserts to .sql file

In [19]:
# Dump the generated statements to the SQL file, one list entry per line
with open('postgres/data.sql', 'w', encoding="utf-8") as sql_file:
    sql_file.writelines(line + "\n" for line in sql_statements)

In [20]:
# Show the melted frame once more (the rendering is truncated to its first and last rows)
df_crops_melted

Unnamed: 0,country,year,crop,yield
15,Afghanistan,1976,almond,1.661000
16,Afghanistan,1977,almond,1.500000
17,Afghanistan,1978,almond,2.000000
18,Afghanistan,1979,almond,1.750000
19,Afghanistan,1980,almond,1.706900
...,...,...,...,...
2171751,World,2018,yautia,12.365100
2171752,World,2019,yautia,12.109799
2171753,World,2020,yautia,12.451699
2171754,World,2021,yautia,12.125700
