In [14]:
import pandas as pd

In [15]:
# csv obtained from https://ourworldindata.org/crop-yields
df_yield_base = pd.read_csv("crop_yields.csv")

df_yield_base.sample(5)

Unnamed: 0,country,year,abaca__manila_hemp__raw_yield,agave_fibres__raw__n_e_c__yield,almond_yield,apples_yield,apricots_yield,areca_nuts_yield,artichokes_yield,asparagus_yield,...,potato_yield_gap,rapeseed_yield_gap,rice_yield_gap,rye_yield_gap,sorghum_yield_gap,soybean_yield_gap,sugarbeet_yield_gap,sugarcane_yield_gap,sunflower_yield_gap,wheat_yield_gap
8896,Netherlands,1987,,,,21.519,,,,,...,4.240902,0.2793,,2.0045,,,17.741299,,,1.5641
3458,Dominican Republic,2022,,,,,,,,,...,,,,,,,,,,
12543,Switzerland,1967,,,,173.6527,,,,,...,17.170002,0.8856,,2.2535,,,20.941803,,,4.0543
9540,Northern America (FAO),1980,,,1.8137,20.7284,11.4807,,11.4411,2.2263,...,,,,,,,,,,
2456,Chile,2017,,,4.3901,47.900898,7.9294,,7.629,5.5703,...,9.5338,0.0,2.9328,0.287,,,0.0,,0.6311,0.0


In [16]:
# Separate countries
df_countries = df_yield_base["country"]
df_countries

0        Afghanistan
1        Afghanistan
2        Afghanistan
3        Afghanistan
4        Afghanistan
            ...     
14572       Zimbabwe
14573       Zimbabwe
14574       Zimbabwe
14575       Zimbabwe
14576       Zimbabwe
Name: country, Length: 14577, dtype: object

In [17]:
# Separate years
df_years = df_yield_base["year"]
df_years

0        1961
1        1962
2        1963
3        1964
4        1965
         ... 
14572    2018
14573    2019
14574    2020
14575    2021
14576    2022
Name: year, Length: 14577, dtype: int64

In [18]:
# Keep only columns ending with _yield
df_crops = df_yield_base.filter(regex="(yield)$")
# Filter out raw_yield and n_e_c yield
df_crops_nonyield = df_crops.filter(regex="(raw_yield)|(n_e_c)|(attainable_yield)")
df_crops = df_crops.drop(columns = df_crops_nonyield.columns)

# Re-add year and country
df_crops["year"] = df_years
df_crops["country"] = df_countries

df_crops

Unnamed: 0,almond_yield,apples_yield,apricots_yield,areca_nuts_yield,artichokes_yield,asparagus_yield,avocados_yield,bambara_beans__dry_yield,banana_yield,barley_yield,...,tung_nuts_yield,vegetables_yield,vetches_yield,walnuts_yield,watermelons_yield,wheat_yield,yams_yield,yautia_yield,year,country
0,,6.8018,6.6390,,,,,,,1.0800,...,,4.2402,,,4.8462,1.0220,,,1961,Afghanistan
1,,6.8018,6.6390,,,,,,,1.0800,...,,4.4585,,,4.8462,0.9735,,,1962,Afghanistan
2,,6.8018,6.6390,,,,,,,1.0800,...,,4.7249,,,5.1385,0.8317,,,1963,Afghanistan
3,,7.8298,7.6863,,,,,,,1.0857,...,,4.6526,,,5.4308,0.9510,,,1964,Afghanistan
4,,8.2258,8.0819,,,,,,,1.0857,...,,4.4856,,,5.5769,0.9723,,,1965,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14572,,8.9243,1.9777,,13.023999,4.0152,5.7944,0.2440,7.528800,5.4602,...,,6.6518,,,,2.7178,,,2018,Zimbabwe
14573,,8.9199,1.9832,,12.873700,4.1016,5.8191,0.2271,7.873500,5.4715,...,,6.4830,,,,3.9149,,,2019,Zimbabwe
14574,,8.9063,1.9938,,12.456100,4.0474,5.8321,0.2300,8.086699,5.4709,...,,6.5628,,,,4.7796,,,2020,Zimbabwe
14575,,8.8972,1.9892,,12.780200,4.0843,5.8240,0.2250,8.290000,5.4767,...,,6.6126,,,,5.0759,,,2021,Zimbabwe


In [19]:
saveCsv = False
df_crops_melted = df_crops.melt(id_vars=["country", "year"], var_name="crop", value_name="yield")
# df_crops_melted["crop"].apply(lambda x: x[:-6])

if saveCsv:
    df_crops_melted.to_csv("../crop_yields_melted.csv")

df_crops_melted

Unnamed: 0,country,year,crop,yield
0,Afghanistan,1961,almond_yield,
1,Afghanistan,1962,almond_yield,
2,Afghanistan,1963,almond_yield,
3,Afghanistan,1964,almond_yield,
4,Afghanistan,1965,almond_yield,
...,...,...,...,...
2171968,Zimbabwe,2018,yautia_yield,
2171969,Zimbabwe,2019,yautia_yield,
2171970,Zimbabwe,2020,yautia_yield,
2171971,Zimbabwe,2021,yautia_yield,


# SQL Insert statements

In [49]:
sql_statements = []

# Unique countries
unique_countries = df_crops_melted["country"].unique()
sql_statements.append("INSERT INTO countries (id, name) VALUES")
for (idx, name) in enumerate(unique_countries):
    # stupid french with their apostrophes (alsof 's Hertogenbosch niet bestaat :) )
    if "'" in name:
        name = str.replace(name, "'", "''")
    # Don't add comma to end if at last entry
    if idx + 1 == len(unique_countries):
        sql_statements.append(f"({idx+1},  \"{name}\")")    
    else:
        sql_statements.append(f"({idx+1},  \"{name}\"),")

# Unique years
unique_years = df_crops_melted["year"].unique()
sql_statements.append("INSERT INTO year (id, value) VALUES")
for (idx, value) in enumerate(unique_years):
    # Don't add comma to end if at last entry
    if idx + 1 == len(unique_years):
        sql_statements.append(f"({idx+1}, {value})")
    else:
        sql_statements.append(f"({idx+1}, {value}),")

# Unique crops
unique_crops = pd.Series(df_crops_melted["crop"].unique())
# Remove _yield at end for each crop
unique_crops = [crop[:-6]for crop in unique_crops]
# Remove trailing _ for some crops
unique_crops = [crop[:-1] if crop[-1] == "_" else crop for crop in unique_crops]
sql_statements.append("INSERT INTO crop (id, name)")
for (idx, name) in enumerate(unique_crops):
    sql_statements.append(f"({idx+1}, \"{name}\"),")
    # Don't add comma to end if at last entry
    if idx + 1 == len(unique_crops):
        sql_statements.append(f"({idx+1},  \"{name}\")")
    else:
        sql_statements.append(f"({idx+1},  \"{name}\"),")

# Crop yields (harder)

sql_statements.append(";")
sql_statements

['INSERT INTO countries (id, name) VALUES',
 '(1,  "Afghanistan"),',
 '(2,  "Africa"),',
 '(3,  "Africa (FAO)"),',
 '(4,  "Albania"),',
 '(5,  "Algeria"),',
 '(6,  "Americas (FAO)"),',
 '(7,  "Angola"),',
 '(8,  "Antigua and Barbuda"),',
 '(9,  "Argentina"),',
 '(10,  "Armenia"),',
 '(11,  "Asia"),',
 '(12,  "Asia (FAO)"),',
 '(13,  "Australia"),',
 '(14,  "Austria"),',
 '(15,  "Azerbaijan"),',
 '(16,  "Bahamas"),',
 '(17,  "Bahrain"),',
 '(18,  "Bangladesh"),',
 '(19,  "Barbados"),',
 '(20,  "Belarus"),',
 '(21,  "Belgium"),',
 '(22,  "Belgium-Luxembourg (FAO)"),',
 '(23,  "Belize"),',
 '(24,  "Benin"),',
 '(25,  "Bhutan"),',
 '(26,  "Bolivia"),',
 '(27,  "Bosnia and Herzegovina"),',
 '(28,  "Botswana"),',
 '(29,  "Brazil"),',
 '(30,  "Brunei"),',
 '(31,  "Bulgaria"),',
 '(32,  "Burkina Faso"),',
 '(33,  "Burundi"),',
 '(34,  "Cambodia"),',
 '(35,  "Cameroon"),',
 '(36,  "Canada"),',
 '(37,  "Cape Verde"),',
 '(38,  "Caribbean (FAO)"),',
 '(39,  "Central African Republic"),',
 '(40,  

# Write inserts to .sql file

In [51]:
with open('postgres/data.sql', 'w', encoding="utf-8") as f:
    for line in sql_statements:
        f.write(line + "\n")