In [1]:
import sys
sys.path.append("/Users/alex/Downloads/stats/")

import pandas as pd
import os
from db import connection
from db_utils import DBUtils
import xlrd
from tqdm import tqdm
import datetime
import json
from glob import glob
import zipfile

In [2]:
# Provenance metadata for FAOSTAT: the data landing page and the day this notebook was run.
source_description = dict(
    link="http://www.fao.org/faostat/en/?#data/",
    # Day, full month name, two-digit year.
    retrievedDate=datetime.datetime.now().strftime("%d-%B-%y"),
)

# Header layouts seen in the FAOSTAT bulk CSV files, one tuple of column names per layout.
# Column names are kept exactly as spelled here (including "Breadown").
column_types = [
    # 11 columns
    ("Area Code", "Area", "Item Code", "Item", "ISO Currency Code", "Currency", "Year Code", "Year", "Unit", "Value", "Flag"),
    ("CountryCode", "Country", "ItemCode", "Item", "ElementGroup", "ElementCode", "Element", "Year", "Unit", "Value", "Flag"),
    ("Area Code", "Area", "Item Code", "Item", "Element Code", "Element", "Year Code", "Year", "Unit", "Value", "Flag"),
    ("Country Code", "Country", "Item Code", "Item", "Element Code", "Element", "Year Code", "Year", "Unit", "Value", "Flag"),
    ("Country Code", "Country", "Source Code", "Source", "Indicator Code", "Indicator", "Year Code", "Year", "Unit", "Value", "Flag"),
    ("Recipient Country Code", "Recipient Country", "Item Code", "Item", "Donor Country Code", "Donor Country", "Year Code", "Year", "Unit", "Value", "Flag"),
    # 13 columns
    ("Reporter Country Code", "Reporter Countries", "Partner Country Code", "Partner Countries", "Item Code", "Item", "Element Code", "Element", "Year Code", "Year", "Unit", "Value", "Flag"),
    # 15 columns
    ("Donor Code", "Donor", "Recipient Country Code", "Recipient Country", "Item Code", "Item", "Element Code", "Element", "Purpose Code", "Purpose", "Year Code", "Year", "Unit", "Value", "Flag"),
    # for Indicators_from_Household_Surveys_E_All_Data_(Normalized)
    ('Survey Code', 'Survey', 'Breakdown Variable Code', 'Breakdown Variable', 'Breadown by Sex of the Household Head Code', 'Breadown by Sex of the Household Head', 'Indicator Code', 'Indicator', 'Measure Code', 'Measure', 'Unit', 'Value', 'Flag'),
]

# Maps a FAOSTAT category name to the list of bulk-download file names that belong to it.
# The category key becomes the prefix of the dataset name ("<category>: <dataset> - FAO (2019)").
#
# NOTE(review): many entries below end in "(Norm).zip" while `file_dataset_names` keys the
# same file stems with "(Normalized).zip" (e.g. the Emissions_*, Trade_Indices, Population,
# Investment_Machinery* entries). A file name can only match one spelling, so such files
# cannot be found in both tables -- confirm the real archive names and align the two.
category_files = {
    "Production": [
        "Production_Crops_E_All_Data_(Normalized).zip",
        "Production_CropsProcessed_E_All_Data_(Normalized).zip",
        "Production_Livestock_E_All_Data_(Normalized).zip",
        "Production_LivestockPrimary_E_All_Data_(Normalized).zip",
        "Production_LivestockProcessed_E_All_Data_(Normalized).zip",
        "Production_Indices_E_All_Data_(Normalized).zip",
        "Value_of_Production_E_All_Data_(Normalized).zip"
    ],
    "Trade": [
        "Trade_Crops_Livestock_E_All_Data_(Normalized).zip",
        "Trade_LiveAnimals_E_All_Data_(Normalized).zip",
        # The detailed trade matrix is listed both as an archive and as an extracted CSV.
        "Trade_DetailedTradeMatrix_E_All_Data_(Norm).zip", 
        "Trade_DetailedTradeMatrix_E_All_Data_(Norm).csv",
        "Trade_Indices_E_All_Data_(Norm).zip"
    ],
    "Food Balance": [
        # The food balance sheets are listed both as an archive and as an extracted CSV.
        "FoodBalanceSheets_E_All_Data_(Normalized).zip",  
        "FoodBalanceSheets_E_All_Data_(Normalized).csv",
        "CommodityBalances_Crops_E_All_Data_(Normalized).zip",
        "CommodityBalances_LivestockFish_E_All_Data_(Normalized).zip",
        "FoodSupply_Crops_E_All_Data_(Normalized).zip",
        "FoodSupply_LivestockFish_E_All_Data_(Normalized).zip"
    ],
    "Food Security": [
        "Indicators_from_Household_Surveys_E_All_Data_(Normalized).zip",
        "Food_Security_Data_E_All_Data_(Normalized).zip"
    ],
    "Prices": [
        "Prices_E_All_Data_(Normalized).zip",
        "Prices_Monthly_E_All_Data_(Normalized).zip",
        "Price_Indices_E_All_Data_(Normalized).zip",
        "PricesArchive_E_All_Data_(Norm).zip",
        "ConsumerPriceIndices_E_All_Data_(Normalized).zip",
        "Deflators_E_All_Data_(Normalized).zip",
        "Exchange_rate_E_All_Data_(Normalized).zip"
    ],
    "Inputs": [
        "Inputs_Fertilizers_E_All_Data_(Normalized).zip",
        "Inputs_FertilizersArchive_E_All_Data_(Norm).zip",
        "Inputs_FertilizersTradeValues_E_All_Data_(Norm).zip",
        "Inputs_Pesticides_Use_E_All_Data_(Normalized).zip",
        "Inputs_Pesticides_Trade_E_All_Data_(Norm).zip",
        "Inputs_Land_E_All_Data_(Normalized).zip",
        "Employment_Indicators_E_All_Data_(Norm).zip"
    ],
    "Population": [
        "Population_E_All_Data_(Norm).zip"
    ],
    "Investment": [
        "Investment_Machinery_E_All_Data_(Norm).zip",
        "Investment_MachineryArchive_E_All_Data_(Norm).zip",
        "Investment_GovernmentExpenditure_E_All_Data_(Normalized).zip",
        "Investment_CreditAgriculture_E_All_Data_(Normalized).zip",
        "Development_Assistance_to_Agriculture_E_All_Data_(Normalized).zip",
        "Investment_ForeignDirectInvestment_E_All_Data_(Norm).zip",
        # NOTE(review): double underscore before "E_All_Data"; `file_dataset_names` has a
        # single underscore for this file -- confirm which spelling is on disk.
        "Investment_CountryInvestmentStatisticsProfile__E_All_Data_(Normalized).zip"
    ],
    "Macro-Statistics": [
        "Investment_CapitalStock_E_All_Data_(Normalized).zip",
        "Macro-Statistics_Key_Indicators_E_All_Data_(Normalized).zip"
    ],
    "Agri-Environmental Indicators": [
        "Environment_AirClimateChange_E_All_Data_(Norm).zip",
        "Environment_Energy_E_All_Data_(Norm).zip",
        "Environment_Fertilizers_E_All_Data_(Normalized).zip",
        "Environment_LandUse_E_All_Data_(Normalized).zip",
        "Environment_LandCover_E_All_Data_(Normalized).zip",
        "Environment_LivestockPatterns_E_All_Data_(Normalized).zip",
        "Environment_Pesticides_E_All_Data_(Normalized).zip",
        "Environment_Soil_E_All_Data_(Norm).zip",
        "Environment_Water_E_All_Data_(Norm).zip",
        "Environment_Emissions_by_Sector_E_All_Data_(Normalized).zip",
        "Environment_Emissions_intensities_E_All_Data_(Normalized).zip",
        "Environment_Livestock_E_All_Data_(Norm).zip"
    ],
    "Emissions - Agriculture": [
        "Emissions_Agriculture_Agriculture_total_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Enteric_Fermentation_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Manure_Management_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Rice_Cultivation_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Synthetic_Fertilizers_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Manure_applied_to_soils_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Manure_left_on_pasture_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Crop_Residues_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Cultivated_Organic_Soils_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Burning_Savanna_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Burning_crop_residues_E_All_Data_(Norm).zip",
        "Emissions_Agriculture_Energy_E_All_Data_(Norm).zip"
    ],
    "Emissions - Land Use": [
        "Emissions_Land_Use_Land_Use_Total_E_All_Data_(Norm).zip",
        "Emissions_Land_Use_Forest_Land_E_All_Data_(Norm).zip",
        "Emissions_Land_Use_Cropland_E_All_Data_(Norm).zip",
        "Emissions_Land_Use_Grassland_E_All_Data_(Norm).zip",
        "Emissions_Land_Use_Burning_Biomass_E_All_Data_(Norm).zip"
    ],
    "Forestry": [
        "Forestry_E_All_Data_(Normalized).zip",
        "Forestry_Trade_Flows_E_All_Data_(Normalized).zip"
    ],
    "ASTI R&D Indicators": [
        "ASTI_Research_Spending_E_All_Data_(Norm).zip",
        "ASTI_Researchers_E_All_Data_(Normalized).zip"
    ],
    "Emergency Response": [
        "Food_Aid_Shipments_WFP_E_All_Data_(Normalized).zip"
    ]
    }


# Maps a FAOSTAT bulk-download file name to the human-readable dataset name.
# The value is combined with the category from `category_files` to build the full
# dataset name, so identical values here (e.g. "Fertilizers", "Land Use") are told apart
# only by their category prefix.
#
# NOTE(review): most keys use the "(Normalized)" suffix, whereas `category_files` spells
# many of the same files with "(Norm)" -- confirm the real archive names and align both.
file_dataset_names = {
    "ASTI_Research_Spending_E_All_Data_(Norm).zip": "ASTI-Expenditures",
    "ASTI_Researchers_E_All_Data_(Normalized).zip": "ASTI-Researchers",
    "CommodityBalances_Crops_E_All_Data_(Normalized).zip": "Commodity Balances - Crops Primary Equivalent",
    "CommodityBalances_LivestockFish_E_All_Data_(Normalized).zip": "Commodity Balances - Livestock and Fish Primary Equivalent",
    "ConsumerPriceIndices_E_All_Data_(Normalized).zip": "Consumer Price Indices",
    "Deflators_E_All_Data_(Normalized).zip": "Deflators",
    "Development_Assistance_to_Agriculture_E_All_Data_(Normalized).zip": "Development Flows to Agriculture",
    "Emissions_Agriculture_Agriculture_total_E_All_Data_(Normalized).zip": "Agriculture Total",
    "Emissions_Agriculture_Burning_crop_residues_E_All_Data_(Normalized).zip": "Burning - Crop Residues",
    "Emissions_Agriculture_Burning_Savanna_E_All_Data_(Normalized).zip": "Burning - Savanna",
    "Emissions_Agriculture_Crop_Residues_E_All_Data_(Normalized).zip": "Crop Residues",
    "Emissions_Agriculture_Cultivated_Organic_Soils_E_All_Data_(Normalized).zip": "Cultivation of Organic Soils",
    "Emissions_Agriculture_Energy_E_All_Data_(Normalized).zip": "Energy Use",
    "Emissions_Agriculture_Enteric_Fermentation_E_All_Data_(Normalized).zip": "Enteric Fermentation",
    "Emissions_Agriculture_Manure_applied_to_soils_E_All_Data_(Normalized).zip": "Manure applied to Soils",
    "Emissions_Agriculture_Manure_left_on_pasture_E_All_Data_(Normalized).zip": "Manure left on Pasture",
    "Emissions_Agriculture_Manure_Management_E_All_Data_(Normalized).zip": "Manure Management",
    "Emissions_Agriculture_Rice_Cultivation_E_All_Data_(Normalized).zip": "Rice Cultivation",
    "Emissions_Agriculture_Synthetic_Fertilizers_E_All_Data_(Normalized).zip": "Synthetic Fertilizers",
    "Emissions_Land_Use_Burning_Biomass_E_All_Data_(Normalized).zip": "Burning - Biomass",
    "Emissions_Land_Use_Cropland_E_All_Data_(Normalized).zip": "Cropland",
    "Emissions_Land_Use_Forest_Land_E_All_Data_(Normalized).zip": "Forest Land",
    "Emissions_Land_Use_Grassland_E_All_Data_(Normalized).zip": "Grassland",
    "Emissions_Land_Use_Land_Use_Total_E_All_Data_(Normalized).zip": "Land Use Total",
    "Employment_Indicators_E_All_Data_(Normalized).zip": "Employment Indicators",
    "Environment_AirClimateChange_E_All_Data_(Norm).zip": "Air and climate change",
    "Environment_Emissions_by_Sector_E_All_Data_(Normalized).zip": "Emissions by sector",
    "Environment_Emissions_intensities_E_All_Data_(Normalized).zip": "Emissions intensities",
    "Environment_Energy_E_All_Data_(Norm).zip": "Energy",
    "Environment_Fertilizers_E_All_Data_(Normalized).zip": "Fertilizers",
    "Environment_LandCover_E_All_Data_(Normalized).zip": "Land Cover",
    "Environment_LandUse_E_All_Data_(Normalized).zip": "Land Use",
    "Environment_Livestock_E_All_Data_(Norm).zip": "Livestock",
    "Environment_LivestockPatterns_E_All_Data_(Normalized).zip": "Livestock Patterns",
    "Environment_Pesticides_E_All_Data_(Normalized).zip": "Pesticides",
    "Environment_Soil_E_All_Data_(Norm).zip": "Soil",
    "Environment_Water_E_All_Data_(Norm).zip": "Water",
    "Exchange_rate_E_All_Data_(Normalized).zip": "Exchange rates - Annual",
    "Food_Aid_Shipments_WFP_E_All_Data_(Normalized).zip": "Food Aid Shipments (WFP)",
    "Food_Security_Data_E_All_Data_(Normalized).zip": "Suite of Food Security Indicators",
    "FoodBalanceSheets_E_All_Data_(Normalized).zip": "Food Balance Sheets",
    "FoodSupply_Crops_E_All_Data_(Normalized).zip": "Food Supply - Crops Primary Equivalent",
    "FoodSupply_LivestockFish_E_All_Data_(Normalized).zip": "Food Supply - Livestock and Fish Primary Equivalent",
    "Forestry_E_All_Data_(Normalized).zip": "Forestry Production and Trade",
    "Forestry_Trade_Flows_E_All_Data_(Normalized).zip": "Forestry Trade Flows",
    "Indicators_from_Household_Surveys_E_All_Data_(Normalized).zip": "Indicators from Household Surveys (gender, area, socioeconomics)",
    "Inputs_Fertilizers_E_All_Data_(Normalized).zip": "Fertilizers",
    "Inputs_FertilizersArchive_E_All_Data_(Normalized).zip": "Fertilizers archive",
    "Inputs_FertilizersTradeValues_E_All_Data_(Norm).zip": "Fertilizers - Trade Value",
    "Inputs_Land_E_All_Data_(Normalized).zip": "Land Use",
    "Inputs_Pesticides_Trade_E_All_Data_(Normalized).zip": "Pesticides Trade",
    "Inputs_Pesticides_Use_E_All_Data_(Normalized).zip": "Pesticides Use",
    "Investment_CapitalStock_E_All_Data_(Normalized).zip": "Capital Stock",
    # NOTE(review): single underscore before "E_All_Data"; `category_files` lists this
    # file with a double underscore -- confirm which spelling is on disk.
    "Investment_CountryInvestmentStatisticsProfile_E_All_Data_(Normalized).zip": "Country Investment Statistics Profile",
    "Investment_CreditAgriculture_E_All_Data_(Normalized).zip": "Credit to Agriculture",
    "Investment_ForeignDirectInvestment_E_All_Data_(Normalized).zip": "Foreign Direct Investment (FDI)",
    "Investment_GovernmentExpenditure_E_All_Data_(Normalized).zip": "Government Expenditure",
    "Investment_Machinery_E_All_Data_(Normalized).zip": "Machinery",
    "Investment_MachineryArchive_E_All_Data_(Normalized).zip": "Machinery Archive",
    "Macro-Statistics_Key_Indicators_E_All_Data_(Normalized).zip": "Macro Indicators",
    "Population_E_All_Data_(Normalized).zip": "Annual population",
    "Price_Indices_E_All_Data_(Normalized).zip": "Producer Price Indices - Annual",
    "Prices_E_All_Data_(Normalized).zip": "Producer Prices - Annual",
    "Prices_Monthly_E_All_Data_(Normalized).zip": "Producer Prices - Monthly",
    "PricesArchive_E_All_Data_(Normalized).zip": "Producer Prices - Archive",
    "Production_Crops_E_All_Data_(Normalized).zip": "Crops",
    "Production_CropsProcessed_E_All_Data_(Normalized).zip": "Crops processed",
    "Production_Indices_E_All_Data_(Normalized).zip": "Production Indices",
    "Production_Livestock_E_All_Data_(Normalized).zip": "Live Animals",
    "Production_LivestockPrimary_E_All_Data_(Normalized).zip": "Livestock Primary",
    "Production_LivestockProcessed_E_All_Data_(Normalized).zip": "Livestock Processed",
    "Trade_Crops_Livestock_E_All_Data_(Normalized).zip": "Crops and livestock products",
    "Trade_DetailedTradeMatrix_E_All_Data_(Normalized).zip": "Detailed trade matrix",
    "Trade_Indices_E_All_Data_(Normalized).zip": "Trade Indices",
    "Trade_LiveAnimals_E_All_Data_(Normalized).zip": "Live animals",
    "Value_of_Production_E_All_Data_(Normalized).zip": "Value of Agricultural Production",
    # Extracted CSV variants of two of the archives above.
    "FoodBalanceSheets_E_All_Data_(Normalized).csv": "Food Balance Sheets",
    "Trade_DetailedTradeMatrix_E_All_Data_(Norm).csv": "Detailed trade matrix"
}

# Archive file names that the scan for unknown files skips outright.
files_to_exclude = [
    "CommodityBalances_Crops_E_All_Data_(Normalized).zip",
    "CommodityBalances_LivestockFish_E_All_Data_(Normalized).zip",
    "FoodSupply_Crops_E_All_Data_(Normalized).zip",
    "FoodSupply_LivestockFish_E_All_Data_(Normalized).zip",
    "Indicators_from_Household_Surveys_E_All_Data_(Normalized).zip",
    "Population_E_All_Data_(Norm).zip",
    "Prices_Monthly_E_All_Data_(Normalized).zip",
    "PricesArchive_E_All_Data_(Norm).zip",
    "ConsumerPriceIndices_E_All_Data_(Normalized).zip",
]

In [3]:
# List the archives in the download folder that are neither excluded nor already
# mapped to a dataset name, i.e. files this notebook does not know about yet.
new_files = []

for archive_path in tqdm(glob("/Volumes/Новый/FAOSTAT/*.zip")):
    archive_name = os.path.basename(archive_path)
    if archive_name in files_to_exclude or archive_name in file_dataset_names:
        continue  # already accounted for
    new_files.append(archive_name)

100%|██████████| 73/73 [00:00<00:00, 224639.91it/s]


In [4]:
new_files

['ASTI_Expenditures_E_All_Data_(Normalized).zip',
 'Environment_LivestockManure_E_All_Data_(Normalized).zip',
 'Environment_Temperature_change_E_All_Data_(Normalized).zip',
 'Inputs_FertilizersNutrient_E_All_Data_(Normalized).zip',
 'Inputs_FertilizersProduct_E_All_Data_(Normalized).zip',
 'Inputs_LandUse_E_All_Data_(Normalized).zip']

In [5]:
# Archives to import in this run; only files named here become datasets.
priority = [
    "Food_Security_Data_E_All_Data_(Normalized).zip",
    "FoodBalanceSheets_E_All_Data_(Normalized).zip",
    "FoodSupply_Crops_E_All_Data_(Normalized).zip",
    "FoodSupply_LivestockFish_E_All_Data_(Normalized).zip",
    "Indicators_from_Household_Surveys_E_All_Data_(Normalized).zip",
    "Production_Crops_E_All_Data_(Normalized).zip",
    "Production_CropsProcessed_E_All_Data_(Normalized).zip",
    "Production_Indices_E_All_Data_(Normalized).zip",
    "Production_Livestock_E_All_Data_(Normalized).zip",
    "Production_LivestockPrimary_E_All_Data_(Normalized).zip",
    "Production_LivestockProcessed_E_All_Data_(Normalized).zip",
]

## Datasets

In [6]:
# Build the dataset table (name + id) for the priority archives that are present on disk
# and save it to datasets.csv.
available_archives = {
    os.path.basename(archive_path)
    for archive_path in glob("/Volumes/Новый/FAOSTAT/*.zip")
}

names = []

# Walk `priority` rather than the glob result: glob returns matches in a
# filesystem-dependent order, and the dataset ids below are positional, so iterating
# the glob output would make the ids differ between machines or runs.
for archive_name in priority:
    if archive_name not in available_archives:
        continue
    # An archive listed under several categories yields one dataset per category.
    for category, category_archives in category_files.items():
        if archive_name in category_archives:
            names.append(category + ": " + file_dataset_names[archive_name] + " - FAO (2019)")

datasets = pd.DataFrame()
datasets['name'] = names
datasets['id'] = list(range(len(names)))  # id = position in the list above
datasets.to_csv("datasets.csv", index=False)

In [7]:
datasets

Unnamed: 0,name,id
0,Food Security: Suite of Food Security Indicato...,0
1,Food Balance: Food Balance Sheets - FAO (2019),1
2,Food Balance: Food Supply - Crops Primary Equi...,2
3,Food Balance: Food Supply - Livestock and Fish...,3
4,Food Security: Indicators from Household Surve...,4
5,Production: Crops - FAO (2019),5
6,Production: Crops processed - FAO (2019),6
7,Production: Production Indices - FAO (2019),7
8,Production: Live Animals - FAO (2019),8
9,Production: Livestock Primary - FAO (2019),9


## Sources

In [8]:
# Text used to populate dataPublisherSource in the database (sources table), keyed by
# dataset name. It was collected by hand, which takes time.
# TODO: decide whether this information is actually needed.
# Several datasets carry the same wording, so each distinct text is written once.

# Wording used for the crop production datasets.
_CROPS_PUBLISHER_SOURCE = "The main data source is official statistics from FAO member countries, collected either through annual production questionnaires (APQ) distributed to countries, from national publications (Yearbooks and Pocketbooks) or from official country websites. The source data can originate from surveys, administrative data and estimates based on expert observations. Which type of source is used by countries can significantly affect reliability and comparability of data. In nearly all cases where countries provide their official production data, data are recorded as reported by countries. The only exceptions are the application of conversion factors (e.g., if a country reports rice production on a husked rice basis, that production must be converted to a paddy basis), or cases of reporting errors (typically digit errors). In instances where no official data is available, data from semi-official sources (including commodity-specific trade publications) may be used. If no data from either official or unofficial sources is available, data are imputed. In all cases, data are flagged accordingly."

# Wording used for the Livestock Processed and Live Animals datasets.
_LIVESTOCK_PUBLISHER_SOURCE = "The main data source is official statistics from FAO member countries, collected either through annual production questionnaires (APQ) distributed to countries, from national publications (Yearbooks and Pocketbooks) or from official country websites. The source data can originate from surveys, administrative data and estimates based on expert observations. Which type of source is used by countries can significantly affect reliability and comparability of data. In nearly all cases where countries provide their official production data, data are recorded as reported by countries. In instances where no official data is available, data from semi-official sources (including commodity-specific trade publications) may be used. If no data from either official or unofficial sources is available, data are imputed. In all cases, data are flagged accordingly."

# Wording used for the three Food Balance datasets.
_FOOD_BALANCE_PUBLISHER_SOURCE = "The main source is official statistics from FAO member countries. Exceptionally, unofficial data are also used as well as estimated/imputed data. In both cases this is 'flagged'. Data are recorded as countries report them, except for eliminating obvious errors. The source data can originate from surveys, administrative data and estimates based on expert observations. Which type of source is used by countries affect significantly reliability and comparability of data."

names_publisher_source = {
    "Production: Livestock Primary - FAO (2019)": "",
    "Production: Crops processed - FAO (2019)": _CROPS_PUBLISHER_SOURCE,
    "Production: Livestock Processed - FAO (2019)": _LIVESTOCK_PUBLISHER_SOURCE,
    "Food Balance: Food Supply - Livestock and Fish Primary Equivalent - FAO (2019)": _FOOD_BALANCE_PUBLISHER_SOURCE,
    "Production: Crops - FAO (2019)": _CROPS_PUBLISHER_SOURCE,
    "Food Security: Indicators from Household Surveys (gender, area, socioeconomics) - FAO (2019)": "",  # no data
    "Production: Production Indices - FAO (2019)": "Indices are calculated on the basis of the production data provided by FAO member countries or estimated/imputed by FAO.",
    "Production: Live Animals - FAO (2019)": _LIVESTOCK_PUBLISHER_SOURCE,
    "Food Balance: Food Balance Sheets - FAO (2019)": _FOOD_BALANCE_PUBLISHER_SOURCE,
    "Food Balance: Food Supply - Crops Primary Equivalent - FAO (2019)": _FOOD_BALANCE_PUBLISHER_SOURCE,
    "Food Security: Suite of Food Security Indicators - FAO (2019)": "See metadata for the basic variables that make up the indicator.",
}
# Text used to populate additionalInfo for each dataset, keyed by dataset name.

# The three Food Balance datasets share one description.
_FOOD_BALANCE_ADD_INFO = "Food Balance Sheets (FBS) are compiled every year by FAO, mainly with country-level data on the production and trade of food commodities. Using these data and the available information on seed rates, waste coefficients, stock changes and types of utilization (feed, food, processing and other utilization), a supply/utilization account is prepared for each commodity in weight terms. The food component of the commodity account, which is usually derived as a balancing item, refers to the total amount of the commodity available for human consumption during the year. Besides commodity-by-commodity information, the FAO FBS also provide total food availability estimates by aggregating the food component of all commodities including fishery products. From these values and the available population estimates, the per person dietary energy and protein and fat supplies are derived and expressed on a daily basis. In the FBS production data refer only to primary products while data for all other elements also include processed products derived there from, expressed in primary commodity equivalent."

names_add_info_source = {
    "Production: Livestock Primary - FAO (2019)": "The dataset includes the following variables: Producing Animals/Slaughtered, Yield and Production Quantity. The unit of measure for these variables are shown in item 4 below. Slaughtering is measured through activity of slaughterhouses i.e. production of marketable meat for human consumption. Estimates of ‘other slaughtering’ can be added for a more accurate picture of meat production. Livestock is accounted for by categories that capture their rearing, either for fattening then slaughter, or for herd renewal, i.e. for breeding and/or milking. Aggregates are the sum of available data. Aggregates include estimated data. For some item aggregates, conversion factors are applied to values when calculating totals.",
    "Production: Crops processed - FAO (2019)": "Production quantities in tonnes of processed crops",
    # NOTE(review): this text says "processed crops" for the Livestock Processed dataset;
    # it may have been copied from the entry above -- confirm against the FAOSTAT metadata.
    "Production: Livestock Processed - FAO (2019)": "Production quantities in tons of processed crops.",
    "Food Balance: Food Supply - Livestock and Fish Primary Equivalent - FAO (2019)": _FOOD_BALANCE_ADD_INFO,
    "Production: Crops - FAO (2019)": "Areas refer to the area under cultivation. Area under cultivation means the area that corresponds to the total sown area, but after the harvest it excludes ruined areas (e.g. due to natural disasters). If the same land parcel is used twice in the same year, the area of this parcel can be counted twice. For tree crops, some countries provide data in terms of number of trees instead of in area. This number is then converted to an area estimate using typical planting density conversions. Production means the harvested production. Harvested production means production including on-holding losses and wastage, quantities consumed directly on the farm and marketed quantities, indicated in units of basic product weight. Harvest year means the calendar year in which the harvest begins. Yield means the harvested production per ha for the area under cultivation. Seed quantity comprises all amounts of the commodity in question used during the reference period for reproductive purposes, such as seed or seedlings. Whenever official data are not available, seed figures can be estimated either as a percentage of production or by multiplying a seed rate (the average amount of seed needed per hectare planted) with the planted area of the particular crop of the subsequent year. Usually, the average seed rate in any given country does not vary greatly from year to year.",
    "Food Security: Indicators from Household Surveys (gender, area, socioeconomics) - FAO (2019)": "",  # no data
    # This description has three paragraphs. A double-quoted literal cannot contain a raw
    # line break, so each paragraph is its own literal (joined by implicit concatenation)
    # and the paragraph breaks are spelled as explicit "\n".
    "Production: Production Indices - FAO (2019)": (
        "The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 2004-2006. They are based on the sum of price-weighted quantities of different agricultural commodities produced after deductions of quantities used as seed and feed weighted in a similar manner. The resulting aggregate represents, therefore, disposable production for any use except as seed and feed. All the indices at the country, regional and world levels are calculated by the Laspeyres formula. Production quantities of each commodity are weighted by 2004-2006 average international commodity prices and summed for each year. To obtain the index, the aggregate for a given year is divided by the average aggregate for the base period 2004-2006. Since the FAO indices are based on the concept of agriculture as a single enterprise, amounts of seed and feed are subtracted from the production data to avoid double counting them, once in the production data and once with the crops or livestock produced from them. Deductions for seed (in the case of eggs, for hatching) and for livestock and poultry feed apply to both domestically produced and imported commodities. They cover only primary agricultural products destined to animal feed (e.g. maize, potatoes, milk, etc.). Processed and semi-processed feed items such as bran, oilcakes, meals and molasses have been completely excluded from the calculations at all stages. It should be noted that when calculating indices of agricultural, food and non-food production, all intermediate primary inputs of agricultural origin are deducted. \n"
        "However, for indices of any other commodity group, only inputs originating from within the same group are deducted; thus, only seed is removed from the group “crops” and from all crop subgroups, such as cereals, oil crops, etc.; and both feed and seed originating from within the livestock sector (e.g. milk feed, hatching eggs) are removed from the group “livestock products”. For the main two livestock subgroups, namely, meat and milk, only feed originating from the respective subgroup is removed. The 'international commodity prices' are used in order to avoid the use of exchange rates for obtaining continental and world aggregates, and also to improve and facilitate international comparative analysis of productivity at the national level. These 'international prices', expressed in so-called 'international dollars', are derived using a Geary-Khamis formula for the agricultural sector. This method assigns a single “price” to each commodity. For example, one metric ton of wheat has the same price regardless of the country where it was produced. The currency unit in which the prices are expressed has no influence on the indices published. The commodities covered in the computation of indices of agricultural production are all crops and livestock products originating in each country. Practically all products are covered, with the main exception of fodder crops. The category of food production includes commodities that are considered edible and that contain nutrients. Accordingly, coffee and tea are excluded along with inedible commodities because, although edible, they have practically no nutritive value. Indices for meat production are computed based on data for production from indigenous animals, which takes account of the meat equivalent of exported live animals but excludes the meat equivalent of imported live animals. For index purposes, annual changes in livestock and poultry numbers or in their average live weight are not taken into account. \n"
        "The indices are calculated from production data presented on a calendar year basis. The FAO indices may differ from those produced by the countries themselves because of differences in concepts of production, coverage, weights, time reference of data and methods of calculation."
    ),
    "Production: Live Animals - FAO (2019)": "The data on livestock numbers are intended to cover all domestic animals irrespective of their age and the place or purpose of their breeding. Estimates have been made for non-reporting countries as well as for countries reporting incomplete data. However, in certain countries, data for chickens, ducks and turkeys do not yet seem to represent the total number of these birds. Certain other countries give a single figure for all poultry; data for these countries are shown under “Chickens”.",
    "Food Balance: Food Balance Sheets - FAO (2019)": _FOOD_BALANCE_ADD_INFO,
    "Food Balance: Food Supply - Crops Primary Equivalent - FAO (2019)": _FOOD_BALANCE_ADD_INFO,
    "Food Security: Suite of Food Security Indicators - FAO (2019)": "See attached document which lists statistical concepts and definitions with the respective indicator.",
}

In [9]:
# Build one "source" record per dataset (display name, JSON-encoded description,
# owning dataset id) and save the result to sources.csv.
names, desc, d_ids = [], [], []

# Computed once so every source written by this run carries the same date.
retrieved_date = datetime.datetime.now().strftime("%d-%b-%Y")

for i, row in datasets.iterrows():
    description = {}
    description['dataPublishedBy'] = "Food and Agriculture Organization of the United Nations (FAO) (2019) "
    description['dataPublisherSource'] = names_publisher_source[row['name']]
    description['link'] = "http://www.fao.org/faostat/en/?#data/"
    description['retrievedDate'] = retrieved_date
    description['additionalInfo'] = names_add_info_source[row['name']]
    dataset_id = row['id']

    # Dataset names have the form "<category>: <source name>". Keep the text
    # after the colon and drop the blank that follows it, otherwise every
    # source name is stored with a leading space.
    source_name = row['name'].split(":")[-1].strip()

    names.append(source_name)
    desc.append(json.dumps(description))
    d_ids.append(dataset_id)

sources = pd.DataFrame()
sources['name'] = names
sources['description'] = desc
sources['dataset_id'] = d_ids
sources.to_csv("sources.csv", index=False)

In [10]:
# Display the `sources` DataFrame as a quick visual check.
sources

Unnamed: 0,name,description,dataset_id
0,Suite of Food Security Indicators - FAO (2019),"{""dataPublishedBy"": ""Food and Agriculture Orga...",0
1,Food Balance Sheets - FAO (2019),"{""dataPublishedBy"": ""Food and Agriculture Orga...",1
2,Food Supply - Crops Primary Equivalent - FAO ...,"{""dataPublishedBy"": ""Food and Agriculture Orga...",2
3,Food Supply - Livestock and Fish Primary Equi...,"{""dataPublishedBy"": ""Food and Agriculture Orga...",3
4,"Indicators from Household Surveys (gender, ar...","{""dataPublishedBy"": ""Food and Agriculture Orga...",4
5,Crops - FAO (2019),"{""dataPublishedBy"": ""Food and Agriculture Orga...",5
6,Crops processed - FAO (2019),"{""dataPublishedBy"": ""Food and Agriculture Orga...",6
7,Production Indices - FAO (2019),"{""dataPublishedBy"": ""Food and Agriculture Orga...",7
8,Live Animals - FAO (2019),"{""dataPublishedBy"": ""Food and Agriculture Orga...",8
9,Livestock Primary - FAO (2019),"{""dataPublishedBy"": ""Food and Agriculture Orga...",9


## Variables

In [11]:
# Maps each FAOSTAT archive file name to the free-text description that is stored
# with the variables created from that archive (description field of the
# variables table in the database).
# The texts were collected by hand, which is time-consuming; it is still an open
# question whether they are needed at all.

files_to_description = {
    "Food_Security_Data_E_All_Data_(Normalized).zip": "For detailed description of the indicators below see attached document: Average Dietary Supply Adequacy; Average Value of Food Production; Share of Dietary Energy Supply Derived from Cereals, Roots and Tubers; Average Protein Supply; Average Supply of Protein of Animal Origin; Rail lines Density (per 100 square km of land area); Percentage of Population Using At Least Basic Drinking Water Sources; Percentage of Population Using Safely Managed Drinking Water Sources; Percentage of Population Using At Least Basic Sanitation Services; Percentage of Population Using Safely Managed Sanitation Services; Cereal Import Dependency Ratio; Percent of Arable Land Equipped for Irrigation; Value of Food Imports in Total Merchandise Exports; Political Stability and Absence of Violence; Domestic Food Price Volatility Index; Per capita food production variability; Per Capita Food Supply Variability; Prevalence of Undernourishment; Prevalence of Severe Food Insecurity; Prevalence of Moderate or Severe Food Insecurity; Children aged <5 years wasted (%); Children aged <5 years stunted (%); Children aged <5 years overweight (%); Percentage of Adult Obesity; Prevalence of Anaemia among Women of Reproductive Age; Prevalence of Exclusive Breastfeeding among Infants 0-5 Months of Age; Prevalence of Low Birthweight; Number of Undernourished People; Number of Severely Food Insecure People; Prevalence of Moderately or Severely Food Insecure People; Minimum Dietary Energy Requirement (MDER); Average Dietary Energy Requirement (ADER); Coefficient of Variation of Habitual Caloric Consumption Distribution (CV); Skewness of Habitual Caloric Consumption Distribution (SK); Incidence of Caloric Losses at Retail Distribution Level; Dietary Energy Supply (DES); Average Fat Supply",
    "FoodBalanceSheets_E_All_Data_(Normalized).zip": "Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",
    "FoodSupply_Crops_E_All_Data_(Normalized).zip": "Food supply data is some of the most important data in FAOSTAT. In fact, this data is for the basis for estimation of global and national undernourishment assessment, when it is combined with parameters and other data sets. This data has been the foundation of food balance sheets ever since they were first constructed. The data is accessed by both business and governments for economic analysis and policy setting, as well as being used by the academic community.",
    "FoodSupply_LivestockFish_E_All_Data_(Normalized).zip": "Food supply data is some of the most important data in FAOSTAT. In fact, this data is for the basis for estimation of global and national undernourishment assessment, when it is combined with parameters and other data sets. This data has been the foundation of food balance sheets ever since they were first constructed. The data is accessed by both business and governments for economic analysis and policy setting, as well as being used by the academic community.",
    "Indicators_from_Household_Surveys_E_All_Data_(Normalized).zip": "",
    "Production_Crops_E_All_Data_(Normalized).zip": "Crop statistics are recorded for 173 products, covering the following categories: Crops Primary, Fibre Crops Primary, Cereals, Coarse Grain, Citrus Fruit, Fruit, Jute Jute-like Fibres, Oilcakes Equivalent, Oil crops Primary, Pulses, Roots and Tubers, Treenuts and Vegetables and Melons. Data are expressed in terms of area harvested, production quantity and yield. The objective is to comprehensively cover production of all primary crops for all countries and regions in the world.Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. Area data relate to harvested area. Some countries report sown or cultivated area only; however, in these countries the sown or cultivated area does not differ significantly in normal years from the area actually harvested, either because practically the whole area sown is harvested or because the area surveys are conducted around the harvest period.Vegetables, total (including melons): Data relate to vegetable crops grown mainly for human consumption. Crops such as cabbages, pumpkins and carrots, when explicitly cultivated for animal feed, are therefore excluded. Statistics on vegetables are not available in many countries, and the coverage of the reported data differs from country to country. In general, it appears that the data refer to crops grown in field and market gardens mainly for sale, thus excluding crops cultivated in kitchen gardens or small family gardens mainly for household consumption.Fruit, total (excluding melons): Data refer to total production of fresh fruit, whether finally used for direct consumption for food or feed, or processed into different products: dry fruit, juice, jam, alcohol, etc. Generally, production data relate to plantation crops or orchard crops grown mainly for sale. 
Data on production from scattered trees used mainly for home consumption are not usually collected. Production from wild plants, particularly berries, which is of some importance in certain countries, is generally disregarded by national statistical services. Therefore, the data for the various fruits and berries are rather incomplete. Bananas and plantains: Figures on bananas refer, as far as possible, to all edible fruit-bearing species of the genus Musa except Musa paradisiaca, commonly known as plantain. Unfortunately, several countries make no distinction in their statistics between bananas and plantains and publish only overall estimates. When this occurs and there is some indication or assumption that the data reported refer mainly to bananas, the data are included. The production data on bananas and plantains reported by the various countries are also difficult to compare because a number of countries report in terms of bunches, which generally means that the stalk is included in the weight. Dates, plantains and total grapes are included in the “total fruit” aggregated figures, while olives are excluded. Treenuts, aggregated: Production of nuts (including chestnuts) relates to nuts in the shell or in the husk. Statistics are very scanty and generally refer only to crops for sale. In addition to the kind of nuts shown separately, production data include all other treenuts mainly used as dessert or table nuts, such as pecan nuts, pili nuts, sapucaia nuts and macadamia nuts. Nuts mainly used for flavouring beverages are excludedas are masticatory and stimulant nuts and nuts used mainly for the extraction of oil or butter, including areca/betel nuts, cola nuts, illipe nuts, karate nuts, coconuts, tung nuts, oilpalm nuts etc.",
    "Production_CropsProcessed_E_All_Data_(Normalized).zip": "The dataset covers the following commodities: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine.",
    "Production_Indices_E_All_Data_(Normalized).zip": "The dataset includes data on gross and net production indices for various food and agriculture aggregates expressed in both totals and per capita.",
    "Production_Livestock_E_All_Data_(Normalized).zip": "The dataset contains the following commodities and commodity aggregates thereof : Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys.",
    "Production_LivestockPrimary_E_All_Data_(Normalized).zip": "The dataset contains the following commodities and commodity aggregates thereof : Beeswax; Eggs (various types); Hair, horse; Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat indigenous (ass, bird nes, buffalo, camel, cattle, chicken, duck, geese, goat, horse, mule, other camelids, pig, rabbit, rodents, sheep, turkey); Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins, furs; Skins (goat, sheep); Snails, not sea; Wool, greasy.Meat: Data relate to animals slaughtered within national boundaries, irrespective of their origin. All data shown relate to total meat production, that is, from both commercial and farm slaughter. Data are given in terms of dressed carcass weight, excluding offal and slaughter fats. Production of beef and buffalo meat includes veal; mutton and goat meat includes meat from lambs and kids; pig meat includes bacon and ham in fresh equivalent. Poultry meat includes meat from all domestic birds and refers, wherever possible, to ready-to-cook weight. Data on poultry-meat production reported by national statistical offices could be expressed in terms of either live weight, eviscerated weight, ready-to-cook weight or dressed weight. Data for countries reporting in other than ready-to-cook weight have been converted into the ready-to-cook equivalent.Milk: Data on milk production relate to total production of whole fresh milk, excluding the milk sucked by young animals but including amounts fed to livestock. Eggs: Some countries have no statistics on egg production, and estimates had to be derived from such related data as chicken or total poultry numbers and reported or assumed rates of egg laying. 
Most of the countries that have statistics on egg production report either the total weight of eggs or the numbers of eggs produced. Data generally refer to total production, including eggs for hatching, in both agricultural and non-agricultural sectors.Wool: Wool production statistics are generally given for greasy wool, which contains from 30 to 65 percent impurities. In order to make figures comparable, data are also given on a degreased (scoured) basis.Honey: Production data should cover the amount sold by beekeepers plus other recorded collection of honey. The data presented in the table are incomplete, particularly with regard to African and Asian countries.",
    "Production_LivestockProcessed_E_All_Data_(Normalized).zip": "The dataset covers the following commodities: Butter and ghee, sheep milk; Butter of goat milk; Butter, buffalo milk; Butter, cow milk; Cheese of goat milk; Cheese, buffalo milk; Cheese, sheep milk; Cheese, skimmed cow milk; Cheese, whole cow milk; Cream fresh; Ghee, butteroil of cow milk; Ghee, of buffalo milk; Lard; Milk, dry buttermilk; Milk, skimmed condensed; Milk, skimmed cow; Milk, skimmed dried; Milk, skimmed evaporated; Milk, whole condensed; Milk, whole dried; Milk, whole evaporated; Silk raw; Tallow; Whey, condensed; Whey, dry; Yoghurt."
}

In [26]:
import csv  # NOTE(review): not used in this cell; kept in case a later cell relies on it

# Build the variables table: one entry per distinct variable name found in the
# priority FAOSTAT archives, together with its unit, a running id and the id of
# the dataset the archive belongs to. A name that was already registered from
# an earlier archive is not registered again.

ids, names, units, dataset_ids = [], [], [], []

varname_description = {} # variable name -> description text, used when inserting variables into the db

unique_var_names = []
seen_var_names = set()  # mirrors unique_var_names; gives O(1) membership tests
path_to_unrar = "/Users/alex/Downloads/stats/faostat/data/"
ind = 0

# Columns that make up the variable name for each supported column layout.
# Keys are indexes into column_types; files with any other layout are skipped.
variable_name_columns = {
    0: ('Item',),
    1: ('Item', 'Element'),
    2: ('Item', 'Element'),
    3: ('Item', 'Element'),
    4: ('Indicator', 'Source'),
    8: ('Breakdown Variable',),
}

for item in tqdm(glob("/Volumes/Новый/FAOSTAT/*.zip")):
    item_name = item.split("/")[-1]
    if item_name not in priority:
        continue

    # Extract the first archive member (expected to be the CSV). The context
    # manager closes the archive even if extraction fails.
    with zipfile.ZipFile(item, 'r') as zip_ref:
        csv_filename = zip_ref.namelist()[0]
        zip_ref.extract(csv_filename, path_to_unrar)

    # Reset per file so that a file without a category can never inherit the
    # dataset id of the previously processed file.
    dataset_id = None
    for j in category_files:
        if item_name in category_files[j]:
            dataset_name = j + ": " + file_dataset_names[item_name] + " - FAO (2019)"
            dataset_id = datasets[datasets['name'] == dataset_name]['id'].values[0]

    data = pd.read_csv(path_to_unrar + csv_filename, encoding='latin-1')
    filecolumns = tuple(data.columns)

    layout = next((k for k in variable_name_columns if filecolumns == column_types[k]), None)
    if layout is None:
        # Unsupported column layout: no variables are created for this file.
        continue

    if dataset_id is None:
        raise ValueError("No dataset category found for %s" % item_name)

    # Build every row's variable name column-wise:
    # "<first>[ - <second>] (FAO (2019))".
    name_columns = variable_name_columns[layout]
    var_names = data[name_columns[0]].astype(str)
    for extra_column in name_columns[1:]:
        var_names = var_names + " - " + data[extra_column].astype(str)
    var_names = var_names + " (FAO (2019))"

    # Keep the first row of each name (file order) so the unit recorded for a
    # variable is the unit of its first occurrence.
    first_rows = pd.DataFrame({'name': var_names, 'unit': data['Unit']}).drop_duplicates(subset='name')

    for variablename, unit in zip(first_rows['name'], first_rows['unit']):
        if variablename in seen_var_names:
            continue
        seen_var_names.add(variablename)
        varname_description[variablename] = files_to_description[item_name]
        unique_var_names.append(variablename)
        units.append(unit)
        ids.append(ind)
        ind += 1
        dataset_ids.append(dataset_id)
        




  0%|          | 0/73 [00:00<?, ?it/s][A[A[A


 51%|█████     | 37/73 [00:10<00:10,  3.37it/s][A[A[A


 52%|█████▏    | 38/73 [26:52<4:40:21, 480.63s/it][A[A[A


 53%|█████▎    | 39/73 [36:45<4:51:27, 514.33s/it][A[A[A


 55%|█████▍    | 40/73 [43:25<4:24:03, 480.10s/it][A[A[A


 59%|█████▉    | 43/73 [44:00<2:49:45, 339.52s/it][A[A[A


 86%|████████▋ | 63/73 [49:28<40:25, 242.58s/it]  [A[A[A


 88%|████████▊ | 64/73 [49:43<26:10, 174.55s/it][A[A[A


 89%|████████▉ | 65/73 [51:09<19:43, 147.97s/it][A[A[A


 90%|█████████ | 66/73 [51:32<12:52, 110.38s/it][A[A[A


 92%|█████████▏| 67/73 [54:06<12:20, 123.41s/it][A[A[A


 93%|█████████▎| 68/73 [54:21<07:34, 90.86s/it] [A[A[A


100%|██████████| 73/73 [54:21<00:00, 44.67s/it][A[A[A

In [22]:
# Assemble the variables table from the parallel lists filled while scanning
# the archives; column order is id, name, unit, dataset_id.
variables = pd.DataFrame({
    'id': ids,
    'name': unique_var_names,
    'unit': units,
    'dataset_id': dataset_ids,
})

In [23]:
# Save the variables table to variables.csv, leaving out the DataFrame index.
variables.to_csv("variables.csv", index=False)

## Datapoints

These functions create the variable name for each row of a file. As shown above, variable names are built in several ways depending on the file's column layout. Here we add a `var_val` column holding these names, take its unique values, and for each unique value extract the matching subset of the original data. This takes quite a long time for large files (> 1 GB).

In [24]:
def create_var_val_1_2(row):
    """Store the "<Item> - <Element> (FAO (2019))" variable name in row['var_val'].

    The row is modified in place and returned, so the function can be passed
    to DataFrame.apply(..., axis=1).
    """
    item, element = row['Item'], row['Element']
    row['var_val'] = '%s - %s (FAO (2019))' % (item, element)
    return row

def create_var_val_0(row):
    """Store the "<Item> (FAO (2019))" variable name in row['var_val'].

    The name is built from the row's own Item value (not from the global
    `data` frame, whose 'Item' is the whole column). The row is modified in
    place and returned, so the function can be passed to
    DataFrame.apply(..., axis=1).
    """
    row['var_val'] = row['Item'] + " (FAO (2019))"
    return row

def create_var_val_3(row):
    """Store the "<Item> - <Element> (FAO (2019))" variable name in row['var_val'].

    Both parts come from the row itself (not from the global `data` frame,
    which would format entire columns into the name). The row is modified in
    place and returned, so the function can be passed to
    DataFrame.apply(..., axis=1).
    """
    row['var_val'] = '%s - %s (FAO (2019))' % (row['Item'], row['Element'])
    return row

def create_var_val_4(row):
    """Store the "<Indicator> - <Source> (FAO (2019))" variable name in row['var_val'].

    Both parts come from the row itself (the Source is no longer read from the
    global `data` frame, which would format the entire column into the name).
    The row is modified in place and returned, so the function can be passed
    to DataFrame.apply(..., axis=1).
    """
    row['var_val'] = '%s - %s (FAO (2019))' % (row['Indicator'], row['Source'])
    return row

def create_var_val_8(row):
    """Fill var_val, Year and Area for a row that carries a 'Survey' label.

    - var_val: "<Breakdown Variable> (FAO (2019))".
    - Year: taken from the last whitespace-separated token of the survey
      label. A token longer than four characters is treated as a
      "<start>-<end>" range and the floor of the midpoint is used. If that
      range cannot be parsed, the label is printed and Year is set to NaN so
      the row still has a Year entry.
    - Area: the text before the first "-" in the survey label.

    The row is modified in place and returned, so the function can be passed
    to DataFrame.apply(..., axis=1).
    """
    row['var_val'] = row['Breakdown Variable'] + " (FAO (2019))"

    survey = row['Survey']
    period = survey.split()[-1]

    if len(period) > 4:
        try:
            start, end = period.split("-")[:2]
            row['Year'] = (int(start) + int(end)) // 2
        except (ValueError, IndexError):
            # Report the label that could not be parsed and mark the year as
            # missing instead of leaving the key unset.
            print(survey)
            row['Year'] = float('nan')
    else:
        row['Year'] = int(period)

    # NOTE(review): splitting on "-" keeps any blank before the dash and would
    # cut an area name that itself contains a hyphen -- confirm against the data.
    row['Area'] = survey.split("-")[0]

    return row

In [25]:
# For every extracted CSV under data/, write one datapoints_<variable id>.csv
# (columns: year, country, value) per variable found in the file. Rows without
# a value are dropped.

# Columns that make up the variable name for the layouts whose names can be
# built column-wise. Keys are indexes into column_types.
datapoint_name_columns = {
    0: ('Item',),
    1: ('Item', 'Element'),
    2: ('Item', 'Element'),
    3: ('Item', 'Element'),
    4: ('Indicator', 'Source'),
}

# Variable name -> id, built once instead of filtering `variables` for every
# variable of every file. The first id wins if a name were ever duplicated.
var_id_by_name = variables.drop_duplicates(subset='name').set_index('name')['id'].to_dict()

os.makedirs('./datapoints', exist_ok=True)

for item in tqdm(glob("data/*.csv")):

    print(item)
    data = pd.read_csv(item, encoding='latin-1')
    filecolumns = tuple(data.columns)

    if filecolumns == column_types[8]:
        # This layout has no Year/Area columns; create_var_val_8 derives them
        # (and var_val) row by row from the Survey label.
        data = data.apply(create_var_val_8, axis=1)
    else:
        layout = next((k for k in datapoint_name_columns if filecolumns == column_types[k]), None)
        if layout is None:
            print("Skipping %s: unsupported column layout" % item)
            continue

        # Column-wise equivalent of the per-row name formatting:
        # "<first>[ - <second>] (FAO (2019))".
        name_columns = datapoint_name_columns[layout]
        var_names = data[name_columns[0]].astype(str)
        for extra_column in name_columns[1:]:
            var_names = var_names + " - " + data[extra_column].astype(str)
        data['var_val'] = var_names + " (FAO (2019))"

    # Some layouts name the entity column "Country" instead of "Area".
    country_column = 'Area' if 'Area' in data.columns else 'Country'

    # One pass over the file; sort=False keeps groups in first-appearance order.
    for var_name, sub_data in tqdm(data.groupby('var_val', sort=False)):

        var_id = var_id_by_name[var_name]
        res = pd.DataFrame({
            'year': sub_data['Year'],
            'country': sub_data[country_column],
            'value': sub_data['Value'],
        }).dropna(subset=['value'])
        res.to_csv('./datapoints/datapoints_%s.csv' % str(var_id), index=False)



  0%|          | 0/11 [00:00<?, ?it/s][A[A

data/Production_CropsProcessed_E_All_Data_(Normalized).csv





  0%|          | 0/21 [00:00<?, ?it/s][A[A[A


 14%|█▍        | 3/21 [00:00<00:00, 29.75it/s][A[A[A


 33%|███▎      | 7/21 [00:00<00:00, 32.20it/s][A[A[A


 48%|████▊     | 10/21 [00:00<00:00, 31.10it/s][A[A[A


 67%|██████▋   | 14/21 [00:00<00:00, 32.75it/s][A[A[A


 90%|█████████ | 19/21 [00:00<00:00, 34.88it/s][A[A[A


100%|██████████| 21/21 [00:00<00:00, 36.21it/s][A[A[A

  9%|▉         | 1/11 [01:34<15:46, 94.70s/it][A[A

data/Production_LivestockPrimary_E_All_Data_(Normalized).csv





  0%|          | 0/160 [00:00<?, ?it/s][A[A[A


  1%|          | 1/160 [00:00<00:18,  8.69it/s][A[A[A


  2%|▏         | 3/160 [00:00<00:17,  9.15it/s][A[A[A


  2%|▎         | 4/160 [00:00<00:16,  9.38it/s][A[A[A


  4%|▍         | 6/160 [00:00<00:15,  9.80it/s][A[A[A


  4%|▍         | 7/160 [00:00<00:15,  9.86it/s][A[A[A


  6%|▌         | 9/160 [00:00<00:14, 10.58it/s][A[A[A


  7%|▋         | 11/160 [00:01<00:14, 10.36it/s][A[A[A


  8%|▊         | 13/160 [00:01<00:14,  9.83it/s][A[A[A


  9%|▉         | 15/160 [00:01<00:15,  9.59it/s][A[A[A


 11%|█         | 17/160 [00:01<00:15,  9.42it/s][A[A[A


 12%|█▏        | 19/160 [00:01<00:13, 10.21it/s][A[A[A


 13%|█▎        | 21/160 [00:02<00:12, 11.26it/s][A[A[A


 14%|█▍        | 23/160 [00:02<00:12, 10.78it/s][A[A[A


 16%|█▌        | 25/160 [00:02<00:12, 10.69it/s][A[A[A


 17%|█▋        | 27/160 [00:02<00:12, 10.59it/s][A[A[A


 18%|█▊        | 29/160 [00:02<00:12, 10.74it/s][A

data/Production_Crops_E_All_Data_(Normalized).csv





  0%|          | 0/527 [00:00<?, ?it/s][A[A[A


  0%|          | 1/527 [00:00<01:36,  5.48it/s][A[A[A


  0%|          | 2/527 [00:00<01:30,  5.83it/s][A[A[A


  1%|          | 3/527 [00:00<01:25,  6.12it/s][A[A[A


  1%|          | 4/527 [00:00<01:21,  6.42it/s][A[A[A


  1%|          | 5/527 [00:00<01:18,  6.62it/s][A[A[A


  1%|          | 6/527 [00:00<01:17,  6.74it/s][A[A[A


  1%|▏         | 7/527 [00:01<01:17,  6.67it/s][A[A[A


  2%|▏         | 8/527 [00:01<01:17,  6.71it/s][A[A[A


  2%|▏         | 9/527 [00:01<01:17,  6.65it/s][A[A[A


  2%|▏         | 10/527 [00:01<01:17,  6.64it/s][A[A[A


  2%|▏         | 11/527 [00:01<01:16,  6.74it/s][A[A[A


  2%|▏         | 12/527 [00:01<01:16,  6.73it/s][A[A[A


  2%|▏         | 13/527 [00:01<01:16,  6.70it/s][A[A[A


  3%|▎         | 14/527 [00:02<01:17,  6.64it/s][A[A[A


  3%|▎         | 15/527 [00:02<01:17,  6.63it/s][A[A[A


  3%|▎         | 16/527 [00:02<01:15,  6.76it/s][A[A

 26%|██▌       | 136/527 [00:21<01:05,  6.00it/s][A[A[A


 26%|██▌       | 137/527 [00:21<01:04,  6.02it/s][A[A[A


 26%|██▌       | 138/527 [00:21<01:05,  5.95it/s][A[A[A


 26%|██▋       | 139/527 [00:21<01:05,  5.92it/s][A[A[A


 27%|██▋       | 140/527 [00:21<01:05,  5.91it/s][A[A[A


 27%|██▋       | 141/527 [00:22<01:03,  6.07it/s][A[A[A


 27%|██▋       | 142/527 [00:22<01:02,  6.21it/s][A[A[A


 27%|██▋       | 143/527 [00:22<01:01,  6.23it/s][A[A[A


 27%|██▋       | 144/527 [00:22<01:02,  6.09it/s][A[A[A


 28%|██▊       | 145/527 [00:22<01:03,  5.98it/s][A[A[A


 28%|██▊       | 146/527 [00:22<01:05,  5.80it/s][A[A[A


 28%|██▊       | 147/527 [00:23<01:04,  5.93it/s][A[A[A


 28%|██▊       | 148/527 [00:23<01:02,  6.02it/s][A[A[A


 28%|██▊       | 149/527 [00:23<01:01,  6.13it/s][A[A[A


 28%|██▊       | 150/527 [00:23<01:01,  6.13it/s][A[A[A


 29%|██▊       | 151/527 [00:23<01:00,  6.23it/s][A[A[A


 29%|██▉       | 152/527

 51%|█████     | 270/527 [00:41<00:38,  6.63it/s][A[A[A


 51%|█████▏    | 271/527 [00:41<00:38,  6.58it/s][A[A[A


 52%|█████▏    | 272/527 [00:41<00:39,  6.53it/s][A[A[A


 52%|█████▏    | 273/527 [00:42<00:38,  6.57it/s][A[A[A


 52%|█████▏    | 274/527 [00:42<00:39,  6.44it/s][A[A[A


 52%|█████▏    | 275/527 [00:42<00:38,  6.52it/s][A[A[A


 52%|█████▏    | 276/527 [00:42<00:38,  6.58it/s][A[A[A


 53%|█████▎    | 277/527 [00:42<00:37,  6.65it/s][A[A[A


 53%|█████▎    | 278/527 [00:42<00:37,  6.69it/s][A[A[A


 53%|█████▎    | 279/527 [00:43<00:36,  6.83it/s][A[A[A


 53%|█████▎    | 280/527 [00:43<00:35,  6.94it/s][A[A[A


 53%|█████▎    | 281/527 [00:43<00:35,  7.02it/s][A[A[A


 54%|█████▎    | 282/527 [00:43<00:35,  6.84it/s][A[A[A


 54%|█████▎    | 283/527 [00:43<00:36,  6.74it/s][A[A[A


 54%|█████▍    | 284/527 [00:43<00:36,  6.64it/s][A[A[A


 54%|█████▍    | 285/527 [00:43<00:36,  6.68it/s][A[A[A


 54%|█████▍    | 286/527

 77%|███████▋  | 404/527 [01:01<00:17,  7.16it/s][A[A[A


 77%|███████▋  | 405/527 [01:01<00:16,  7.18it/s][A[A[A


 77%|███████▋  | 406/527 [01:01<00:16,  7.19it/s][A[A[A


 77%|███████▋  | 407/527 [01:01<00:16,  7.20it/s][A[A[A


 77%|███████▋  | 408/527 [01:01<00:16,  7.19it/s][A[A[A


 78%|███████▊  | 409/527 [01:01<00:16,  7.15it/s][A[A[A


 78%|███████▊  | 410/527 [01:01<00:16,  7.15it/s][A[A[A


 78%|███████▊  | 411/527 [01:02<00:16,  7.17it/s][A[A[A


 78%|███████▊  | 412/527 [01:02<00:16,  7.14it/s][A[A[A


 78%|███████▊  | 413/527 [01:02<00:16,  7.08it/s][A[A[A


 79%|███████▊  | 414/527 [01:02<00:15,  7.10it/s][A[A[A


 79%|███████▊  | 415/527 [01:02<00:15,  7.16it/s][A[A[A


 79%|███████▉  | 416/527 [01:02<00:15,  7.18it/s][A[A[A


 79%|███████▉  | 417/527 [01:02<00:15,  7.23it/s][A[A[A


 79%|███████▉  | 418/527 [01:03<00:15,  7.17it/s][A[A[A


 80%|███████▉  | 419/527 [01:03<00:14,  7.21it/s][A[A[A


 80%|███████▉  | 420/527

data/Food_Security_Data_E_All_Data_(Normalized).csv





  0%|          | 0/36 [00:00<?, ?it/s][A[A[A


 19%|█▉        | 7/36 [00:00<00:00, 64.14it/s][A[A[A


 42%|████▏     | 15/36 [00:00<00:00, 67.80it/s][A[A[A


 64%|██████▍   | 23/36 [00:00<00:00, 69.39it/s][A[A[A


 86%|████████▌ | 31/36 [00:00<00:00, 71.11it/s][A[A[A


100%|██████████| 36/36 [00:00<00:00, 75.25it/s][A[A[A

 36%|███▋      | 4/11 [52:09<1:12:26, 620.94s/it][A[A

data/FoodSupply_LivestockFish_E_All_Data_(Normalized).csv





  0%|          | 0/203 [00:00<?, ?it/s][A[A[A


  0%|          | 1/203 [00:00<00:38,  5.30it/s][A[A[A


  1%|          | 2/203 [00:00<00:36,  5.54it/s][A[A[A


  1%|▏         | 3/203 [00:00<00:34,  5.81it/s][A[A[A


  2%|▏         | 4/203 [00:00<00:33,  6.03it/s][A[A[A


  2%|▏         | 5/203 [00:00<00:31,  6.24it/s][A[A[A


  3%|▎         | 6/203 [00:00<00:30,  6.39it/s][A[A[A


  3%|▎         | 7/203 [00:01<00:30,  6.48it/s][A[A[A


  4%|▍         | 8/203 [00:01<00:29,  6.53it/s][A[A[A


  4%|▍         | 9/203 [00:01<00:29,  6.59it/s][A[A[A


  5%|▍         | 10/203 [00:01<00:29,  6.62it/s][A[A[A


  5%|▌         | 11/203 [00:01<00:29,  6.59it/s][A[A[A


  6%|▌         | 12/203 [00:01<00:28,  6.61it/s][A[A[A


  6%|▋         | 13/203 [00:02<00:28,  6.64it/s][A[A[A


  7%|▋         | 14/203 [00:02<00:28,  6.68it/s][A[A[A


  7%|▋         | 15/203 [00:02<00:27,  6.75it/s][A[A[A


  8%|▊         | 16/203 [00:02<00:27,  6.76it/s][A[A

 67%|██████▋   | 136/203 [00:20<00:10,  6.64it/s][A[A[A


 67%|██████▋   | 137/203 [00:20<00:09,  6.62it/s][A[A[A


 68%|██████▊   | 138/203 [00:21<00:09,  6.68it/s][A[A[A


 68%|██████▊   | 139/203 [00:21<00:09,  6.74it/s][A[A[A


 69%|██████▉   | 140/203 [00:21<00:09,  6.74it/s][A[A[A


 69%|██████▉   | 141/203 [00:21<00:09,  6.79it/s][A[A[A


 70%|██████▉   | 142/203 [00:21<00:08,  6.86it/s][A[A[A


 70%|███████   | 143/203 [00:21<00:08,  6.89it/s][A[A[A


 71%|███████   | 144/203 [00:21<00:08,  6.87it/s][A[A[A


 71%|███████▏  | 145/203 [00:22<00:08,  6.93it/s][A[A[A


 72%|███████▏  | 146/203 [00:22<00:08,  6.94it/s][A[A[A


 72%|███████▏  | 147/203 [00:22<00:08,  6.98it/s][A[A[A


 73%|███████▎  | 148/203 [00:22<00:07,  7.01it/s][A[A[A


 73%|███████▎  | 149/203 [00:22<00:07,  7.03it/s][A[A[A


 74%|███████▍  | 150/203 [00:22<00:07,  7.00it/s][A[A[A


 74%|███████▍  | 151/203 [00:22<00:07,  7.02it/s][A[A[A


 75%|███████▍  | 152/203

data/Production_LivestockProcessed_E_All_Data_(Normalized).csv





  0%|          | 0/30 [00:00<?, ?it/s][A[A[A


 17%|█▋        | 5/30 [00:00<00:00, 40.61it/s][A[A[A


 30%|███       | 9/30 [00:00<00:00, 39.14it/s][A[A[A


 47%|████▋     | 14/30 [00:00<00:00, 40.84it/s][A[A[A


 63%|██████▎   | 19/30 [00:00<00:00, 41.76it/s][A[A[A


 83%|████████▎ | 25/30 [00:00<00:00, 45.41it/s][A[A[A


100%|██████████| 30/30 [00:00<00:00, 46.97it/s][A[A[A

 55%|█████▍    | 6/11 [1:33:04<1:09:10, 830.16s/it] [A[A

data/Indicators_from_Household_Surveys_E_All_Data_(Normalized).csv





  0%|          | 0/27 [00:00<?, ?it/s][A[A[A


 15%|█▍        | 4/27 [00:00<00:00, 30.31it/s][A[A[A


 26%|██▌       | 7/27 [00:00<00:00, 29.28it/s][A[A[A


 37%|███▋      | 10/27 [00:00<00:00, 27.98it/s][A[A[A


 48%|████▊     | 13/27 [00:00<00:00, 27.61it/s][A[A[A


 59%|█████▉    | 16/27 [00:00<00:00, 26.84it/s][A[A[A


 70%|███████   | 19/27 [00:00<00:00, 27.61it/s][A[A[A


 85%|████████▌ | 23/27 [00:00<00:00, 29.87it/s][A[A[A


100%|██████████| 27/27 [00:00<00:00, 30.17it/s][A[A[A

 64%|██████▎   | 7/11 [1:38:24<45:08, 677.00s/it]  [A[A

data/Production_Livestock_E_All_Data_(Normalized).csv





  0%|          | 0/22 [00:00<?, ?it/s][A[A[A


 14%|█▎        | 3/22 [00:00<00:00, 21.37it/s][A[A[A


 23%|██▎       | 5/22 [00:00<00:00, 19.31it/s][A[A[A


 36%|███▋      | 8/22 [00:00<00:00, 19.57it/s][A[A[A


 45%|████▌     | 10/22 [00:00<00:00, 18.64it/s][A[A[A


 55%|█████▍    | 12/22 [00:00<00:00, 18.22it/s][A[A[A


 68%|██████▊   | 15/22 [00:00<00:00, 20.10it/s][A[A[A


 77%|███████▋  | 17/22 [00:00<00:00, 19.53it/s][A[A[A


 95%|█████████▌| 21/22 [00:00<00:00, 22.74it/s][A[A[A


100%|██████████| 22/22 [00:01<00:00, 21.61it/s][A[A[A

 73%|███████▎  | 8/11 [1:40:40<25:43, 514.65s/it][A[A

data/Production_Indices_E_All_Data_(Normalized).csv





  0%|          | 0/64 [00:00<?, ?it/s][A[A[A


  3%|▎         | 2/64 [00:00<00:04, 15.42it/s][A[A[A


  6%|▋         | 4/64 [00:00<00:03, 16.17it/s][A[A[A


  9%|▉         | 6/64 [00:00<00:03, 16.74it/s][A[A[A


 12%|█▎        | 8/64 [00:00<00:03, 17.14it/s][A[A[A


 16%|█▌        | 10/64 [00:00<00:03, 15.77it/s][A[A[A


 19%|█▉        | 12/64 [00:00<00:03, 15.01it/s][A[A[A


 22%|██▏       | 14/64 [00:00<00:03, 14.37it/s][A[A[A


 25%|██▌       | 16/64 [00:01<00:03, 14.15it/s][A[A[A


 28%|██▊       | 18/64 [00:01<00:03, 13.98it/s][A[A[A


 31%|███▏      | 20/64 [00:01<00:03, 13.92it/s][A[A[A


 34%|███▍      | 22/64 [00:01<00:03, 13.89it/s][A[A[A


 38%|███▊      | 24/64 [00:01<00:02, 13.60it/s][A[A[A


 41%|████      | 26/64 [00:01<00:02, 14.03it/s][A[A[A


 44%|████▍     | 28/64 [00:01<00:02, 14.44it/s][A[A[A


 47%|████▋     | 30/64 [00:02<00:02, 14.13it/s][A[A[A


 50%|█████     | 32/64 [00:02<00:02, 13.98it/s][A[A[A


 53%|██

data/FoodSupply_Crops_E_All_Data_(Normalized).csv





  0%|          | 0/529 [00:00<?, ?it/s][A[A[A


  0%|          | 1/529 [00:00<05:44,  1.53it/s][A[A[A


  0%|          | 2/529 [00:01<05:05,  1.72it/s][A[A[A


  1%|          | 3/529 [00:01<04:23,  1.99it/s][A[A[A


  1%|          | 4/529 [00:01<03:55,  2.23it/s][A[A[A


  1%|          | 5/529 [00:02<03:32,  2.47it/s][A[A[A


  1%|          | 6/529 [00:02<03:10,  2.74it/s][A[A[A


  1%|▏         | 7/529 [00:02<03:00,  2.90it/s][A[A[A


  2%|▏         | 8/529 [00:02<03:00,  2.89it/s][A[A[A


  2%|▏         | 9/529 [00:03<03:04,  2.81it/s][A[A[A


  2%|▏         | 10/529 [00:03<03:03,  2.82it/s][A[A[A


  2%|▏         | 11/529 [00:03<02:55,  2.95it/s][A[A[A


  2%|▏         | 12/529 [00:04<03:23,  2.54it/s][A[A[A


  2%|▏         | 13/529 [00:04<03:13,  2.67it/s][A[A[A


  3%|▎         | 14/529 [00:05<03:01,  2.84it/s][A[A[A


  3%|▎         | 15/529 [00:05<03:02,  2.81it/s][A[A[A


  3%|▎         | 16/529 [00:05<03:15,  2.63it/s][A[A

 26%|██▌       | 136/529 [00:46<02:02,  3.21it/s][A[A[A


 26%|██▌       | 137/529 [00:47<02:06,  3.10it/s][A[A[A


 26%|██▌       | 138/529 [00:47<02:09,  3.01it/s][A[A[A


 26%|██▋       | 139/529 [00:47<02:05,  3.10it/s][A[A[A


 26%|██▋       | 140/529 [00:48<02:05,  3.11it/s][A[A[A


 27%|██▋       | 141/529 [00:48<02:06,  3.08it/s][A[A[A


 27%|██▋       | 142/529 [00:48<02:02,  3.15it/s][A[A[A


 27%|██▋       | 143/529 [00:48<01:59,  3.22it/s][A[A[A


 27%|██▋       | 144/529 [00:49<02:01,  3.17it/s][A[A[A


 27%|██▋       | 145/529 [00:49<01:59,  3.21it/s][A[A[A


 28%|██▊       | 146/529 [00:49<02:04,  3.09it/s][A[A[A


 28%|██▊       | 147/529 [00:50<02:04,  3.06it/s][A[A[A


 28%|██▊       | 148/529 [00:50<02:27,  2.58it/s][A[A[A


 28%|██▊       | 149/529 [00:51<02:21,  2.68it/s][A[A[A


 28%|██▊       | 150/529 [00:51<02:19,  2.71it/s][A[A[A


 29%|██▊       | 151/529 [00:51<02:24,  2.61it/s][A[A[A


 29%|██▊       | 152/529

 51%|█████     | 270/529 [01:28<01:15,  3.44it/s][A[A[A


 51%|█████     | 271/529 [01:28<01:14,  3.47it/s][A[A[A


 51%|█████▏    | 272/529 [01:28<01:12,  3.55it/s][A[A[A


 52%|█████▏    | 273/529 [01:28<01:11,  3.60it/s][A[A[A


 52%|█████▏    | 274/529 [01:29<01:10,  3.61it/s][A[A[A


 52%|█████▏    | 275/529 [01:29<01:11,  3.54it/s][A[A[A


 52%|█████▏    | 276/529 [01:29<01:10,  3.57it/s][A[A[A


 52%|█████▏    | 277/529 [01:30<01:11,  3.54it/s][A[A[A


 53%|█████▎    | 278/529 [01:30<01:09,  3.61it/s][A[A[A


 53%|█████▎    | 279/529 [01:30<01:09,  3.58it/s][A[A[A


 53%|█████▎    | 280/529 [01:30<01:10,  3.55it/s][A[A[A


 53%|█████▎    | 281/529 [01:31<01:10,  3.50it/s][A[A[A


 53%|█████▎    | 282/529 [01:31<01:11,  3.48it/s][A[A[A


 53%|█████▎    | 283/529 [01:31<01:11,  3.46it/s][A[A[A


 54%|█████▎    | 284/529 [01:32<01:10,  3.45it/s][A[A[A


 54%|█████▍    | 285/529 [01:32<01:11,  3.43it/s][A[A[A


 54%|█████▍    | 286/529

 76%|███████▋  | 404/529 [02:06<00:35,  3.54it/s][A[A[A


 77%|███████▋  | 405/529 [02:06<00:35,  3.51it/s][A[A[A


 77%|███████▋  | 406/529 [02:06<00:34,  3.57it/s][A[A[A


 77%|███████▋  | 407/529 [02:07<00:33,  3.59it/s][A[A[A


 77%|███████▋  | 408/529 [02:07<00:34,  3.56it/s][A[A[A


 77%|███████▋  | 409/529 [02:07<00:33,  3.53it/s][A[A[A


 78%|███████▊  | 410/529 [02:08<00:33,  3.55it/s][A[A[A


 78%|███████▊  | 411/529 [02:08<00:33,  3.55it/s][A[A[A


 78%|███████▊  | 412/529 [02:08<00:33,  3.54it/s][A[A[A


 78%|███████▊  | 413/529 [02:08<00:32,  3.59it/s][A[A[A


 78%|███████▊  | 414/529 [02:09<00:31,  3.64it/s][A[A[A


 78%|███████▊  | 415/529 [02:09<00:31,  3.61it/s][A[A[A


 79%|███████▊  | 416/529 [02:09<00:31,  3.58it/s][A[A[A


 79%|███████▉  | 417/529 [02:10<00:31,  3.58it/s][A[A[A


 79%|███████▉  | 418/529 [02:10<00:30,  3.61it/s][A[A[A


 79%|███████▉  | 419/529 [02:10<00:29,  3.67it/s][A[A[A


 79%|███████▉  | 420/529

data/FoodBalanceSheets_E_All_Data_(Normalized).csv


KeyboardInterrupt: 

## Get country names

In [32]:
# Collect every distinct country name used in the standardized datapoint
# files and write them to a one-column CSV (column "name").
countries = set()

for datapoints_path in tqdm(glob('datapoints/*.csv')):
    # Only the 'country' column is needed, so skip parsing the other columns.
    country_column = pd.read_csv(datapoints_path, usecols=['country'])['country']
    # A missing country is not a name: drop it so it is not exported as a
    # blank row. unique() de-duplicates per file before the set merges
    # the names across files.
    countries.update(country_column.dropna().unique())

# Sort the names so the output file is identical from run to run; iteration
# order of a set of strings is not stable between interpreter sessions.
res = pd.DataFrame({'name': sorted(countries)})
res.to_csv("distinct_countries_standardized.csv", index=False)



  0%|          | 0/2743 [00:00<?, ?it/s][A[A

  0%|          | 1/2743 [00:22<16:50:02, 22.10s/it][A[A

  0%|          | 11/2743 [00:22<11:44:35, 15.47s/it][A[A

  1%|          | 27/2743 [00:22<8:10:25, 10.83s/it] [A[A

  1%|▏         | 41/2743 [00:22<5:41:37,  7.59s/it][A[A

  2%|▏         | 52/2743 [00:22<3:58:17,  5.31s/it][A[A

  2%|▏         | 66/2743 [00:22<2:46:02,  3.72s/it][A[A

  3%|▎         | 83/2743 [00:22<1:55:33,  2.61s/it][A[A

  4%|▎         | 100/2743 [00:22<1:20:27,  1.83s/it][A[A

  4%|▍         | 114/2743 [00:22<56:07,  1.28s/it]  [A[A

  5%|▍         | 128/2743 [00:23<39:12,  1.11it/s][A[A

  5%|▌         | 141/2743 [00:23<27:24,  1.58it/s][A[A

  6%|▌         | 156/2743 [00:23<19:09,  2.25it/s][A[A

  6%|▋         | 173/2743 [00:23<13:24,  3.19it/s][A[A

  7%|▋         | 188/2743 [00:23<09:26,  4.51it/s][A[A

  7%|▋         | 203/2743 [00:23<06:39,  6.36it/s][A[A

  8%|▊         | 217/2743 [00:23<04:45,  8.85it/s][A[A

  8%|▊   

 78%|███████▊  | 2150/2743 [00:37<00:04, 138.61it/s][A[A

 79%|███████▉  | 2164/2743 [00:37<00:04, 130.22it/s][A[A

 79%|███████▉  | 2178/2743 [00:37<00:04, 129.31it/s][A[A

 80%|███████▉  | 2193/2743 [00:37<00:04, 133.58it/s][A[A

 80%|████████  | 2208/2743 [00:37<00:03, 137.22it/s][A[A

 81%|████████  | 2223/2743 [00:37<00:03, 139.02it/s][A[A

 82%|████████▏ | 2238/2743 [00:37<00:03, 140.57it/s][A[A

 82%|████████▏ | 2253/2743 [00:37<00:03, 142.12it/s][A[A

 83%|████████▎ | 2268/2743 [00:38<00:03, 140.13it/s][A[A

 83%|████████▎ | 2283/2743 [00:38<00:03, 134.51it/s][A[A

 84%|████████▍ | 2298/2743 [00:38<00:03, 136.71it/s][A[A

 84%|████████▍ | 2312/2743 [00:38<00:03, 137.02it/s][A[A

 85%|████████▍ | 2326/2743 [00:38<00:03, 133.61it/s][A[A

 85%|████████▌ | 2340/2743 [00:38<00:03, 131.82it/s][A[A

 86%|████████▌ | 2354/2743 [00:38<00:02, 131.89it/s][A[A

 86%|████████▋ | 2369/2743 [00:38<00:02, 136.20it/s][A[A

 87%|████████▋ | 2383/2743 [00:38<00:02,

## Insert into the database

In [27]:
# Load the prepared CSVs (entities, datasets, sources, variables, datapoints)
# into the database through DBUtils, in dependency order:
# entities -> datasets -> sources -> variables -> data_values.
#
# NOTE: `varname_description` is not defined in this cell; it must already
# exist in the kernel namespace when the cell is run.
with connection as c:
    db = DBUtils(c)

    entities = pd.read_csv("distinct_countries_standardized_copy.csv")
    datasets = pd.read_csv("datasets.csv")
    sources = pd.read_csv("sources.csv")
    variables = pd.read_csv('variables.csv')

    # Create a db entity for every country that has no db_entity_id yet and
    # record the returned id in the in-memory frame.
    new_entities = entities[entities['db_entity_id'].isnull()]
    for entity_idx, entity in new_entities.iterrows():
        entities.loc[entity_idx, 'db_entity_id'] = db.get_or_create_entity(entity['name'])

    # Lookup tables built once instead of filtering a DataFrame for every row.
    # drop_duplicates keeps the first occurrence, matching the previous
    # `frame[frame[col] == key][...].values[0]` lookups.
    dataset_names_by_csv_id = datasets.drop_duplicates('id').set_index('id')['name'].to_dict()
    variable_names_by_csv_id = variables.drop_duplicates('id').set_index('id')['name'].to_dict()
    # Built after the loop above so newly created entity ids are included.
    entity_ids_by_name = entities.drop_duplicates('name').set_index('name')['db_entity_id'].to_dict()

    # upsert datasets
    dataset_name_ids = {}
    for _, row in tqdm(datasets.iterrows(), total=len(datasets)):
        dataset_id = db.upsert_dataset(name=row['name'], namespace="faostat", user_id=15)
        dataset_name_ids[row['name']] = dataset_id

    # upsert sources
    dataset_to_source_ids = {}
    for _, row in tqdm(sources.iterrows(), total=len(sources)):
        dataset_name = dataset_names_by_csv_id[row['dataset_id']]
        # NOTE(review): row['description'] comes straight from the CSV cell; if
        # that cell already holds JSON text, json.dumps encodes it a second
        # time. Confirm how sources.csv was written before changing this.
        source_id = db.upsert_source(
            name=row['name'],
            description=json.dumps(row['description']),
            dataset_id=dataset_name_ids[dataset_name],
        )
        dataset_to_source_ids[dataset_name] = source_id

    # upsert variables
    names_to_ids = {}
    for _, row in tqdm(variables.iterrows(), total=len(variables)):
        dataset_name = dataset_names_by_csv_id[row['dataset_id']]
        dataset_id = dataset_name_ids[dataset_name]
        source_id = dataset_to_source_ids[dataset_name]

        # An empty unit cell is read as NaN (a float), which has no len();
        # treat any non-string unit like an over-long one and store "".
        raw_unit = row['unit']
        unit = raw_unit if isinstance(raw_unit, str) and len(raw_unit) < 512 else ""

        variable_id = db.upsert_variable(
            name=row['name'],
            code=None,
            unit=unit,
            short_unit=None,
            source_id=source_id,
            dataset_id=dataset_id,
            description=varname_description[row['name']],
            timespan='',
            coverage='',
            display={},
        )
        names_to_ids[row['name']] = variable_id

    # Inserting datapoints
    upsert_data_value_sql = """
                    INSERT INTO data_values
                        (value, year, entityId, variableId)
                    VALUES
                        (%s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE
                        value = VALUES(value),
                        year = VALUES(year),
                        entityId = VALUES(entityId),
                        variableId = VALUES(variableId)
                """

    datapoints_files = glob("datapoints/*.csv")
    for x in tqdm(datapoints_files):
        # The variable's CSV id is the number after the last underscore of the
        # file name (e.g. datapoints_777.csv). Parse the base name only, so an
        # underscore in a directory name cannot shift the split.
        file_stem = os.path.splitext(os.path.basename(x))[0]
        v_id = int(file_stem.rsplit("_", 1)[1])

        # CSV variable id -> variable name -> id assigned by the database
        variable_id = names_to_ids[variable_names_by_csv_id[v_id]]
        data = pd.read_csv(x)

        for country, year, val in zip(data['country'], data['year'], data['value']):
            # Raises KeyError if the country is absent from the entities file.
            entity_id = entity_ids_by_name[country]

            try:
                db.upsert_one(
                    upsert_data_value_sql,
                    [val, int(year), str(int(entity_id)), str(variable_id)],
                )
            except Exception as error:
                # Keep going on a bad row but report what failed and why.
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop the cell instead of being swallowed here.
                print(x)
                print(year, val, entity_id, variable_id)
                print(repr(error))
    




0it [00:00, ?it/s][A[A[A


11it [00:00, 159.87it/s][A[A[A


0it [00:00, ?it/s][A[A[A


11it [00:00, 157.74it/s][A[A[A


0it [00:00, ?it/s][A[A[A


15it [00:00, 149.14it/s][A[A[A


48it [00:00, 178.14it/s][A[A[A


88it [00:00, 212.98it/s][A[A[A


120it [00:00, 235.86it/s][A[A[A


161it [00:00, 269.90it/s][A[A[A


193it [00:00, 282.56it/s][A[A[A


239it [00:00, 318.55it/s][A[A[A


281it [00:00, 342.27it/s][A[A[A


328it [00:00, 370.84it/s][A[A[A


368it [00:01, 375.26it/s][A[A[A


408it [00:01, 350.36it/s][A[A[A


445it [00:01, 352.81it/s][A[A[A


489it [00:01, 373.60it/s][A[A[A


529it [00:01, 379.24it/s][A[A[A


568it [00:01, 370.19it/s][A[A[A


606it [00:01, 371.20it/s][A[A[A


644it [00:01, 364.29it/s][A[A[A


691it [00:01, 390.47it/s][A[A[A


733it [00:01, 395.87it/s][A[A[A


774it [00:02, 389.81it/s][A[A[A


816it [00:02, 397.57it/s][A[A[A


863it [00:02, 415.34it/s][A[A[A


909it [00:02, 426.26it/s

datapoints/datapoints_777.csv
1995 5.0 168.0 127653





  1%|          | 31/2743 [04:04<6:57:32,  9.24s/it][A[A[A


  1%|          | 32/2743 [04:07<5:31:49,  7.34s/it][A[A[A


  1%|          | 33/2743 [04:18<6:18:37,  8.38s/it][A[A[A


  1%|          | 34/2743 [04:20<4:52:36,  6.48s/it][A[A[A


  1%|▏         | 35/2743 [04:30<5:49:17,  7.74s/it][A[A[A


  1%|▏         | 36/2743 [04:41<6:32:58,  8.71s/it][A[A[A


  1%|▏         | 37/2743 [04:46<5:38:52,  7.51s/it][A[A[A


  1%|▏         | 38/2743 [04:52<5:14:22,  6.97s/it][A[A[A


  1%|▏         | 39/2743 [05:02<6:03:23,  8.06s/it][A[A[A


  1%|▏         | 40/2743 [05:13<6:41:08,  8.90s/it][A[A[A


  1%|▏         | 41/2743 [05:24<6:58:26,  9.29s/it][A[A[A


  2%|▏         | 42/2743 [05:35<7:23:00,  9.84s/it][A[A[A


  2%|▏         | 43/2743 [05:39<6:08:06,  8.18s/it][A[A[A


  2%|▏         | 44/2743 [05:50<6:47:08,  9.05s/it][A[A[A


  2%|▏         | 45/2743 [06:01<7:16:46,  9.71s/it][A[A[A


  2%|▏         | 46/2743 [06:14<7:59:42, 10.67s/it]

  6%|▌         | 159/2743 [23:14<6:08:56,  8.57s/it][A[A[A


  6%|▌         | 160/2743 [23:25<6:48:28,  9.49s/it][A[A[A


  6%|▌         | 161/2743 [23:28<5:23:02,  7.51s/it][A[A[A


  6%|▌         | 162/2743 [23:31<4:20:33,  6.06s/it][A[A[A


  6%|▌         | 163/2743 [23:44<5:48:27,  8.10s/it][A[A[A


  6%|▌         | 164/2743 [23:48<5:00:26,  6.99s/it][A[A[A


  6%|▌         | 165/2743 [23:50<3:58:37,  5.55s/it][A[A[A


  6%|▌         | 166/2743 [23:55<3:49:59,  5.35s/it][A[A[A


  6%|▌         | 167/2743 [23:58<3:17:19,  4.60s/it][A[A[A


  6%|▌         | 168/2743 [24:02<3:15:38,  4.56s/it][A[A[A


  6%|▌         | 169/2743 [24:03<2:28:33,  3.46s/it][A[A[A


  6%|▌         | 170/2743 [24:06<2:11:52,  3.08s/it][A[A[A


  6%|▌         | 171/2743 [24:11<2:41:49,  3.77s/it][A[A[A


  6%|▋         | 172/2743 [24:16<3:01:21,  4.23s/it][A[A[A


  6%|▋         | 173/2743 [24:28<4:42:23,  6.59s/it][A[A[A


  6%|▋         | 174/2743 [24:41<6:03:08

KeyboardInterrupt: 