In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading datasets

In [2]:
df_trade_matrix = pd.read_csv('data/Trade_DetailedTradeMatrix_E_All_Data.csv', encoding = 'ISO-8859-1')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df_crop_production = pd.read_csv('data/Production_Crops_E_All_Data.csv', encoding = 'ISO-8859-1')

In [4]:
df_crop_production.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Unit,Y1961,Y1961F,Y1962,...,Y2013,Y2013F,Y2014,Y2014F,Y2015,Y2015F,Y2016,Y2016F,Y2017,Y2017F
0,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,ha,,,,...,14114.0,,13703.0,,14676.0,,19481.0,,19793.0,
1,2,Afghanistan,221,"Almonds, with shell",5419,Yield,hg/ha,,,,...,29910.0,Fc,19996.0,Fc,16521.0,Fc,16859.0,Fc,13788.0,Fc
2,2,Afghanistan,221,"Almonds, with shell",5510,Production,tonnes,,,,...,42215.0,,27400.0,,24246.0,,32843.0,,27291.0,
3,2,Afghanistan,711,"Anise, badian, fennel, coriander",5312,Area harvested,ha,,M,,...,18500.0,F,30000.0,F,25000.0,F,26019.0,Im,28873.0,Im
4,2,Afghanistan,711,"Anise, badian, fennel, coriander",5419,Yield,hg/ha,,,,...,6757.0,Fc,7167.0,Fc,7200.0,Fc,6923.0,Fc,6830.0,Fc


In [5]:
df_production_ch = df_crop_production[df_crop_production['Area'] == 'Switzerland']
df_trade_ch = df_trade_matrix[df_trade_matrix['Reporter Countries'] == 'Switzerland']

In [6]:
df_production_ch.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Unit,Y1961,Y1961F,Y1962,...,Y2013,Y2013F,Y2014,Y2014F,Y2015,Y2015F,Y2016,Y2016F,Y2017,Y2017F
31434,211,Switzerland,515,Apples,5312,Area harvested,ha,2700.0,F,4800.0,...,3993.0,,3894.0,,3863.0,,3854.0,,3806.0,
31435,211,Switzerland,515,Apples,5419,Yield,hg/ha,999259.0,Fc,1000833.0,...,526243.0,Fc,594101.0,Fc,554621.0,Fc,589777.0,Fc,592805.0,Fc
31436,211,Switzerland,515,Apples,5510,Production,tonnes,269800.0,,480400.0,...,210129.0,,231343.0,,214250.0,,227300.0,,225622.0,Im
31437,211,Switzerland,526,Apricots,5312,Area harvested,ha,,M,,...,702.0,,708.0,,709.0,,721.0,,736.0,
31438,211,Switzerland,526,Apricots,5419,Yield,hg/ha,,,,...,119587.0,Fc,150014.0,Fc,117969.0,Fc,128336.0,Fc,127993.0,Fc


In [7]:
df_trade_ch.head()

Unnamed: 0,Reporter Country Code,Reporter Countries,Partner Country Code,Partner Countries,Item Code,Item,Element Code,Element,Unit,Y1986,...,Y2013,Y2013F,Y2014,Y2014F,Y2015,Y2015F,Y2016,Y2016F,Y2017,Y2017F
4470118,211,Switzerland,2,Afghanistan,231,Almonds shelled,5610,Import Quantity,tonnes,,...,,,,,,,0.0,,,
4470119,211,Switzerland,2,Afghanistan,231,Almonds shelled,5622,Import Value,1000 US$,,...,,,,,,,0.0,,,
4470120,211,Switzerland,2,Afghanistan,1169,"Animals, live, non-food",5622,Import Value,1000 US$,,...,,,,,,,,,,
4470121,211,Switzerland,2,Afghanistan,527,"Apricots, dry",5610,Import Quantity,tonnes,,...,,,,,,,,,,
4470122,211,Switzerland,2,Afghanistan,527,"Apricots, dry",5622,Import Value,1000 US$,,...,,,,,,,,,,


# EDA

Every quantity in the table is followed by a flag column. A flag indicates how the quantity was obtained:
- **F**: FAO estimate
- **Fc**: Calculated data
- **M**: Data not available
- **NaN**: Official data
- *: Non-official data
- **A**: Aggregate, may include official, semi-official, estimated or calculated data
- **B**: Balance
- **Im**: FAO data based on imputation methodology


The takeaway is that the data can be very inaccurate at times, so not all of the following computations will be a really portrait of the reality.
Another thing to take into account is when the flag is **M** we can just replace the non-available data with 0s.

### Align data start year

In [8]:
drop_columns = ['Y' + str(year) for year in range(1961, 1986)]
drop_columns += ['Y' + str(year) + 'F' for year in range(1961, 1986)]

In [9]:
df_production_ch = df_production_ch.drop(columns=drop_columns)

### Fill NaN corresponding to a **M** flag

In [10]:
def fill_with_zero(df, year_start, year_end):
    for year in range(year_start, year_end+1):
        column = 'Y' + str(year)
        column_f = column + 'F'
        condition = df[column_f] == 'M'
        df.loc[condition, column] = df.loc[condition, column].fillna(0)

In [11]:
year_start = 1986
year_end = 2017
fill_with_zero(df_production_ch, year_start, year_end)
fill_with_zero(df_trade_ch, year_start, year_end)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [12]:
df_production_ch[df_production_ch['Y1986F']=='M'].head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Unit,Y1986,Y1986F,Y1987,...,Y2013,Y2013F,Y2014,Y2014F,Y2015,Y2015F,Y2016,Y2016F,Y2017,Y2017F
31440,211,Switzerland,366,Artichokes,5312,Area harvested,ha,0.0,M,0.0,...,4.0,,4.0,,3.0,,4.0,,4.0,
31442,211,Switzerland,366,Artichokes,5510,Production,tonnes,0.0,M,0.0,...,5.0,,5.0,,6.0,,3.0,,0.0,
31443,211,Switzerland,367,Asparagus,5312,Area harvested,ha,0.0,M,0.0,...,332.0,,371.0,,380.0,,413.0,,397.0,
31445,211,Switzerland,367,Asparagus,5510,Production,tonnes,0.0,M,0.0,...,613.0,,805.0,,679.0,,611.0,,722.0,
31455,211,Switzerland,552,Blueberries,5312,Area harvested,ha,0.0,M,0.0,...,73.0,,73.0,,76.0,,83.0,,93.0,


# Local Consumption and Exported Quantities

In [29]:
def local_consumption_weight(df_production, df_trade, year, item, local_pop_size):
    df_prod_elem = df_production[(df_production['Item'] == item) & (df_production['Unit'] == 'tonnes')]
    df_trade_elem = df_trade[(df_trade['Item'] == item) & (df_trade['Unit'] == 'tonnes')]

    total_produced = df_prod_elem['Y'+str(year)].iloc[0]
    total_exported = df_trade_elem['Y'+str(year)].sum(skipna=True)
    local_consumption = total_produced - total_exported
    per_person = local_consumption*1000/local_pop_size
    print(f"{item} for year {year}: \n\t- {total_produced} tonnes produced \n\t- {total_exported} tonnes exported \n\t- {local_consumption} tonnes available for local consumption, so {per_person} kg per person")



In [30]:
local_pop_size = 7000000
item = 'Tomatoes'
year = 2017

local_consumption_weight(df_production_ch, df_trade_ch, year, item, local_pop_size)

Tomatoes for year 2017: 
	- 42533.0 tonnes produced 
	- 37262.0 tonnes exported 
	- 5271.0 tonnes available for local consumption, so 0.753 kg per person
