# Py : Tidy data analysis - Global Crop Yields

**Introduction** :

Our data on agricultural yields across crop types and by country are much more extensive from 1960 onwards. The UN Food and Agriculture Organization (FAO) publishes yield estimates across a range of crop commodities by country over this period. The FAO reports yield values as the national average for any given year; this is calculated by dividing total crop output (in kilograms or tonnes) by the area of land used to grow a given crop (in hectares). There are likely to be regional and seasonal differences in yield within a given country; however, reported average yields still provide a useful indication of changes in productivity over time and across geographical regions.

In [1]:
# Importing libraries
import datatable as dt
import pandas as pd
import altair as alt
from datatable import f,by,count,update,sort,join
import re
from itertools import repeat
from itertools import chain

In [2]:
# Load datatable's notebook styles so frame columns keep their colors, and
# limit displayed frames to the first 4 and last 4 rows.
dt.init_styles()
dt.options.display.head_nrows = 4
dt.options.display.tail_nrows = 4

In [3]:
# Read the TidyTuesday (2020-09-01) key crop yields CSV directly from GitHub.
crop_yields_global_dt = dt.fread(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-01/key_crop_yields.csv'
)

In [4]:
# Keep the original column names so they can be cleaned up below.
crop_fields = crop_yields_global_dt.names

In [5]:
# Drop the parenthesised suffix (whitespace followed by "(...)" made of word
# characters and spaces) from each column name and lower-case the result,
# e.g. 'Wheat (tonnes per hectare)' -> 'wheat'.
# A raw string is used so the regex escapes (\s, \(, \w) are not treated as
# invalid string escape sequences by Python.
crop_fields_clean = [re.sub(r'\s+\([\w\s]+\)', '', col).lower() for col in crop_fields]

In [6]:
# Replace every remaining whitespace character with an underscore so the
# names are valid identifiers (e.g. 'cocoa beans' -> 'cocoa_beans').
# A raw string keeps \s from being read as an invalid string escape.
crop_fields_clean_1 = [re.sub(r'\s', '_', col) for col in crop_fields_clean]

In [7]:
# Apply the cleaned names to the frame's columns.
crop_yields_global_dt.names = crop_fields_clean_1

In [8]:
# Display the frame with its cleaned column names.
crop_yields_global_dt

Unnamed: 0_level_0,entity,code,year,wheat,rice,maize,soybeans,potatoes,beans,peas,cassava,barley,cocoa_beans,bananas
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,Afghanistan,AFG,1961,1.022,1.519,1.4,,8.6667,,,,1.08,,
1,Afghanistan,AFG,1962,0.9735,1.519,1.4,,7.6667,,,,1.08,,
2,Afghanistan,AFG,1963,0.8317,1.519,1.426,,8.1333,,,,1.08,,
3,Afghanistan,AFG,1964,0.951,1.7273,1.4257,,8.6,,,,1.0857,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
13071,Zimbabwe,ZWE,2015,2.0486,2.2689,0.5803,0.9459,16.936,0.4145,,4.6818,5.462,,7.1862
13072,Zimbabwe,ZWE,2016,1.9013,2.2671,0.4405,1.1958,17.0007,0.3574,,4.7117,5.4727,,7.4281
13073,Zimbabwe,ZWE,2017,1.7542,2.2656,0.5589,1.5139,17.0545,0.5316,,4.742,5.4811,,7.6618
13074,Zimbabwe,ZWE,2018,2.001,2.2641,0.6131,1.5,17.1083,0.4773,,4.7705,5.4894,,7.8955


In [9]:
# For each entity, sum every column from position 3 onwards (the crop
# columns) over all of that entity's rows.
crop_yields_global_dt_v1 = crop_yields_global_dt[
    :,
    dt.sum(f[3:]),
    by(f.entity)
]

In [10]:
# Display the per-entity totals.
crop_yields_global_dt_v1

Unnamed: 0_level_0,entity,wheat,rice,maize,soybeans,potatoes,beans,peas,cassava,barley,cocoa_beans,bananas
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,Afghanistan,75.5326,130.375,99.0582,0,798.419,0,0,0,68.6434,0,0
1,Africa,93.7614,120.211,89.3943,48.2091,612.213,40.3014,41.4261,446.23,57.3945,22.3873,434.42
2,Albania,155.867,102.656,213.741,58.053,693.955,63.3469,0,0,114.002,0,0
3,Algeria,54.7813,131.243,116.327,0,804.96,33.9203,29.9066,0,51.2693,0,419.036
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
245,Yemen,79.2362,0,91.706,0,701.702,63.8295,47.5026,0,55.6646,0,400.344
246,Yugoslavia,92.4004,131.491,112.283,52.3622,268.784,26.4929,34.7345,0,65.3341,0,0
247,Zambia,257.897,53.477,97.0596,61.2669,634.596,0,0,349.942,34.3003,0,193.076
248,Zimbabwe,230.081,100.996,69.2479,93.7791,883.574,33.2357,0,220.337,280.046,0,303.048


In [11]:
# List the crop column names (everything after entity, code and year).
crop_fields_clean_1[3:]

['wheat',
 'rice',
 'maize',
 'soybeans',
 'potatoes',
 'beans',
 'peas',
 'cassava',
 'barley',
 'cocoa_beans',
 'bananas']

In [12]:
def pydt_reshape_wide_to_long(DT, *measure_vars, var_name=None, val_name=None):
    """Reshape a datatable Frame from wide to long format.

    Each column named in ``measure_vars`` is stacked into two columns: one
    holding the original column name and one holding its values. The columns
    are stacked one after another in the order given. Every other column of
    ``DT`` is treated as an identifier and is repeated once per measure
    column so that each stacked row keeps its identifiers.

    Args:
        DT: Source frame in wide format.
        *measure_vars: Names of the columns to stack.
        var_name: Optional name for the column of stacked column names;
            it is called ``'variable'`` when omitted.
        val_name: Optional name for the column of stacked values; it is
            called ``'value'`` when omitted.

    Returns:
        A new frame made of the identifier columns followed by the name and
        value columns.
    """
    measure_cols = list(measure_vars)

    # Stack the measure columns end to end: the column name goes in
    # 'variable' (once per row) and the cell contents go in 'value'.
    stacked = {'variable': [], 'value': []}
    for col_name, col_values in DT[:, measure_cols].to_dict().items():
        stacked['variable'].extend(repeat(col_name, len(col_values)))
        stacked['value'].extend(col_values)
    wide_to_long_dt = dt.Frame(stacked)

    # Everything that is not a measure is an identifier column; repeat each
    # one once per measure so it lines up row-for-row with the stacked data.
    id_cols = DT[:, f[:].remove([f[col] for col in measure_cols])].to_dict()
    non_measures_dt = dt.Frame(
        {name: values * len(measure_cols) for name, values in id_cols.items()}
    )

    # Rename each output column independently, so supplying only one of
    # var_name / val_name is honoured instead of being silently ignored.
    renames = {}
    if var_name is not None:
        renames['variable'] = var_name
    if val_name is not None:
        renames['value'] = val_name
    if renames:
        wide_to_long_dt.names = renames

    return dt.cbind(non_measures_dt, wide_to_long_dt)

In [13]:
# Stack the crop columns of the per-entity totals into 'crop' and
# 'crop_yield' columns.
crop_yields_tidy = pydt_reshape_wide_to_long(
    crop_yields_global_dt_v1,
    'wheat',
    'rice',
    'maize',
    'soybeans',
    'potatoes',
    'beans',
    'peas',
    'cassava',
    'barley',
    'cocoa_beans',
    'bananas',
    var_name='crop',
    val_name='crop_yield',
)

In [14]:
# Sum crop_yield for each crop over all rows of the tidy frame.
crop_yields_tidy[:, dt.sum(f.crop_yield), by(f.crop)]

Unnamed: 0_level_0,crop,crop_yield
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪
0,bananas,135429.0
1,barley,15028.4
2,beans,8699.05
3,cassava,67169.6
4,cocoa_beans,1818.91
5,maize,32585.9
6,peas,9225.41
7,potatoes,154224.0
8,rice,26783.3
9,soybeans,8634.52
