In [1]:
import pandas as pd

from arrow import now
from charset_normalizer import detect
from glob import glob

from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning)

# CSV lookup table; its 'name' and 'alpha-3' columns are read to attach an ISO alpha-3 code to each area.
CODES = '/kaggle/input/country-mapping-iso-continent-region/continents2.csv'
# Glob pattern for the crop production CSV files; every matching file is loaded and concatenated.
CROPS = '/kaggle/input/crop-production/P*.csv'

def get_file(filename: str):
    """Load a CSV file of unknown text encoding into a DataFrame.

    The file is read from disk exactly once: the raw bytes are passed to
    ``detect`` to guess the encoding, and the same in-memory bytes are then
    parsed by pandas, instead of re-opening the file a second time.

    Parameters
    ----------
    filename : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    from io import BytesIO

    with open(file=filename, mode='rb') as input_fp:
        raw_bytes = input_fp.read()
    # If detection yields None, pandas falls back to its default encoding.
    encoding = detect(raw_bytes)['encoding']
    return pd.read_csv(filepath_or_buffer=BytesIO(raw_bytes), encoding=encoding)

time_start = now()
# Read every crop CSV matched by the CROPS pattern.
crop_frames = [get_file(filename=input_file) for input_file in glob(CROPS)]
# Only the country name (the join key) and its alpha-3 code are needed from the lookup table.
codes_df = pd.read_csv(filepath_or_buffer=CODES, usecols=['alpha-3', 'name'])
# Default (inner) merge: rows whose 'Area' has no exact 'name' match are dropped.
df = pd.concat(objs=crop_frames).merge(right=codes_df, left_on='Area', right_on='name')
# Zero-fill gaps in the yearly value columns ('Y<year>'); the 'Y<year>F' flag columns are left alone.
value_columns = [column for column in df.columns if column.startswith('Y') and not column.endswith('F')]
df[value_columns] = df[value_columns].fillna(value=0)
print('data read complete in {}'.format(now() - time_start))

df.head()

data read complete in 0:00:49.169737


Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Unit,Y1961,Y1961F,Y1962,...,Y2016,Y2016F,Y2017,Y2017F,Y2018,Y2018F,Y2019,Y2019F,name,alpha-3
0,10,Australia,221,"Almonds, with shell",5312,Area harvested,ha,0.0,M,0.0,...,28553.0,,29282.0,,31361.0,,37903.0,,Australia,AUS
1,10,Australia,221,"Almonds, with shell",5419,Yield,hg/ha,0.0,,0.0,...,42050.0,Fc,40909.0,Fc,39769.0,Fc,38628.0,Fc,Australia,AUS
2,10,Australia,221,"Almonds, with shell",5510,Production,tonnes,0.0,M,0.0,...,120065.0,Im,119790.0,Im,124718.0,Im,146410.0,Im,Australia,AUS
3,10,Australia,711,"Anise, badian, fennel, coriander",5312,Area harvested,ha,0.0,M,0.0,...,1083.0,Im,1108.0,Im,1145.0,Im,1182.0,Im,Australia,AUS
4,10,Australia,711,"Anise, badian, fennel, coriander",5419,Yield,hg/ha,0.0,,0.0,...,11625.0,Fc,11661.0,Fc,11712.0,Fc,11751.0,Fc,Australia,AUS


In [2]:
df.info() # the merge on area name kept only 178 of 210 areas: 210 - 178 = 32 areas had no matching name in the ISO-3 table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31434 entries, 0 to 31433
Columns: 127 entries, Area Code to alpha-3
dtypes: float64(59), int64(3), object(65)
memory usage: 30.5+ MB


In [3]:
# Columns starting with 'Y' come in value/flag pairs (e.g. Y1961 and Y1961F), hence the halving.
area_count = df['Area'].nunique()
crop_count = df['Item Code'].nunique()
year_count = sum(1 for column in df.columns if column.startswith('Y')) // 2
print('We have data for {} areas, {} crops, and {} years.'.format(area_count, crop_count, year_count))

We have data for 178 areas, 174 crops, and 59 years.


What do we do when we want to visualize a single series of named data and we have too much to make a bar plot? 

We can introduce a random variable to add a dimension and make a scatter plot, or we can introduce two random variables and lay out the data at random points in our 2-d space. Unfortunately, in this case our item codes bunch up, so we need to use two random variables. Below we've chosen to draw the item names as text on the plot rather than relying on hover text alone; the result is middling, but it gives us a way to visually browse 174 items and their codes.

In [4]:
from plotly.express import scatter
from random import seed
from random import sample

seed(2024)  # fixed seed so the random layout is the same on every run
# One row per distinct (Item, Item Code) pair, ordered by code.
scatter_df = df[['Item', 'Item Code']].drop_duplicates(ignore_index=True)
scatter_df = scatter_df.sort_values(by='Item Code').reset_index(drop=True)
# Non-repeating random integer coordinates; x is drawn before y, which matters for reproducibility.
point_count = len(scatter_df)
for axis in ('x', 'y'):
    scatter_df[axis] = sample(population=range(1 + point_count), k=point_count)

item_figure = scatter(data_frame=scatter_df, x='x', y='y', hover_name='Item', log_y=False, color='Item Code', text='Item', height=900)
# update_traces returns the figure, so it is still the cell's displayed value.
item_figure.update_traces(marker={'size': 3})

We have an enormous amount of data; we aren't going to be able to visualize it all. Let's pick an item for which we have a lot of data and a recent year and plot the yields for that year for those crops on a world map.

In [5]:
from plotly.express import histogram
# Number of rows per item across the whole merged frame.
item_histogram = histogram(data_frame=df, x='Item')
item_histogram

We have a lot of variability from item to item; let's pick some very popular items.

In [6]:
# value_counts() is already sorted by descending count, so the first ten entries are the ten most frequent items.
item_counts = df['Item'].value_counts()
item_counts.iloc[:10]

Item
Fruit Primary                524
Roots and Tubers, Total      523
Vegetables Primary           522
Vegetables, fresh nes        513
Oilcrops                     510
Oilcrops, Oil Equivalent     510
Oilcrops, Cake Equivalent    503
Cereals, Total               485
Pulses, Total                458
Maize                        454
Name: count, dtype: int64

In [7]:
from plotly.express import choropleth
# One world map of the 2019 yield for each of the five items with the most rows.
top_items = df['Item'].value_counts().index[:5].tolist()
is_yield = df['Element'] == 'Yield'
for item in top_items:
    item_yield_df = df.loc[is_yield & (df['Item'] == item), ['Y2019', 'alpha-3']]
    choropleth(data_frame=item_yield_df, locations='alpha-3', color='Y2019', title=item).show()

We can also with some work choose an area and an item and visualize its yield across the time series.

In [8]:
from plotly.express import line

# NOTE: despite its name, Y holds the columns that do NOT start with 'Y' (the non-year metadata columns).
Y = [column for column in df.columns if not column.startswith('Y')]
# F holds the columns ending in 'F' (the per-year flag columns such as 'Y1961F').
# Dropping F + Y from df therefore leaves only the yearly value columns.
F = [column for column in df.columns if column.endswith('F')]

def get_annual_series(item: str, area: str, element: str, data: "pd.DataFrame | None" = None):
    """Return the yearly values of one (item, area, element) record in long form.

    Parameters
    ----------
    item : str
        Value to match in the 'Item' column.
    area : str
        Value to match in the 'Area' column.
    element : str
        Value to match in the 'Element' column; also used as the name of the
        value column in the result.
    data : pandas.DataFrame, optional
        Wide frame to read from. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per year with columns ``['year', element, 'Area']``. When no
        row matches, the ``element`` column is filled with zeros.

    Raises
    ------
    ValueError
        If more than one row matches, since a single series would be ambiguous.
    """
    source_df = df if data is None else data
    # Yearly value columns look like 'Y1961'; their 'Y1961F' companions are flags and are skipped.
    year_columns = [column for column in source_df.columns
                    if column.startswith('Y') and not column.endswith('F')]
    row_mask = (source_df['Item'] == item) & (source_df['Area'] == area) & (source_df['Element'] == element)
    matches = source_df.loc[row_mask, year_columns]
    if len(matches) > 1:
        raise ValueError('expected at most one row for item={!r}, area={!r}, element={!r}; found {}'.format(
            item, area, element, len(matches)))
    # Strip the leading 'Y' to get the numeric year.
    years = [int(column[1:]) for column in year_columns]
    # Patch up cases where the element is missing for this item/area.
    values = [0] * len(years) if matches.empty else matches.iloc[0].tolist()
    return pd.DataFrame({'year': years, element: values, 'Area': [area] * len(years)})

# Build one yield series per area that reports Vanilla, then stack them for a multi-line plot.
vanilla_areas = df.loc[df['Item'] == 'Vanilla', 'Area'].unique().tolist()
vanilla_series = [get_annual_series(item='Vanilla', area=area, element='Yield') for area in vanilla_areas]
vanilla_df = pd.concat(vanilla_series)
line(data_frame=vanilla_df, x='year', y='Yield', color='Area', height=800)


Finally, which items are produced in the most countries?

In [9]:
from plotly.express import histogram
# Count the DISTINCT areas reporting each item. Counting rows would inflate the
# figure, because an area can have several rows per item (one per element such as
# Area harvested, Yield, Production).
item_area_counts = (
    df.groupby(by='Item')['Area']
    .nunique()
    .sort_values(ascending=False)
    .head(n=50)
    .reset_index(name='areas')
)
# With both x and y given, histogram sums y per x; each item appears once, so bar height == area count.
histogram(data_frame=item_area_counts, x='Item', y='areas')