In [1]:
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from IPython.display import display, Markdown

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [61]:
pest_df = pd.read_csv('./pesticides.csv')
rain_df = pd.read_csv('./rainfall.csv')
temp_df = pd.read_csv('./temp.csv')
yield_df = pd.read_csv('./yield.csv')

In [62]:
print(pest_df.columns)
print(rain_df.columns)
print(temp_df.columns)
print(yield_df.columns)

Index(['Domain', 'Area', 'Element', 'Item', 'Year', 'Unit', 'Value'], dtype='object')
Index([' Area', 'Year', 'average_rain_fall_mm_per_year'], dtype='object')
Index(['year', 'country', 'avg_temp'], dtype='object')
Index(['Domain Code', 'Domain', 'Area Code', 'Area', 'Element Code', 'Element',
       'Item Code', 'Item', 'Year Code', 'Year', 'Unit', 'Value'],
      dtype='object')


In [63]:
pest_df.nunique()

Domain        1
Area        168
Element       1
Item          1
Year         27
Unit          1
Value      2825
dtype: int64

In [64]:
def print_info(df, name):
    print(f"Information for {name}:")
    print('_____________________________')
    print(df.info())
    print('\n')

print_info(temp_df, 'temp_df')
print_info(rain_df, 'rain_df')
print_info(pest_df, 'pest_df')
print_info(yield_df, 'yield_df')

Information for temp_df:
_____________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71311 entries, 0 to 71310
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   year      71311 non-null  int64  
 1   country   71311 non-null  object 
 2   avg_temp  68764 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB
None


Information for rain_df:
_____________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6727 entries, 0 to 6726
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0    Area                          6727 non-null   object
 1   Year                           6727 non-null   int64 
 2   average_rain_fall_mm_per_year  5953 non-null   object
dtypes: int64(1), object(2)
memory usage: 157.8+ KB
None


Information for pest_df:
_____________________________
<cl

In [31]:
temp_df.rename(columns = {'year':'Year','country':'Country','avg_temp':'Average_Temperature'},inplace = True)
rain_df.rename(columns = {' Area':'Country','average_rain_fall_mm_per_year':'Average_Rainfall'},inplace = True)
yield_df.rename(columns = {'Area':'Country','Value':'Yield'},inplace = True)
pest_df.rename(columns = {'Area':'Country','Value':'Pesticides'},inplace = True)

In [32]:
column_mask_pest = ['Country', 'Year', 'Pesticides']
column_mask_yield = ['Country', 'Item', 'Year', 'Yield']

In [33]:
pest_df = pest_df[column_mask_pest]
yield_df = yield_df[column_mask_yield]

In [34]:
yield_df = yield_df.groupby(['Country','Year'])['Yield'].sum().reset_index()

In [35]:
yield_df

Unnamed: 0,Country,Year,Yield
0,Afghanistan,1961,126077
1,Afghanistan,1962,115592
2,Afghanistan,1963,119100
3,Afghanistan,1964,127040
4,Afghanistan,1965,129396
...,...,...,...
10567,Zimbabwe,2012,313366
10568,Zimbabwe,2013,303062
10569,Zimbabwe,2014,305570
10570,Zimbabwe,2015,297742


In [25]:
print(f"Year range in pesticides.csv {pest_df['Year'].max()} - {pest_df['Year'].min()}")
pest_df.describe().round(0).astype(int)

Year range in pesticides.csv 2016 - 1990


Unnamed: 0,Year,Pesticides
count,4349,4349
mean,2003,20303
std,8,117736
min,1990,0
25%,1996,93
50%,2003,1138
75%,2010,7869
max,2016,1807000


In [24]:
print(f"Year range in yield.csv {yield_df['Year'].max()} - {yield_df['Year'].min()}")
yield_df.describe().round(0).astype(int)

Year range in yield.csv 2016 - 1961


Unnamed: 0,Year,Yield
count,10572,10572
mean,1989,333127
std,16,189960
min,1961,4017
25%,1975,197757
50%,1990,296784
75%,2003,443306
max,2016,1271477


In [86]:
unique_counts_df = pd.concat([
    temp_df.nunique().rename('temp_df'),
    rain_df.nunique().rename('rain_df'),
    pest_df.nunique().rename('pest_df'),
    yield_df.nunique().rename('yield_df')
], axis=1)
unique_counts_df = unique_counts_df.astype(str).replace('nan', '')
print(unique_counts_df)

                    temp_df rain_df pest_df yield_df
Year                  271.0    31.0    27.0     56.0
Country               137.0   217.0   168.0    212.0
Average_Temperature  3303.0                         
Average_Rainfall              173.0                 
Pesticides                           2825.0         
Yield                                        10238.0


In [87]:
print("Missing values in temp_df:", temp_df.isna().sum().sum())
print("Missing values in rain_df:", rain_df.isna().sum().sum())
print("Missing values in pest_df:", pest_df.isna().sum().sum())
print("Missing values in yield_df:", yield_df.isna().sum().sum())

Missing values in temp_df: 2547
Missing values in rain_df: 774
Missing values in pest_df: 0
Missing values in yield_df: 0


In [88]:
def print_missing_values(df, name):
    print(f"Missing values in {name}:")
    print('_____________________________________')
    print(df.isna().sum())
    print('\n')

print_missing_values(temp_df, 'temp_df')
print_missing_values(rain_df, 'rain_df')

Missing values in temp_df:
_____________________________________
Year                      0
Country                   0
Average_Temperature    2547
dtype: int64


Missing values in rain_df:
_____________________________________
Country               0
Year                  0
Average_Rainfall    774
dtype: int64




In [43]:
combined_df = yield_df.merge(temp_df, on=['Country', 'Year'], how='left')
combined_df = combined_df.merge(rain_df, on=['Country', 'Year'], how='left')
combined_df = combined_df.merge(pest_df, on=['Country', 'Year'], how='left')

In [44]:
combined_df

Unnamed: 0,Country,Item,Year,Yield,Average_Temperature,Average_Rainfall,Pesticides
0,Afghanistan,Maize,1961,14000,14.23,,
1,Afghanistan,Maize,1962,14000,14.10,,
2,Afghanistan,Maize,1963,14260,15.01,,
3,Afghanistan,Maize,1964,14257,13.73,,
4,Afghanistan,Maize,1965,14400,13.90,,
...,...,...,...,...,...,...,...
109361,Zimbabwe,Wheat,2012,24420,20.52,657,3375.53
109362,Zimbabwe,Wheat,2013,22888,19.76,657,2550.07
109363,Zimbabwe,Wheat,2014,21357,,657,2185.07
109364,Zimbabwe,Wheat,2015,19826,,657,2185.07


In [60]:
combined_df.attrs = {'Average_Temperature':'Celcius','Average_Rainfall':'Average rainfall mm per year','Yield':'Tonnes of active ingredients','Item':'Crop type'}

In [54]:
combined_df[combined_df['Country'] == 'Afghanistan'].nunique()

Country                  1
Item                     4
Year                    56
Yield                  197
Average_Temperature     49
Average_Rainfall         1
Pesticides               0
dtype: int64

In [55]:
combined_df[combined_df['Country'] == 'Afghanistan']

Unnamed: 0,Country,Item,Year,Yield,Average_Temperature,Average_Rainfall,Pesticides
0,Afghanistan,Maize,1961,14000,14.23,,
1,Afghanistan,Maize,1962,14000,14.10,,
2,Afghanistan,Maize,1963,14260,15.01,,
3,Afghanistan,Maize,1964,14257,13.73,,
4,Afghanistan,Maize,1965,14400,13.90,,
...,...,...,...,...,...,...,...
219,Afghanistan,Wheat,2012,20104,14.51,327,
220,Afghanistan,Wheat,2013,20248,16.21,327,
221,Afghanistan,Wheat,2014,20237,,327,
222,Afghanistan,Wheat,2015,21959,,327,


In [56]:
combined_df

Unnamed: 0,Country,Item,Year,Yield,Average_Temperature,Average_Rainfall,Pesticides
0,Afghanistan,Maize,1961,14000,14.23,,
1,Afghanistan,Maize,1962,14000,14.10,,
2,Afghanistan,Maize,1963,14260,15.01,,
3,Afghanistan,Maize,1964,14257,13.73,,
4,Afghanistan,Maize,1965,14400,13.90,,
...,...,...,...,...,...,...,...
109361,Zimbabwe,Wheat,2012,24420,20.52,657,3375.53
109362,Zimbabwe,Wheat,2013,22888,19.76,657,2550.07
109363,Zimbabwe,Wheat,2014,21357,,657,2185.07
109364,Zimbabwe,Wheat,2015,19826,,657,2185.07


In [62]:
combined_df.attrs

{'Average_Temperature': 'Celcius',
 'Average_Rainfall': 'Average rainfall mm per year',
 'Yield': 'Tonnes of active ingredients',
 'Item': 'Crop type'}

In [75]:
combined_df.describe().round(0).astype(int).map("{:,}".format)

Unnamed: 0,Year,Yield,Average_Temperature,Pesticides
count,109366,109366,84859,39918
mean,1988,65055,20,38415
std,16,69141,7,118352
min,1961,0,-4,0
25%,1975,16270,15,816
50%,1989,37016,21,8306
75%,2002,94200,26,39406
max,2016,1000000,31,1807000


In [76]:
combined_df.to_csv('combined.csv',index=False)