In [7]:
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import Normalize
from IPython.display import display, Markdown

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [8]:
# reading the dataset
def read_dataset(file_path):
    try:
        pest_df = pd.read_csv(file_path)
        return pest_df
    except Exception as e:
        return f'An error occured: {e}'
    
# reading the dataset
merged_df = read_dataset('merged_df.csv')
merged_df.head()

Unnamed: 0,Country,Year,Yield,Pesticides,Temperature,Rainfall
0,Albania,1990,4368200.0,121.0,16.37,1485.0
1,Albania,1990,4368200.0,0.17,16.37,1485.0
2,Albania,1990,4368200.0,0.04,16.37,1485.0
3,Albania,1990,4368200.0,0.12,16.37,1485.0
4,Albania,1990,4368200.0,70.0,16.37,1485.0


In [9]:

unique_values = merged_df.groupby(['Country', 'Year']).nunique()
unique_values

Unnamed: 0_level_0,Unnamed: 1_level_0,Yield,Pesticides,Temperature,Rainfall
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Albania,1990,1,9,1,1
Albania,1991,1,9,1,1
Albania,1992,1,9,1,1
Albania,1993,1,23,1,1
Albania,1994,1,25,1,1
...,...,...,...,...,...
Zimbabwe,2009,1,13,1,1
Zimbabwe,2010,1,13,1,1
Zimbabwe,2011,1,13,1,1
Zimbabwe,2012,1,33,1,1


In [10]:
aggregated_data = merged_df.groupby(['Country', 'Year']).agg({
    'Yield': 'first',
    'Pesticides': 'mean',
    'Temperature': 'first',
    'Rainfall': 'first'
}).reset_index()

aggregated_data.head()

Unnamed: 0,Country,Year,Yield,Pesticides,Temperature,Rainfall
0,Albania,1990,4368200.0,20.194167,16.37,1485.0
1,Albania,1991,5665042.0,20.195833,15.36,1485.0
2,Albania,1992,5960509.0,20.194167,16.06,1485.0
3,Albania,1993,6008588.0,11.35375,16.05,1485.0
4,Albania,1994,6585554.0,18.86,16.96,1485.0


In [11]:
start_end_values = aggregated_data.groupby('Country').agg({
    'Year': ['first', 'last'],
    'Yield': ['first', 'last'],
    'Pesticides': ['first', 'last']
})

start_end_values['Yield_diff'] = start_end_values['Yield']['last'] - start_end_values['Yield']['first']
start_end_values['Pesticides_diff'] = start_end_values['Pesticides']['last'] - start_end_values['Pesticides']['first']

countries_positive_trend = start_end_values[(start_end_values['Yield_diff'] > 0) & 
                                            (start_end_values['Pesticides_diff'] <= 0)].reset_index()

countries_with_positive_trend = countries_positive_trend['Country'].tolist()

countries_with_positive_trend

['Angola',
 'Austria',
 'Bahamas',
 'Bahrain',
 'Belgium',
 'Bulgaria',
 'Croatia',
 'Denmark',
 'France',
 'Greece',
 'Guatemala',
 'India',
 'Ireland',
 'Italy',
 'Japan',
 'Kazakhstan',
 'Libya',
 'Mali',
 'Mongolia',
 'Niger',
 'Norway',
 'Portugal',
 'Romania',
 'Sierra Leone',
 'Spain',
 'Sri Lanka',
 'Switzerland',
 'Tajikistan',
 'Thailand',
 'Ukraine',
 'Zimbabwe']

In [12]:
positive_trend_data = aggregated_data[aggregated_data['Country'].isin(countries_with_positive_trend)]

In [13]:
len(countries_with_positive_trend)

31

In [14]:
subset_countries = ['Angola', 'Bahamas', 'Bahrain', 'Belgium', 'Bulgaria']
# subset_data = aggregated_data[aggregated_data['Country'].isin(subset_countries)]
subset_data = aggregated_data[aggregated_data['Country'].isin(countries_with_positive_trend)]

temperature_chart = alt.Chart(subset_data).mark_line().encode(
    x='Year:O',
    y='Temperature:Q',
    color='Country:N',
    tooltip=['Country', 'Year', 'Temperature']
).properties(
    title='Temperature Trends Over the Years',
    width=800,
    height=400
).interactive()

# rainfall_chart = alt.Chart(subset_data).mark_line().encode(
#     x='Year:O',
#     y='Rainfall:Q',
#     color='Country:N',
#     tooltip=['Country', 'Year', 'Rainfall']
# ).properties(
#     title='Rainfall Trends Over the Years',
#     width=600,
#     height=300
# ).interactive()

temperature_chart

In [15]:

subset_countries = ['Denmark', 'Egypt', 'Finland', 'France', 'Honduras']
subset_data = aggregated_data[aggregated_data['Country'].isin(subset_countries)]

temp_chart = alt.Chart(subset_data).mark_line().encode(
    x='Year:O',
    y='Temperature:Q',
    color='Country:N',
    tooltip=['Country', 'Year', 'Temperature']
).properties(
    width=800,
    height=200,
    title='Temperature Trends Over the Years'
).facet(
    row='Country:N'
)

temp_chart


In [16]:
subset_data = aggregated_data[aggregated_data['Country'].isin(countries_with_positive_trend)]

temp_base_chart = alt.Chart().mark_line().encode(
    x='Year:O',
    y='Temperature:Q',
    color='Country:N',
    tooltip=['Country', 'Year', 'Temperature']
).properties(
    width=100,
    height=100,
)

temp_base_chart

temp_trend = temp_base_chart.facet(
    row='Country:N',
    data=subset_data
).resolve_scale(
    y='independent'
)

temp_trend

In [17]:
def compute_correlation(group):
    return group['Yield'].corr(group['Temperature'])

temp_yield_corr = (
    aggregated_data[aggregated_data['Country'].isin(countries_with_positive_trend)]
    .groupby('Country')
    .apply(compute_correlation)
    .reset_index(name='Correlation')
)

temp_yield_corr

Unnamed: 0,Country,Correlation
0,Angola,0.203196
1,Austria,0.364272
2,Bahamas,-0.004999
3,Bahrain,0.590496
4,Belgium,-0.499666
5,Bulgaria,0.26121
6,Croatia,0.363422
7,Denmark,0.420463
8,France,0.261548
9,Greece,0.115243


In [18]:
temp_yield_corr_chart = alt.Chart(temp_yield_corr).mark_bar().encode(
    x='Country:O',
    y=alt.Y('Correlation:Q',scale=alt.Scale(domain=[-1, 1])),
    color=alt.condition(
        alt.datum.Correlation > 0,
        alt.value('blue'),
        alt.value('red')
    ),
    tooltip=['Country', 'Correlation']
).properties(
    title='Country-wise Yield-Temperature Correlation',
    width=800,
    height=400
).interactive()

temp_yield_corr_chart

In [19]:
heatmap_bar_chart = alt.Chart(temp_yield_corr).mark_rect().encode(
    x='Country:O',
    y=alt.Y('Correlation:Q',scale=alt.Scale(domain=[-.75, .75])),
    color=alt.Color('Correlation:Q', scale=alt.Scale(domain=[-1, 1], scheme='redyellowgreen')),
    tooltip=['Country', 'Correlation']
).properties(
    title="Correlation Heatmap",
    width=600,
    height=400
)

lines = alt.Chart(pd.DataFrame({'y': [0.3, 0.7, -0.3, -0.7]})).mark_rule(strokeDash=[2,2]).encode(
    y='y:Q'
)

heatmap_bar_chart + lines

The Pearson correlation coefficients between crop yield and weather conditions (temperature) for each country. Here are our observations:

For most countries, there is a positive correlation between temperature and yield, indicating that as temperature increased, the yield also tended to increase. However, the strength of the correlation varies across countries with most being either weak and some moderate. None of them were a strong positive correlation. 

For some countries like Belgium and Zimbabwe, there's a negative correlation between temperature and yield, suggesting that as the temperature increased, the yield decreased.

In [20]:
trend_data = {
    'Country': [],
    'Temperature Trend': [],
}

for country in countries_with_positive_trend:
    country_data = aggregated_data[aggregated_data['Country'] == country]
    
    tau_temp, p_value_temp = stats.kendalltau(country_data['Year'], country_data['Temperature'])
    if p_value_temp < 0.05:
        if tau_temp > 0:
            trend_temp = "Increasing"
        else:
            trend_temp = "Decreasing"
    else:
        trend_temp = "No significant trend"
    
    trend_data['Country'].append(country)
    trend_data['Temperature Trend'].append(trend_temp)
    
temp_trend_df = pd.DataFrame(trend_data)

NameError: name 'stats' is not defined

In [None]:
grouped = temp_trend_df.groupby('Temperature Trend')
increasing_df = grouped.get_group('Increasing')
no_significant_trend_df = grouped.get_group('No significant trend')
decreasing_df = grouped.get_group('Decreasing')

In [None]:
increasing_df['Country'].to_list()

['Angola',
 'Bahrain',
 'Bulgaria',
 'Greece',
 'India',
 'Libya',
 'Mali',
 'Niger',
 'Romania',
 'Sierra Leone',
 'Sri Lanka',
 'Ukraine']

In [None]:
temp_independent = no_significant_trend_df['Country'].to_list()
temp_independent

['Austria',
 'Bahamas',
 'Belgium',
 'Croatia',
 'Denmark',
 'France',
 'Guatemala',
 'Ireland',
 'Italy',
 'Japan',
 'Kazakhstan',
 'Mongolia',
 'Norway',
 'Portugal',
 'Spain',
 'Switzerland',
 'Tajikistan',
 'Thailand']

In [None]:
decreasing_df['Country'].to_list()

['Zimbabwe']

In [None]:
aggregated_counts = temp_trend_df['Temperature Trend'].value_counts().reset_index()
aggregated_counts.columns = ['Temperature Trend', 'Count']

temp_trend_bar_chart = alt.Chart(aggregated_counts).mark_bar().encode(
    x='Temperature Trend:O',
    y='Count:Q',
    color='Temperature Trend:O',
    tooltip=['Temperature Trend', 'Count']
).properties(
    title="Count of Countries by Temperature Trend",
    width=300,
    height=300
)

temp_trend_bar_chart

In [None]:
positive_trend_data = positive_trend_data[positive_trend_data['Country'].isin(temp_independent)]
print(positive_trend_data.head())

## Analysis of Countries who have improved yields without a significant change in pesticide use

TODO: Need to figure this out, it's including each year

In [None]:
scatter_chart_yield_vs_pesticide = alt.Chart(positive_trend_data).mark_circle().encode(
    x='Pesticides:Q',
    y='Yield:Q',
    color='Country:N',
    tooltip=['Country', 'Pesticides', 'Yield']
).properties(
    title='Yield vs. Pesticide Use',
    width=600,
    height=400
)

scatter_chart_yield_vs_pesticide

In [None]:
start_values = positive_trend_data.groupby('Country').first()
end_values = positive_trend_data.groupby('Country').last()

percentage_change = pd.DataFrame({
    'Yield_perc': ((end_values['Yield'] - start_values['Yield']) / start_values['Yield']) * 100,
    'Pesticides_perc': ((end_values['Pesticides'] - start_values['Pesticides']) / start_values['Pesticides']) * 100
}).reset_index()

melted_df = percentage_change.melt(id_vars='Country', 
                                   value_vars=['Yield_perc', 'Pesticides_perc'], 
                                   var_name='Metric', value_name='Percentage Change')

bar_chart = alt.Chart(melted_df).mark_bar(opacity=0.7).encode(
    x='Country:N',
    y='Percentage Change:Q',
    color='Metric:N',
    tooltip=['Country', 'Metric', 'Percentage Change']
).properties(
    title='Percentage Change in Yield and Pesticide Use from Start to End Year',
    width=800,
    height=400
).interactive()

bar_chart

### Normalize data

In [None]:
trend_data = positive_trend_data.copy()

#Yield
yield_min = trend_data['Yield'].min()
yield_max = trend_data['Yield'].max()
trend_data['Yield_normalized'] = (trend_data['Yield'] - yield_min) / (yield_max - yield_min)

#Pesticides
pesticides_min = trend_data['Pesticides'].min()
pesticides_max = trend_data['Pesticides'].max()
trend_data['Pesticides_normalized'] = (trend_data['Pesticides'] - pesticides_min) / (pesticides_max - pesticides_min)

print(positive_trend_data.head())

     Country  Year      Yield  Pesticides  Temperature  Rainfall  Yield_normalized  Pesticides_normalized
136  Austria  1990  9181758.0  772.368182         9.23    1110.0          0.331428               0.037028
137  Austria  1991  8294770.0  816.204545         8.15    1110.0          0.297904               0.039130
138  Austria  1992  7928646.0  708.886364         9.49    1110.0          0.284066               0.033984
139  Austria  1993  8673002.0  724.712727         8.48    1110.0          0.312200               0.034742
140  Austria  1994  8861187.0  658.316364         9.89    1110.0          0.319312               0.031559


In [None]:
base = alt.Chart(trend_data)

yield_chart = base.mark_line(color='blue').encode(
    x='Year:O',
    y='Yield_normalized:Q',
    tooltip=['Year', 'Yield_normalized']
)

pest_chart = base.mark_line(color='red').encode(
    x='Year:O',
    y='Pesticides_normalized:Q',
    tooltip=['Year', 'Pesticides_normalized']
)

final_chart = (yield_chart + pest_chart).properties(
    width=800,
    height=400
).facet(
    row='Country:N'
)

final_chart

In [None]:
default_width = pd.get_option('display.width')
default_display_columns = pd.get_option('display.max_columns')
default_max_rows = pd.get_option('display.max_rows')

print(default_width) #80
print(default_display_columns) #10
print(default_max_rows) #20

pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 140)

140
10
60


In [None]:
first_values = positive_trend_data.groupby('Country').first()
last_values = positive_trend_data.groupby('Country').last()

yield_perc_increase = ((last_values['Yield'] - first_values['Yield']) / first_values['Yield']) * 100
pesticide_perc_decrease = ((first_values['Pesticides'] - last_values['Pesticides']) / first_values['Pesticides']) * 100

ranking_score = yield_perc_increase + pesticide_perc_decrease

ranking_df = pd.DataFrame({
    'Country': yield_perc_increase.index,
    'Yield Percentage Increase': yield_perc_increase.values,
    'Pesticide Percentage Decrease': pesticide_perc_decrease.values,
    'Ranking Score': ranking_score.values
})

ranking_df = ranking_df.sort_values(by='Ranking Score', ascending=False).reset_index(drop=True)

print(ranking_df)

        Country  Yield Percentage Increase  Pesticide Percentage Decrease  Ranking Score
0       Bahamas                 134.655536                      53.997814     188.653350
1    Kazakhstan                 135.699430                      49.141870     184.841299
2       Austria                  88.566285                      62.668063     151.234349
3       Croatia                  92.278096                      55.656563     147.934659
4      Thailand                  61.780656                      72.258082     134.038738
5      Mongolia                 131.725681                       0.000000     131.725681
6        France                  26.642640                      65.743098      92.385738
7         Italy                  18.078009                      73.296700      91.374709
8         Japan                   8.786725                      71.277374      80.064099
9      Portugal                  38.591621                      38.734437      77.326058
10      Ireland      

In [None]:
base_chart = alt.Chart(ranking_df).mark_bar().encode(
    x=alt.X('Country:N', sort='-y', title='Country', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('Ranking Score:Q', title='Ranking Score'),
    tooltip=['Country', 'Ranking Score', 'Pesticide Percentage Decrease', 'Yield Percentage Increase']
)

pesticide_chart = base_chart.encode(
    color=alt.Color('Pesticide Percentage Decrease:Q', scale=alt.Scale(scheme='yellowgreen'), title='% Decrease')
).properties(title="Ranking and Color by Pesticide % Decrease")

yield_chart = base_chart.encode(
    color=alt.Color('Yield Percentage Increase:Q', scale=alt.Scale(scheme='blues'), title='% Increase')
).properties(title="Ranking and Color by Yield % Increase")

combined_chart = alt.hconcat(pesticide_chart, yield_chart).resolve_scale(color='independent')

combined_chart
