In [1]:
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from IPython.display import display, Markdown

import warnings
# Silence every FutureWarning raised after this point.
# NOTE(review): this is a blanket filter; it will also hide deprecation notices
# from the libraries imported above — confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# Column subsets kept from the pesticide and yield tables after renaming.
column_mask_pest = ['Country', 'Year', 'Pesticides']
column_mask_yield = ['Country', 'Item', 'Year', 'Yield']

# Pesticides: rename 'Area'/'Value', then keep only the masked columns.
pest_df = pd.read_csv('./pesticides.csv').rename(
    columns={'Area': 'Country', 'Value': 'Pesticides'}
)[column_mask_pest]

# Rainfall: note that the source key for the country column is written with a
# leading space (' Area').
rain_df = pd.read_csv('./rainfall.csv').rename(
    columns={' Area': 'Country', 'average_rain_fall_mm_per_year': 'Average_Rainfall'}
)

# Temperature: capitalise the key columns and give the reading a descriptive name.
temp_df = pd.read_csv('./temp.csv').rename(
    columns={'year': 'Year', 'country': 'Country', 'avg_temp': 'Average_Temperature'}
)

# Yield: rename, keep the masked columns, then sum 'Yield' over every row
# sharing the same (Country, Year) pair, which collapses the 'Item' column.
yield_df = (
    pd.read_csv('./yield.csv')
    .rename(columns={'Area': 'Country', 'Value': 'Yield'})[column_mask_yield]
    .groupby(['Country', 'Year'])['Yield']
    .sum()
    .reset_index()
)

In [6]:
# Print the column dtypes and non-null counts of the temperature table.
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71311 entries, 0 to 71310
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year                 71311 non-null  int64  
 1   Country              71311 non-null  object 
 2   Average_Temperature  68764 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


In [9]:
# Write every temperature row whose Country is 'United States' to usa.csv.
is_united_states_row = temp_df['Country'].eq('United States')
temp_df.loc[is_united_states_row].to_csv("usa.csv")

In [29]:
# Summary statistics over the United States rows recorded for the year 2000.
temp_df.query("Country == 'United States' and Year == 2000").describe()

Unnamed: 0,Year,Average_Temperature
count,52.0,52.0
mean,2000.0,15.087115
std,0.0,4.119794
min,2000.0,6.03
25%,2000.0,12.2775
50%,2000.0,15.02
75%,2000.0,17.225
max,2000.0,23.5


In [26]:
# For the Bahamas rows, count the non-null temperature readings in each year,
# then tally how many years share each of those counts.
bahamas_rows = temp_df.loc[temp_df['Country'].eq('Bahamas')]
bahamas_rows.groupby('Year')['Average_Temperature'].count().value_counts()

Average_Temperature
1    231
0     25
Name: count, dtype: int64

In [43]:
# Count how many rows exist for every (Year, Country) pair in the temperature
# table, then keep only the pairs that appear more than once.
pair_counts = temp_df.groupby(['Year', 'Country']).size().reset_index(name='Count')
duplicate_counts_temp = pair_counts[pair_counts['Count'] > 1]

# Narrow the duplicate table down to the United States and report how many
# years share each per-year row count.
usa = duplicate_counts_temp[duplicate_counts_temp['Country'] == 'United States']
print(f"Duplicate values by year for the USA: {usa['Count'].value_counts()}")
usa

Duplicate values by year for the USA: Count
52    165
31     45
22     15
44     14
27     10
29      7
40      7
37      5
38      3
Name: count, dtype: int64


Unnamed: 0,Year,Country,Count
37,1743,United States,22
75,1744,United States,22
113,1745,United States,22
151,1746,United States,22
189,1747,United States,22
...,...,...,...
27959,2009,United States,52
28096,2010,United States,52
28233,2011,United States,52
28370,2012,United States,52


In [47]:
# Number of years observed for each per-year row count of the USA, ordered by
# the row count itself rather than by frequency.
usa_count_frequency = usa['Count'].value_counts()
usa_count_frequency.sort_index()

Count
22     15
27     10
29      7
31     45
37      5
38      3
40      7
44     14
52    165
Name: count, dtype: int64

In [28]:
# Show the rows of the duplicate table whose Country is 'United States'.
duplicate_counts_temp.loc[duplicate_counts_temp['Country'].eq('United States')]

Unnamed: 0,Year,Country,count
37,1743,United States,22
75,1744,United States,22
113,1745,United States,22
151,1746,United States,22
189,1747,United States,22
...,...,...,...
27959,2009,United States,52
28096,2010,United States,52
28233,2011,United States,52
28370,2012,United States,52


In [31]:
# Summary statistics over the United Kingdom rows recorded for the year 1998.
temp_df.query("Country == 'United Kingdom' and Year == 1998").describe()

Unnamed: 0,Year,Average_Temperature
count,5.0,5.0
mean,1998.0,9.71
std,0.0,0.703776
min,1998.0,8.56
25%,1998.0,9.59
50%,1998.0,10.0
75%,1998.0,10.0
max,1998.0,10.4


In [33]:
# Summary statistics over the United States rows recorded for the year 2000.
temp_df.query("Country == 'United States' and Year == 2000").describe()

Unnamed: 0,Year,Average_Temperature
count,52.0,52.0
mean,2000.0,15.087115
std,0.0,4.119794
min,2000.0,6.03
25%,2000.0,12.2775
50%,2000.0,15.02
75%,2000.0,17.225
max,2000.0,23.5


In [48]:
# Rows of the duplicate table belonging to the United States.
usa_duplicates = duplicate_counts_temp[duplicate_counts_temp['Country'] == 'United States']

# Line chart of the per-year row count for the United States.
# Year is an integer column, so it is encoded as quantitative; the 'd' axis
# format prints the labels as plain integers instead of with thousands
# separators (e.g. "2000" rather than "2,000").
chart = alt.Chart(usa_duplicates).mark_line().encode(
    x=alt.X('Year:Q', axis=alt.Axis(format='d')),
    y='Count',
    tooltip=['Year', 'Count']
).properties(
    title='Duplicate Counts for United States by Year',
    width=600,
    height=400
)

chart

In [55]:
# The seven countries with the largest total of duplicated rows across all years.
top_countries = duplicate_counts_temp.groupby('Country')['Count'].sum().nlargest(7).index.tolist()

# Keep only the duplicate-table rows belonging to those countries.
filtered_data = duplicate_counts_temp[duplicate_counts_temp['Country'].isin(top_countries)]

# One line per country showing its per-year row count.
# Year is an integer column, so it is encoded as quantitative; the 'd' axis
# format prints the labels as plain integers instead of with thousands
# separators (e.g. "2000" rather than "2,000").
chart = alt.Chart(filtered_data).mark_line().encode(
    x=alt.X('Year:Q', axis=alt.Axis(format='d')),
    y='Count',
    color='Country',
    tooltip=['Year', 'Count', 'Country']
).properties(
    title='Duplicate Counts by Year for Top 7 Countries',
    width=600,
    height=300
)

chart

In [51]:
# Total of the 'Count' column per country across all duplicated (Year, Country) pairs.
counts_by_country = duplicate_counts_temp.groupby('Country')['Count']
counts_by_country.sum()

Country
Argentina                               318
Australia                              1027
Bangladesh                              436
Brazil                                 1989
Canada                                 1541
Chile                                   318
China                                  6168
Colombia                                378
Congo (Democratic Republic Of The)      316
Côte D'Ivoire                           330
Dominican Republic                      380
Ecuador                                 443
Egypt                                   412
Germany                                 813
India                                  4756
Indonesia                              1134
Iran                                    536
Iraq                                    390
Italy                                   542
Japan                                  1030
Kazakhstan                              388
Libya                                   522
Mexico                  