In [3]:
temp_df

Unnamed: 0,year,country,avg_temp
0,1849,Côte D'Ivoire,25.58
1,1850,Côte D'Ivoire,25.52
2,1851,Côte D'Ivoire,25.67
3,1852,Côte D'Ivoire,
4,1853,Côte D'Ivoire,
...,...,...,...
71306,2009,Mexico,21.76
71307,2010,Mexico,20.90
71308,2011,Mexico,21.55
71309,2012,Mexico,21.52


We hypothesize that the missing values are not scattered at random but are concentrated in specific time periods for which data is unavailable. To investigate this, we use the Altair visualization library to plot the proportion of missing values over time and identify where they cluster. Because the dataset spans a wide range of years, we group the years into bins for a clearer visualization. Grouping the data in 10-year intervals strikes a balance between granularity and clarity, enabling us to pinpoint the periods in which values are absent.

In [15]:
# Proportion of missing Average_Temperature values per 10-year bin, shown as a bar chart.
bin_size = 10
min_year_temp = temp_df["Year"].min()
max_year_temp = temp_df["Year"].max()
# pd.cut is called with right=False, so each bin is half-open: [start, start + bin_size).
# The last edge therefore has to be strictly greater than the maximum year. The "+ 1"
# guarantees that even when (max - min) is an exact multiple of bin_size; without it the
# final edge equals the maximum year and that year's rows fall outside every bin.
bins_temp = list(range(int(min_year_temp), int(max_year_temp) + bin_size + 1, bin_size))

# Label each bin with the inclusive span of years it covers, e.g. "1900-1909".
bin_labels_temp = [f"{i}-{i + bin_size - 1}" for i in bins_temp[:-1]]
temp_df["Year_Bin"] = pd.cut(temp_df["Year"], bins=bins_temp, labels=bin_labels_temp, right=False)
# isnull().mean() is the fraction of rows in the bin whose temperature is missing.
temp_missing_by_bin = temp_df.groupby("Year_Bin", observed=True)["Average_Temperature"].apply(lambda x: x.isnull().mean())

# Flatten to a two-column frame that Altair can consume directly.
temp_missing_df = temp_missing_by_bin.reset_index()
temp_missing_df.columns = ['Year_Bin', 'Missing_Proportion']

temp_chart = alt.Chart(temp_missing_df).mark_bar().encode(
    x=alt.X('Year_Bin:O', title='Year Bin', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('Missing_Proportion:Q', title='Average Proportion of Missing Values'),
    tooltip=['Year_Bin', 'Missing_Proportion']
).properties(
    title='Average Proportion of Missing Values by Year Bin in Temperature Dataset',
    width=600,
    height=400
)

temp_chart

In [125]:
# Rows that have a missing value in any column, and the most recent year among them.
missing_data = temp_df.loc[temp_df.isnull().any(axis=1)]
latest_year_with_missing = missing_data['Year'].max()
# Number of missing temperature values from 1900 onwards.
temp_df.loc[temp_df["Year"] >= 1900, 'Average_Temperature'].isnull().sum()

0

In [29]:
# Keep only the rows from 1900 onwards. .copy() makes the result an independent
# frame instead of a slice of the original, so columns can be added to it later
# without SettingWithCopy warnings.
temp_df = temp_df[temp_df["Year"] >= 1900].copy()

In [36]:
# Per-column count of missing values among the rows from 1900 onwards.
temp_df.loc[temp_df["Year"] >= 1900].isnull().sum()

Year                   0
Country                0
Average_Temperature    0
dtype: int64

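We can see that, once the data is restricted to 1900 onwards, no missing values remain in any column of the temperature dataset.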

# Rainfall missing values

In [155]:
# Proportion of missing Average_Rainfall values, per 5-year bin and per individual year.
bin_size = 5
min_year_rainfall = rain_df["Year"].min()
max_year_rainfall = rain_df["Year"].max()
# pd.cut is called with right=False, so each bin is half-open: [start, start + bin_size).
# The "+ 1" keeps the last edge strictly above the maximum year even when (max - min)
# is an exact multiple of bin_size; otherwise the final year would fall outside every bin.
bins_rainfall = list(range(int(min_year_rainfall), int(max_year_rainfall) + bin_size + 1, bin_size))
# Label each bin with the inclusive span of years it covers, e.g. "1985-1989".
bin_labels_rainfall = [f"{i}-{i + bin_size - 1}" for i in bins_rainfall[:-1]]

rain_df["Year_Bin"] = pd.cut(rain_df["Year"], bins=bins_rainfall, labels=bin_labels_rainfall, right=False)
# observed=True restricts the result to bins that actually contain rows and matches
# how the temperature cell groups its bins.
rainfall_missing_by_bin = rain_df.groupby("Year_Bin", observed=True)["Average_Rainfall"].apply(lambda x: x.isnull().mean())

# The chart below plots the per-year proportions, not the binned series computed above.
rainfall_missing_by_year = rain_df.groupby("Year")["Average_Rainfall"].apply(lambda x: x.isnull().mean()).reset_index()
rainfall_missing_by_year.columns = ['Year', 'Missing_Proportion']

rainfall_chart = alt.Chart(rainfall_missing_by_year).mark_bar().encode(
    x=alt.X('Year:O', title='Year', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('Missing_Proportion:Q', title='Average Proportion of Missing Values'),
    tooltip=['Year', 'Missing_Proportion']
).properties(
    title='Average Proportion of Missing Values by Year in Rainfall Dataset',
    width=600,
    height=400
)

rainfall_chart

In [151]:
# Count the missing rainfall values for each country; reset_index(name=...) turns
# the grouped Series into a frame with columns Country and Missing_Count.
rainfall_missing_by_country = (
    rain_df.groupby('Country')['Average_Rainfall']
    .apply(lambda values: values.isnull().sum())
    .reset_index(name='Missing_Count')
)

In [152]:
rainfall_missing_by_country

Unnamed: 0,Country,Missing_Count
0,Afghanistan,0
1,Albania,0
2,Algeria,0
3,American Samoa,31
4,Andorra,0
...,...,...
212,Virgin Islands (U.S.),31
213,West Bank and Gaza,0
214,Yemen,0
215,Zambia,0


In [156]:
# Keep only the countries that have at least one missing rainfall value.
rainfall_missing_by_country = rainfall_missing_by_country[rainfall_missing_by_country["Missing_Count"] > 0]

# Bar chart of the missing-value count per country. The y-axis shows a count
# (Missing_Count), so it is labelled as a number rather than a proportion.
rainfall_country_chart = alt.Chart(rainfall_missing_by_country).mark_bar().encode(
    x=alt.X('Country:O', title='Country', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('Missing_Count:Q', title='Number of Missing Values'),
    tooltip=['Country', 'Missing_Count']
).properties(
    title='Number of Missing Values by Country in Rainfall Dataset',
    width=600,
    height=400
)

rainfall_country_chart

In [40]:
rain_df.nunique()

Country             217
Year                 31
Average_Rainfall    173
dtype: int64

In [42]:
# Distinct countries that have at least one missing rainfall value.
countries_with_missing_rainfall = rain_df.loc[rain_df["Average_Rainfall"].isnull(), "Country"].unique()

In [43]:
# Pesticide rows for the countries that have missing rainfall values.
subset_pest_df = pest_df.loc[pest_df["Country"].isin(countries_with_missing_rainfall)]
subset_pest_df

Unnamed: 0,Country,Year,Pesticides
426,Bermuda,1990,159.88
427,Bermuda,1991,159.88
428,Bermuda,1992,159.88
429,Bermuda,1993,159.88
430,Bermuda,1994,159.88
...,...,...,...
3912,Tonga,2012,11.21
3913,Tonga,2013,11.21
3914,Tonga,2014,15.09
3915,Tonga,2015,15.09


In [213]:
# Drop every country that has at least one missing rainfall value.
# .copy() makes the result an independent frame rather than a slice of rain_df,
# so it can be modified later without SettingWithCopy warnings.
rain_df_cleaned = rain_df[~rain_df["Country"].isin(countries_with_missing_rainfall)].copy()

print(f'Shape of cleaned rainfall dataframe: {rain_df_cleaned.shape}')
print('----------------------')
print('Missing values now')
print(rain_df_cleaned.isnull().sum())

Shape of cleaned rainfall dataframe: (5952, 4)
----------------------
Missing values now
Country             0
Year                0
Average_Rainfall    0
Year_Bin            0
dtype: int64


In [233]:
# One Markdown table row per column of the cleaned frame: the column name with
# underscores shown as spaces, and its centred count of missing values.
table_rows = "".join(
    f"| {column_name.replace('_', ' ')} |<center>{null_count}</center> |\n"
    for column_name, null_count in rain_df_cleaned.isnull().sum().items()
)

# Heading with the remaining row count, followed by the table header and its rows.
md_text = f"""
## Remaining rows in cleaned rainfall dataframe: {rain_df_cleaned.shape[0]}

---
## Missing values now

| Column       | Missing Values |
|--------------|----------------|
""" + table_rows

display(Markdown(md_text))


## Remaining rows in cleaned rainfall dataframe: 5952

---
## Missing values now

| Column       | Missing Values |
|--------------|----------------|
| Country |<center>0</center> |
| Year |<center>0</center> |
| Average Rainfall |<center>0</center> |
| Year Bin |<center>0</center> |


In [49]:
rain_df.dtypes

Country             object
Year                 int64
Average_Rainfall    object
dtype: object

In [50]:
rain_df.describe()

Unnamed: 0,Year
count,6727.0
mean,2001.354839
std,9.530114
min,1985.0
25%,1993.0
50%,2001.0
75%,2010.0
max,2017.0


In [51]:
# Average_Rainfall is stored as object dtype. Attempt a plain float conversion and
# report the failure instead of letting the exception stop a full notebook run;
# the error message names the first value that cannot be converted.
try:
    rain_df['Average_Rainfall'] = rain_df['Average_Rainfall'].astype(float)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not convert string to float: '..'

In [52]:
# Find the rainfall entries that cannot be read as numbers, without modifying rain_df.
# Casting the column to str would permanently turn real NaN values into the text
# 'nan', and str.isnumeric() rejects decimals and negative numbers, so the values
# are parsed with pd.to_numeric instead: anything unparseable (or missing) becomes NaN.
rainfall_as_number = pd.to_numeric(rain_df['Average_Rainfall'], errors='coerce')
non_numeric_rain = rain_df[rainfall_as_number.isna()]
# dropna=False keeps the genuinely missing values in the tally next to any placeholders.
non_numeric_rain['Average_Rainfall'].value_counts(dropna=False)

Average_Rainfall
nan    774
..       6
Name: count, dtype: int64

In [53]:
# keep=False flags every row that has an exact duplicate, not only the later repeats.
duplicated_temp_rows = (
    temp_df.loc[temp_df.duplicated(keep=False)]
    .sort_values(by=["Year", "Country"])
)
duplicated_temp_rows.head(10)

Unnamed: 0,Year,Country,Average_Temperature
9750,1743,United Kingdom,7.07
13518,1743,United Kingdom,7.07
1908,1743,United States,5.34
3992,1743,United States,5.34
5742,1743,United States,5.34
15107,1743,United States,5.44
41741,1743,United States,5.44
68917,1743,United States,5.34
9751,1744,United Kingdom,9.8
13519,1744,United Kingdom,9.8


In [54]:
duplicated_temp_rows.shape

(11963, 3)

In [55]:
temp_df.shape

(71311, 3)

In [60]:
# Write the United Kingdom rows to uk.csv in the current working directory.
uk_rows = temp_df.loc[temp_df['Country'] == 'United Kingdom']
uk_rows.to_csv("uk.csv")