In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import s3fs
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import pycountry

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [None]:
# Load the data
# Cleaned MLW csv file, read directly from the public S3 URL below
url = "https://plasticpollution.s3.amazonaws.com/MLW_data_clean.csv"
MLW_data_df = pd.read_csv(url)
# NOTE(review): displays the whole frame; pandas truncates the rendering
MLW_data_df

In [None]:
# Read the raw MLW export (ISO-8859-1 encoded) from the local Resources folder.
# NOTE(review): absolute, machine-specific path - this cell only runs where
# that exact directory exists.
MLW_raw_df = pd.read_csv(
    r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\MLW_Data.csv",
    encoding="iso-8859-1",
)
# Parse EventDate with the YYYYMMDD format and keep only the calendar year.
event_dates = pd.to_datetime(MLW_raw_df["EventDate"], format="%Y%m%d")
MLW_raw_df["EventDate"] = event_dates.dt.year
MLW_raw_df

In [None]:
# Work on a renamed copy so the raw frame keeps its original column names.
MLW_clean = MLW_raw_df.rename({'BeachCountrycode': 'country_code'}, axis='columns')
# Preview the first 75 rows.
MLW_clean.head(75)

In [None]:
# Replace NaN values with an empty string in the country_code column only.
# The previous frame-wide replace(np.nan, '', regex=True) also wrote '' into
# every other column containing NaN, turning numeric columns into object dtype.
# Empty strings (rather than NaN) are needed here because country_flag calls
# len() on each code.
MLW_clean = MLW_clean.assign(country_code=MLW_clean['country_code'].fillna(''))
MLW_clean

In [None]:
# One row per distinct value of MLW_clean.country_code, in a single
# 'country_code' column.
country_code_df = pd.DataFrame({'country_code': MLW_clean.country_code.unique()})
country_code_df

In [None]:
# Every alpha-2 and alpha-3 code known to pycountry; country_flag checks
# membership in these lists before looking a country up.
list_alpha_2 = [country.alpha_2 for country in pycountry.countries]
list_alpha_3 = [country.alpha_3 for country in pycountry.countries]

def country_flag(df):
    """Return the pycountry name for the code stored in ``df['country_code']``.

    Despite the parameter name, this is applied row-wise: ``df`` is anything
    that supports ``df['country_code']`` (a DataFrame row when used with
    ``DataFrame.apply(..., axis=1)``).

    A 2-character code is looked up as alpha-2 and a 3-character code as
    alpha-3, each only if it appears in ``list_alpha_2`` / ``list_alpha_3``.
    Anything else yields the placeholder ``'xxxx'``.
    """
    code = df['country_code']
    code_length = len(code)

    if code_length == 2 and code in list_alpha_2:
        return pycountry.countries.get(alpha_2=code).name
    if code_length == 3 and code in list_alpha_3:
        return pycountry.countries.get(alpha_3=code).name
    # Unknown, empty or wrongly sized code.
    return 'xxxx'

# Resolve each code to a country name, one row at a time.
country_code_df['country_name'] = country_code_df.apply(country_flag, axis='columns')
country_code_df

In [None]:
# Lookup Series: index = country code, value = resolved country name.
code_to_name = country_code_df.set_index('country_code')['country_name']
# Overwrite the codes in MLW_clean with the resolved names.
# NOTE(review): not idempotent - running this cell a second time maps the
# already-substituted names, which are not in the lookup index, to NaN.
MLW_clean['country_code'] = MLW_clean['country_code'].map(code_to_name)
MLW_clean

In [None]:
# Keep only the rows whose EventDate (a year at this point) is 2017-2021.
survey_years = [2017, 2018, 2019, 2020, 2021]
in_survey_years = MLW_clean['EventDate'].isin(survey_years)
MLW_clean_df = MLW_clean.loc[in_survey_years]
MLW_clean_df

In [None]:
# The column now holds country names, so label it accordingly.
MLW_clean_df = MLW_clean_df.rename({'country_code': 'Country'}, axis='columns')
MLW_clean_df

In [None]:
# Columns removed from the working frame.
# NOTE(review): re-running this cell raises KeyError because the columns are
# already gone.
columns_to_drop = [
    'CommunityName',
    'BeachName',
    'BeachRegionalSea',
    'BeachLocation',
    'BeachType',
    'EventType',
    'NatRef',
    'lon_x1',
    'lat_y2',
]
MLW_clean_df = MLW_clean_df.drop(columns=columns_to_drop)
MLW_clean_df

In [None]:
# Drop the two remaining lat/lon columns as well.
leftover_columns = ['lat_y1', 'lon_x2']
MLW_clean_df = MLW_clean_df.drop(columns=leftover_columns)
MLW_clean_df

In [None]:
# Load the data
# CSV file from S3: MLW_meta (cleaned), read straight from the public URL
url = "https://plasticpollution.s3.amazonaws.com/MLW_meta_clean.csv"
MLW_meta_df = pd.read_csv(url)
MLW_meta_df

In [None]:
# Load the data
# CSV file from S3: plastic waste mismanagement (cleaned)
url = "https://plasticpollution.s3.amazonaws.com/Plastic_waste_mismanagement_clean.csv"
mismanagement_df = pd.read_csv(url)
mismanagement_df

In [None]:
# Give the tonnage column a name without parentheses or spaces.
mismanagement_renames = {"PlasticWaste(metric tons)": "Metric_Tons"}
mismanagement_df = mismanagement_df.rename(columns=mismanagement_renames)
mismanagement_df

In [None]:
# Load the data
# CSV file from S3: pbt (cleaned)
url = 'https://plasticpollution.s3.amazonaws.com/pbt_clean.csv'
pbt_df = pd.read_csv(url)
pbt_df

In [None]:
# Normalise the upper-case source headers to the names used in the rest of
# the notebook.
pbt_renames = {
    'COUNTRY NAME': 'Country',
    'YEAR': 'Year',
    'VALUE': 'Metric_Tons',
}
pbt_df = pbt_df.rename(columns=pbt_renames)
# Which years does the pbt data contain?
pbt_df['Year'].unique()

In [None]:
# Summary statistics (pandas describe) for pbt_df.
pbt_df.describe()

In [None]:
# Bar chart of pbt_df: Metric_Tons (bar height) against Year (x position).
# NOTE(review): if several rows share a Year their bars are drawn on top of
# each other - confirm this is the intended picture.
plt.bar(x=pbt_df['Year'], height=pbt_df['Metric_Tons'])
plt.ylabel('Metric Tons')
plt.xlabel('Year')
plt.show()

In [None]:
# Load the local 2021_2016_2010.csv raw file.
# NOTE(review): absolute, machine-specific path - not reproducible elsewhere.
pwaste_df = pd.read_csv(r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\Raw_Data\2021_2016_2010.csv")
pwaste_df

In [None]:
# Display only: rows ordered by mpw_oceans_2021, largest first (pwaste_df itself is not modified).
pwaste_df.sort_values('mpw_oceans_2021', ascending=False)

In [None]:
# Summary statistics (pandas describe) for pwaste_df.
pwaste_df.describe()

In [None]:
# Load the local per-capita plastic waste vs GDP per capita raw file.
# NOTE(review): absolute, machine-specific path - not reproducible elsewhere.
GDP_population_df = pd.read_csv(r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\Raw_Data\2010_per-capita-plastic-waste-vs-gdp-per-capita.csv")
GDP_population_df

In [None]:
# Remove the three columns not used further on. `columns=` already selects
# the column axis, so no separate axis argument is needed.
gdp_unused_columns = ['Per capita plastic waste (kg/person/day)', 'Code', 'Continent']
GDP_population_df = GDP_population_df.drop(columns=gdp_unused_columns)
GDP_population_df

In [None]:
# Distinct values of the Year column, to see which years are available.
GDP_population_df.Year.unique()

In [None]:
# Restrict the GDP/population data to the years 2016-2020.
gdp_years = [2016, 2017, 2018, 2019, 2020]
in_gdp_years = GDP_population_df['Year'].isin(gdp_years)
GDP_population_df = GDP_population_df.loc[in_gdp_years]
GDP_population_df

In [None]:
# Shorten the verbose source headers and align 'Entity' with the 'Country'
# column name used by the other frames.
gdp_renames = {
    "GDP per capita, PPP (constant 2017 international $)": "GDP_per_capita",
    "Population (historical estimates)": "Population",
    "Entity": "Country",
}
GDP_population_df = GDP_population_df.rename(columns=gdp_renames)
GDP_population_df

In [None]:
# Summary statistics (pandas describe) for the filtered GDP/population frame.
GDP_population_df.describe()

In [None]:
# The ten largest GDP_per_capita values (returned as a Series keyed by row index).
GDP_population_df['GDP_per_capita'].nlargest(n=10)

In [None]:
# Display only: rows ordered by GDP_per_capita, smallest first.
GDP_population_df.sort_values('GDP_per_capita', ascending=True)

In [None]:
# Display only: the 60 rows with the largest Population values.
GDP_population_df.sort_values('Population', ascending=False).head(60)


In [None]:
# Countries singled out for comparison. Both spellings of the United States
# and of Russia are listed, so a match is found whichever one the data uses.
gdp_countries_of_interest = [
    'Saint Kitts and Nevis',
    'United Arab Emirates',
    'Indonesia',
    'Seychelles',
    'Nigeria',
    'United States of America',
    'Australia',
    'Russia',
    'Russian Federation',
    'Germany',
    'United States',
    'China',
    'Philippines',
    'Brazil',
    'Argentina',
]
is_country_of_interest = GDP_population_df['Country'].isin(gdp_countries_of_interest)
list_countries_df = GDP_population_df.loc[is_country_of_interest]
list_countries_df

In [None]:
# Single-column frame holding each distinct Country value in GDP_population_df.
GDP_population_uniques = pd.DataFrame(GDP_population_df.Country.unique())
GDP_population_uniques

In [None]:
# Display the selected-countries frame again (no new computation in this cell).
list_countries_df

In [None]:
# Summary statistics (pandas describe) for the selected countries.
list_countries_df.describe()


In [None]:
# Show every distinct Country value as a plain nested Python list (one inner list per row).
GDP_population_uniques.values.tolist()

In [None]:
# The Country column also contains income-group aggregate rows; pull those
# out into their own frame.
income_groups = [
    'High income',
    'Low and middle income',
    'Low income',
    'Lower middle income',
    'Middle income',
    'Upper middle income',
]
is_income_group = GDP_population_df['Country'].isin(income_groups)
income_df = GDP_population_df.loc[is_income_group]
income_df

In [None]:
# Drop Population from the income-group frame. `columns=` already selects
# the column axis, so no separate axis argument is needed.
income_df = income_df.drop(columns=['Population'])
income_df

In [None]:
# Summary statistics (pandas describe) for the income-group rows.
income_df.describe()

In [None]:
# Load plastic-waste-per-capita file from 2010
# NOTE(review): absolute, machine-specific path - not reproducible elsewhere.
pwaste_per_cap_df = pd.read_csv(r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\Raw_Data\plastic-waste-per-capita.csv")
pwaste_per_cap_df

In [None]:
# Distinct values of the Entity column in the per-capita plastic waste file.
pwaste_per_cap_df.Entity.unique()

In [None]:
# Load Mismanaged Plastic Waste Per Capita 2019
# NOTE(review): absolute, machine-specific path - not reproducible elsewhere.
mm_pw_pc = pd.read_csv(r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\Raw_Data\mismanaged-plastic-waste-per-capita.csv")
mm_pw_pc

In [None]:
# Load per-capita-mismanaged-plastic-waste-vs-gdp-per-capita.csv
# NOTE(review): absolute, machine-specific path - not reproducible elsewhere.
pc_mm_pw_vs_gdp = pd.read_csv(r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\Raw_Data\per-capita-mismanaged-plastic-waste-vs-gdp-per-capita.csv")
pc_mm_pw_vs_gdp

In [None]:
# Find the unique years present in the pc_mm_pw_vs_gdp frame

pc_mm_pw_vs_gdp.Year.unique()

In [None]:
# Keep only the 2017-2020 rows of the mismanaged-waste vs GDP data.
mismanaged_years = [2017, 2018, 2019, 2020]
in_mismanaged_years = pc_mm_pw_vs_gdp['Year'].isin(mismanaged_years)
pc_mm_pw_vs_gdp_df = pc_mm_pw_vs_gdp.loc[in_mismanaged_years]
pc_mm_pw_vs_gdp_df

In [None]:
# Load the local global-plastics-production.csv raw file.
# NOTE(review): absolute, machine-specific path - not reproducible elsewhere.
total_plastic_prod = pd.read_csv(r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\Raw_Data\global-plastics-production.csv")
total_plastic_prod

In [None]:
# Keep only the 2010-2015 rows of the global production data.
# NOTE(review): this window differs from the 2016/2017-2020 windows used for
# the other frames - confirm it is intentional.
production_years = [2010, 2011, 2012, 2013, 2014, 2015]
in_production_years = total_plastic_prod['Year'].isin(production_years)
total_plastic_prod_df = total_plastic_prod.loc[in_production_years]
total_plastic_prod_df

In [None]:
# Distinct Country values in pbt_df, to check how countries are spelled there.
pbt_df.Country.unique()

In [None]:
# Same country selection as used for the GDP data, applied to pbt_df. Both
# spellings of the United States and of Russia are listed.
pbt_countries_of_interest = [
    'Saint Kitts and Nevis',
    'United Arab Emirates',
    'Indonesia',
    'Seychelles',
    'Nigeria',
    'United States of America',
    'Australia',
    'Russia',
    'Russian Federation',
    'Germany',
    'United States',
    'China',
    'Philippines',
    'Brazil',
    'Argentina',
]
is_pbt_country_of_interest = pbt_df['Country'].isin(pbt_countries_of_interest)
pbt_countries_df = pbt_df.loc[is_pbt_country_of_interest]
# Show the selected rows as a plain nested list.
pbt_countries_df.values.tolist()

In [None]:
# Scatter plot for PBT: Metric_Tons per selected country.
plt.scatter(x=pbt_countries_df['Country'], y=pbt_countries_df['Metric_Tons'])
plt.ylabel('Metric Tons')
plt.xlabel('Country')
# Country names are long, so turn the tick labels vertical.
plt.xticks(rotation=90, ha='right')
plt.show()

In [None]:
# Bar chart for PBT: Metric_Tons per selected country.
plt.bar(x=pbt_countries_df['Country'], height=pbt_countries_df['Metric_Tons'])
plt.ylabel('Metric Tons')
plt.xlabel('Country')
# Country names are long, so turn the tick labels vertical.
plt.xticks(rotation=90, ha='right')
plt.show()

In [None]:
# Inner-join the selected-country GDP/population rows with the selected-country
# pbt rows on (Country, Year); only pairs present in both frames survive.
# NOTE(review): the earlier comment described this as covering 2017-2020, but
# this cell does not restrict the years itself - the range is whatever the two
# inputs have in common.
countries_GDP_MT = list_countries_df.merge(
    pbt_countries_df,
    on=['Country', 'Year'],
    how='inner',
)
countries_GDP_MT

In [None]:
# Load the local mismanaged-waste-global-total.csv raw file.
# NOTE(review): absolute, machine-specific path - not reproducible elsewhere.
global_total_df = pd.read_csv(r"C:\Users\mommy\Repositories\Dev\Analysis Projects\Ocean_Plastic_Pollution\Resources\Raw_Data\mismanaged-waste-global-total.csv")
global_total_df

In [None]:
# Save the merged GDP / plastic-waste table next to the other raw data files.
# The path is built with pathlib rather than a backslash string literal: in a
# non-raw string "\R" and "\c" are invalid escape sequences, which Python
# already warns about and plans to reject. On Windows the resulting path is
# unchanged, and on other platforms the file now lands inside Resources/Raw_Data.
countries_GDP_MT.to_csv(Path("Resources") / "Raw_Data" / "countries_GDP_MT.csv")