In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from path import Path
from collections import Counter
import s3fs
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import pycountry

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Load the data
all_countries_df = pd.read_csv(Path("Resources/all_countries.csv"), index_col=0)

all_countries_df

Unnamed: 0,Country,Year,GDP_per_capita,Population,Metric_Tons
0,Albania,2017,12770.963867,2884169.0,16481.0
1,Albania,2018,13317.092773,2882735.0,18094.0
2,Albania,2019,13656.592773,2880913.0,18500.0
3,Albania,2020,13191.952148,2877800.0,20157.0
4,Algeria,2017,11737.401367,41389174.0,153459.0
...,...,...,...,...,...
491,Uruguay,2020,21608.429688,3473727.0,110834.0
492,Vanuatu,2017,3081.835205,285499.0,868.0
493,Vanuatu,2018,3093.461426,292675.0,841.0
494,Vanuatu,2019,3137.240967,299882.0,997.0


In [4]:
all_countries_df.sort_values('GDP_per_capita', ascending=False)

Unnamed: 0,Country,Year,GDP_per_capita,Population,Metric_Tons
380,Qatar,2017,91738.750000,2724727.0,793.0
381,Qatar,2018,90970.062500,2781682.0,832.0
227,Ireland,2020,90789.218750,4937796.0,556.0
382,Qatar,2019,89966.453125,2832071.0,868.0
226,Ireland,2019,86650.007812,4882498.0,508.0
...,...,...,...,...,...
315,Mozambique,2020,1229.940308,31255435.0,73396.0
430,Somalia,2019,1186.450928,15442906.0,2371.0
431,Somalia,2020,1180.965942,15893219.0,2151.0
429,Somalia,2018,1129.531494,15008225.0,2136.0


In [5]:
all_countries_df = all_countries_df.dropna()

In [6]:
all_countries_bins_df = all_countries_df.copy(deep=True)

In [7]:
# Put all GDP per capita values in income bins
bins = [0, 12535]
labels = ['low_income''upper_income']
all_countries_bins_df['GDP_per_capita'] = pd.cut(x = all_countries_bins_df['GDP_per_capita'], bins = bins, right=True, labels = labels)
all_countries_bins_df.GDP_per_capita.dtypes

CategoricalDtype(categories=['low_incomeupper_income'], ordered=True)

In [8]:
all_countries_bins_df.GDP_per_capita.unique()

[NaN, 'low_incomeupper_income']
Categories (1, object): ['low_incomeupper_income']

In [9]:
# all_countries_bins_df = all_countries_bins_df.drop('Country', 'Year', axis=1)
all_countries_bins_df = all_countries_bins_df.drop(columns=['Country', 'Year'], axis=1)
all_countries_bins_df

Unnamed: 0,GDP_per_capita,Population,Metric_Tons
0,,2884169.0,16481.0
1,,2882735.0,18094.0
2,,2880913.0,18500.0
3,,2877800.0,20157.0
4,low_incomeupper_income,41389174.0,153459.0
...,...,...,...
491,,3473727.0,110834.0
492,low_incomeupper_income,285499.0,868.0
493,low_incomeupper_income,292675.0,841.0
494,low_incomeupper_income,299882.0,997.0


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = all_countries_bins_df.copy()
df2['GDP_per_capita'] = le.fit_transform(df2['GDP_per_capita'])
df2.GDP_per_capita.unique()

array([1, 0])

In [11]:
df2

Unnamed: 0,GDP_per_capita,Population,Metric_Tons
0,1,2884169.0,16481.0
1,1,2882735.0,18094.0
2,1,2880913.0,18500.0
3,1,2877800.0,20157.0
4,0,41389174.0,153459.0
...,...,...,...
491,1,3473727.0,110834.0
492,0,285499.0,868.0
493,0,292675.0,841.0
494,0,299882.0,997.0


In [12]:
# Encode the GDP per capita into columns using get_dummies
country_binary_encoded = pd.get_dummies(all_countries_bins_df, columns=["GDP_per_capita"])
country_binary_encoded

Unnamed: 0,Population,Metric_Tons,GDP_per_capita_low_incomeupper_income
0,2884169.0,16481.0,0
1,2882735.0,18094.0,0
2,2880913.0,18500.0,0
3,2877800.0,20157.0,0
4,41389174.0,153459.0,1
...,...,...,...
491,3473727.0,110834.0,0
492,285499.0,868.0,1
493,292675.0,841.0,1
494,299882.0,997.0,1


In [13]:
# Create instance of StandardScaler
# from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [14]:
# Scale the date in df
df2 = pd.DataFrame(StandardScaler().fit_transform(df2), columns=df2.columns, index=df2.index)
# df2 = data_scaler.fit_transform(df2)

In [15]:
df2[:5]

Unnamed: 0,GDP_per_capita,Population,Metric_Tons
0,0.881007,-0.254142,-0.229396
1,0.881007,-0.25415,-0.223933
2,0.881007,-0.25416,-0.222559
3,0.881007,-0.254178,-0.216947
4,-1.135065,-0.039366,0.234456


In [16]:
print(np.mean(df2[:,0]))
print(np.std(df2[:,0]))

TypeError: '(slice(None, None, None), 0)' is an invalid key

In [None]:
# Define the features set.

X = df2.copy()
X = X.drop("Metric_Tons")
X.head()

In [None]:
# Define the target set.
y = country_binary_encoded["Metric_Tons"].values
y[:5]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Splitting into Train and Test sets into an 80/20 split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [None]:
# Determine the shape of our training and testing sets.
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
predictions

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df