In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# load dataset
alltopcountries = pd.read_csv("VossMaria_CPE_590_Project/VossMaria_CPE_590_Project/Datasets/top26_numlabels.csv")

# Create the Logistic Regression model
x = alltopcountries[['Material Number Labels', 'Geo Number Labels', 'OBS_VALUE']].to_numpy() # features
y = alltopcountries[['Partner Number Labels']].to_numpy().ravel() # output
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=16)

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16, max_iter=1000)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [2]:
print(logreg.score(X_test,y_test))

0.07126756330302803


Logistic Regression is not a good fit for predicting where the trash will go here, so now, the input class will be the amount of trash taken in, and the output will be which country takes it in. 

In [3]:
# load dataset
alltopcountries = pd.read_csv("VossMaria_CPE_590_Project/VossMaria_CPE_590_Project/Datasets/top26_numlabels.csv")

# Create the Logistic Regression model
x = alltopcountries[['OBS_VALUE']].to_numpy() # features
y = alltopcountries[['Partner Number Labels']].to_numpy().ravel() # output
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=16)

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16, max_iter=1000)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [4]:
print(logreg.score(X_test,y_test))

0.05124351942294688


Even less of a good score! Wow logistic regression is not good for determining which country will take in the trash. Now what if we switch what the output label will be? Going to use Linear regression in order to get the amount of input trash. 

In [6]:
from sklearn.linear_model import LinearRegression
x = alltopcountries[['Material Number Labels', 'Geo Number Labels', 'Partner Number Labels']].to_numpy() # features
y = alltopcountries[['OBS_VALUE']].to_numpy().ravel() # output
X_Train, X_Test, Y_Train, Y_Test = train_test_split(x, y, test_size = 0.3, random_state = 0)
regressor = LinearRegression()
regressor.fit(X_Train, Y_Train)
print("R^2", regressor.score(x, y))

R^2 0.0069509252493757945


In [8]:
import sklearn.metrics as sm
Y_Pred = regressor.predict(X_Test)
# error
e= sm.mean_squared_error(Y_Test, Y_Pred)
# print accuracy
print("Accuracy = {}".format(1-e))

Accuracy = -49466119403.28881


Time to try using a lot less features. <br>
Going to only keep Glass, Paper, Plastics including Rubber, Metal, Organics, and Textiles from Materials. <br>
Going to keep only the EU as a whole instead of each specific country. <br>
Only going to keep the top 10 countries importing trash from the EU at the moment. <br>
Maybe all of this will make a better model. 

Top 10 Countries: <br> 
'TR': 148062623.0 = Turkiye (Turkey) <br>
'CN': 103565534.0 = China including Hong Kong <br>
'IN': 31944803.0 = India <br>
'UK': 28741386.0 = United Kingdom <br>
'CH': 26099504.0 = Switzerland <br>
'NO': 18474897.0 = Norway <br>
'ID': 14482476.0 = Indonesia <br>
'EG': 14327843.0 = Egypt <br>
'US': 13218107.0 = United States <br>
'HK': 12033537.0 = Hong Kong <br>
'PK': 10418970.0 = Pakistan <br>
We're going to exclude Hong Kong, though. 

In [19]:
# Processing this new dataset I need
less_materials1 = alltopcountries[alltopcountries['Material Number Labels'] == '245']
less_materials2 = alltopcountries[alltopcountries['Material Number Labels'] == '246']
less_materials3 = alltopcountries[alltopcountries['Material Number Labels'] == '250']
less_materials4 = alltopcountries[alltopcountries['Material Number Labels'] == '253']
less_materials5 = alltopcountries[alltopcountries['Material Number Labels'] == '254']
less_materials6 = alltopcountries[alltopcountries['Material Number Labels'] == '258']
less_materials = pd.concat([less_materials1, less_materials2, less_materials3, less_materials4, less_materials5, less_materials6], ignore_index=True, sort=False)

filtercountries1 = less_materials[less_materials['partner'] == 'TR']
filtercountries2 = less_materials[less_materials['partner'] == 'CN']
filtercountries3 = less_materials[less_materials['partner'] == 'IN']
filtercountries4 = less_materials[less_materials['partner'] == 'UK']
filtercountries5 = less_materials[less_materials['partner'] == 'CH']
filtercountries6 = less_materials[less_materials['partner'] == 'NO']
filtercountries7 = less_materials[less_materials['partner'] == 'ID']
filtercountries8 = less_materials[less_materials['partner'] == 'EG']
filtercountries9 = less_materials[less_materials['partner'] == 'US']
filtercountries10 = less_materials[less_materials['partner'] == 'PK']
filter_countries = pd.concat([filtercountries1, filtercountries2, filtercountries3, filtercountries4, filtercountries5, filtercountries6, filtercountries7, filtercountries8, filtercountries9, filtercountries10], ignore_index=True, sort=False)

allof_EU = filter_countries[filter_countries['geo'] == 'EU27_2020']
print(len(less_materials1))

0


In [None]:
from sklearn.linear_model import LinearRegression
x = allof_EU[['Material Number Labels', 'Geo Number Labels', 'Partner Number Labels']].to_numpy() # features
y = allof_EU[['OBS_VALUE']].to_numpy().ravel() # output
X_Train, X_Test, Y_Train, Y_Test = train_test_split(x, y, test_size = 0.3, random_state = 0)
regressor = LinearRegression()
regressor.fit(X_Train, Y_Train)
print("R^2", regressor.score(x, y))

Time to try with one-hot encoding instead of label encoding. 

In [9]:
aggregate_trash = alltopcountries.drop(columns=['stk_flow', 'rawmat', 'Material Number Labels', 'partner', 'Partner Number Labels', 'geo', 'Geo Number Labels'])

# one-hot-encode using sklearn
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoded_results = encoder.fit_transform(aggregate_trash).toarray()

MemoryError: Unable to allocate 101. GiB for an array with shape (106470, 127642) and data type float64