# Predicting the supplier country from invoice data 

In [1]:
# Load the invoice dataset (supplier details plus the SupplierCountry label)
import pandas as pd
import numpy as np

INVOICE_CSV_PATH = "data/invoice_country.csv"

# The file is decoded with the 'unicode_escape' codec, as in the original load call
supcountry = pd.read_csv(INVOICE_CSV_PATH, encoding="unicode_escape")
supcountry

Unnamed: 0,SupplierName,SupplierAddress,SupplierPhone,SupplierEmail,AccountNumber,BankName,SortCode,SwiftCode,IBAN,Currency,Status,Zone,SupplierAddress1,SupplierCity,SupplierPostalCode,SupplierGSTNumber,BSBCode,SupplierCountry
0,RK,18 Sin Ming Lane #01-08 Midview City,,,,,,,,SGD,1,,,Singapore,573960,201617200G,,Singapore
1,Kai Guderjahn,34 Midelton Avenue North Bondi Sydney,,kai.guderjahn@gmail.com.au,10094529,Commonwealth Bank,,CTBAAU2S,,AUD,2,,,NSW,2026,,063-097,Australia
2,PSN Events Pty Ltd,PO BOX 647 POTTS POINT,,,16708085,,,,,AUD,2,,,NSW,1335,,062-000,Australia
3,Apple Pty Ltd,PO Box A2629 Sydney South,,,2510054,,,,,AUD,2,,,NSW,1235,,,Australia
4,BAN HOCK FOOD PTE LTD,34 Woodlands Terrace Woodlands East industrial...,,enguiries@bannock.ccm.sg,,,,,,SGD,1,,,Singapore,739453,M2-0064933-8,,Singapore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,ADirect Singapore Pte. Ltd.,,12345676,Adirect.renepay@gmail.com,,,,,,SGD,2,,"No 2-15, Food Xchange Singapore ,Singapore - 7...",,,,,Singapore
193,ADirect Singapore Pte. Ltd.,,12345676,Adirect.renepay@gmail.com,,,,,,SGD,2,,"8A Admiralty Street No 2-15, Food Xchange Sing...",,,,,Singapore
194,ADirect Singapore Pte. Ltd.,,12345676,Adirect.renepay@gmail.com,,,,,,SGD,2,,"8A Admiralty Street No 2-15, Food Xchange Sing...",,,,,Singapore
195,ADirect Singapore Pte. Ltd.,,12345676,Adirect.renepay@gmail.com,,,,,,SGD,2,,"8A Admiralty Street No 2-15, Food Xchange Sing...",,,,,Singapore


In [2]:
# Count the missing (NaN) entries in each column
null_counts = supcountry.isna().sum()
null_counts

SupplierName            2
SupplierAddress        72
SupplierPhone         129
SupplierEmail          62
AccountNumber          44
BankName               94
SortCode              131
SwiftCode             133
IBAN                  156
Currency                0
Status                  0
Zone                  197
SupplierAddress1      159
SupplierCity            9
SupplierPostalCode     40
SupplierGSTNumber     159
BSBCode               169
SupplierCountry         2
dtype: int64

In [3]:
# Discard rows whose label (SupplierCountry) is missing, then re-check
# the per-column null counts on the remaining rows
supcountry = supcountry.dropna(subset=["SupplierCountry"])
supcountry.isna().sum()

SupplierName            0
SupplierAddress        70
SupplierPhone         127
SupplierEmail          60
AccountNumber          42
BankName               92
SortCode              129
SwiftCode             131
IBAN                  154
Currency                0
Status                  0
Zone                  195
SupplierAddress1      157
SupplierCity            7
SupplierPostalCode     38
SupplierGSTNumber     157
BSBCode               167
SupplierCountry         0
dtype: int64

In [4]:
# Separate the label column (y) from the feature columns (x)
y = supcountry["SupplierCountry"]
x = supcountry.drop(columns=["SupplierCountry"])

In [5]:
# Fill missing feature values with scikit-learn

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Columns to impute; all of them are handled as categorical values
cat_features = [
    "SupplierName",
    "SupplierAddress",
    "SupplierPhone",
    "SupplierEmail",
    "AccountNumber",
    "BankName",
    "SortCode",
    "SwiftCode",
    "IBAN",
    "Currency",
    "Status",
    "Zone",
    "SupplierAddress1",
    "SupplierCity",
    "SupplierPostalCode",
    "SupplierGSTNumber",
    "BSBCode",
]

# Replace every missing entry with the literal placeholder "missing"
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

# Wrap the imputer in a ColumnTransformer so it is applied to the listed columns
imputer = ColumnTransformer(
    transformers=[("cat_imputer", cat_imputer, cat_features)]
)

# Fit on the features and return the imputed values as an array
filled_x = imputer.fit_transform(x)
filled_x

array([['RK', '18 Sin Ming Lane #01-08 Midview City', 'missing', ...,
        '573960', '201617200G', 'missing'],
       ['Kai Guderjahn', '34 Midelton Avenue North Bondi Sydney',
        'missing', ..., '2026', 'missing', '063-097'],
       ['PSN Events Pty Ltd', 'PO BOX 647 POTTS POINT', 'missing', ...,
        '1335', 'missing', '062-000'],
       ...,
       ['ADirect Singapore Pte. Ltd.', 'missing', '12345676', ...,
        'missing', 'missing', 'missing'],
       ['ADirect Singapore Pte. Ltd.', 'missing', '12345676', ...,
        'missing', 'missing', 'missing'],
       ['ADirect Singapore Pte. Ltd.', 'missing', '12345676', ...,
        'missing', 'missing', 'missing']], dtype=object)

In [6]:
# Rebuild a labelled DataFrame from the imputed array.
# Reuse `cat_features` (the column order handed to the imputing ColumnTransformer,
# which is also the column order of `filled_x`) instead of a second hand-typed
# list, so the column names cannot drift out of sync with the data.
sup_filled = pd.DataFrame(filled_x, columns=cat_features)

# Sanity check: every column should now report zero missing values
sup_filled.isna().sum()

SupplierName          0
SupplierAddress       0
SupplierPhone         0
SupplierEmail         0
AccountNumber         0
BankName              0
SortCode              0
SwiftCode             0
IBAN                  0
Currency              0
Status                0
Zone                  0
SupplierAddress1      0
SupplierCity          0
SupplierPostalCode    0
SupplierGSTNumber     0
BSBCode               0
dtype: int64

In [7]:
# Turn the categorical features into numbers with one-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor  # not used in this cell

# Every column of the imputed frame is categorical, so take the list from the
# frame itself rather than repeating the column names by hand.
categorical_features = list(sup_filled.columns)

# handle_unknown="ignore": a category that was not seen during fit is encoded
# as all zeros instead of raising, so `transformer.transform` can be applied
# to new invoices (new supplier names, addresses, ...). The result of
# fit_transform below is unaffected by this option.
one_hot = OneHotEncoder(handle_unknown="ignore")
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Fit the encoder and produce the encoded feature matrix
transformed_x = transformer.fit_transform(sup_filled)
transformed_x

<195x441 sparse matrix of type '<class 'numpy.float64'>'
	with 3315 stored elements in Compressed Sparse Row format>

In [8]:
# Encode the country names as integer class ids

from sklearn.preprocessing import LabelEncoder

country_names = supcountry["SupplierCountry"]

# Learn the set of country names, then map each row to its integer id
le = LabelEncoder().fit(country_names)
label = le.transform(country_names)

label



array([4, 0, 0, 0, 4, 4, 4, 4, 0, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 4, 4, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 0, 0, 0, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 5, 5, 3, 3, 3, 3, 5, 5, 5,
       4, 5, 5, 5, 1, 5, 5, 5, 5, 5, 6, 5, 6, 6, 5, 7, 5, 5, 1, 4, 4, 5,
       5, 4, 5, 1, 5, 5, 5, 4, 5, 4, 5, 4, 4, 5, 5, 5, 4, 5, 4, 4, 4, 5,
       1, 4, 1, 4, 4, 5, 4, 5, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4])

In [9]:
# Country names learned by the label encoder; position i is the name for integer label i
le.classes_

array(['Australia', 'India', 'Ireland', 'NL', 'Singapore', 'UK', 'USA',
       'ZA'], dtype=object)

## Train and evaluate a random forest classifier

In [10]:
# Split the data into training and testing sets, then fit a random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the split, the fitted forest and the reported score are
# reproducible when the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation.
# The split is deliberately not stratified: some countries occur only once in
# `label`, and a stratified split requires at least two members per class.
x_train, x_test, y_train, y_test = train_test_split(
    transformed_x, label, test_size=0.2, random_state=RANDOM_STATE
)

# Initialise the estimator
clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# Fit on the training rows
clf.fit(x_train, y_train)

# Predicted integer class ids for the held-out rows (decode them with le.classes_)
clf.predict(x_test)

array([1, 5, 5, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 0, 4, 5, 4, 5, 4, 4, 5, 5,
       5, 5, 4, 4, 5, 5, 4, 5, 0, 5, 4, 0, 0, 5, 4, 5, 5])

In [11]:
# Lookup table for decoding the predicted integer ids above into country names
le.classes_

array(['Australia', 'India', 'Ireland', 'NL', 'Singapore', 'UK', 'USA',
       'ZA'], dtype=object)

In [12]:
# Mean accuracy of the fitted classifier on the held-out test rows
clf.score(x_test,y_test)

0.9487179487179487