In [23]:
!pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 KB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.1/

In [2]:
# S3 location of the zipped data set. Both values are interpolated into the
# `aws s3 cp` shell command in the following cell.
bucket = 'unspsc-data'
key = 'data.zip'

In [3]:
# Copy the archive from S3 into the working directory. IPython substitutes
# {bucket} and {key} with the Python variables of the same names before the
# command is handed to the shell.
!aws s3 cp s3://{bucket}/{key} {key}

download: s3://unspsc-data/data.zip to ./data.zip                   


In [4]:
!unzip data.zip

Archive:  data.zip
   creating: data/
   creating: data/canada/
  inflating: data/canada/nibs-gsin.csv  
  inflating: data/canada/tpsgc-pwgsc_aa-a.csv  
   creating: data/california/
  inflating: data/california/purchase-order-data-2012-2015-.csv  
   creating: data/codes/
  inflating: data/codes/data-unspsc-codes.csv  
   creating: data/australia/
  inflating: data/australia/2016-2017-australian-government-contract-data.csv  
  inflating: data/australia/17-18-fy-dataset.csv  
  inflating: data/australia/20152016-fy-austender-cns.zip  
  inflating: data/australia/2019-20-australian-government-contract-data.xlsx  
  inflating: data/australia/20142015fy.csv  


In [10]:
import pandas as pd
import numpy as np
import openpyxl
import sagemaker
import boto3

# Read in the Code Data

In [12]:
# Load the UNSPSC code table (Segment / Family / Class / Commodity codes with
# their names) from the extracted archive.
# NOTE(review): encoding='latin-1' is presumably needed because the file is not
# valid UTF-8 -- confirm against the source file.
codes = pd.read_csv('data/codes/data-unspsc-codes.csv', encoding='latin-1')

In [13]:
# Preview the first rows of the code table.
codes.head()

Unnamed: 0,Segment,Segment Name,Family,Family Name,Class,Class Name,Commodity,Commodity Name
0,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101501,Cats
1,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101502,Dogs
2,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101504,Mink
3,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101505,Rats
4,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101506,Horses


In [14]:
# Check how the columns were parsed: the code columns load as int64 and the
# name columns as object (strings).
codes.dtypes

Segment            int64
Segment Name      object
Family             int64
Family Name       object
Class              int64
Class Name        object
Commodity          int64
Commodity Name    object
dtype: object

In [16]:
# Columns carried through unchanged by the concatenation step.
index_columns = [
    'Segment',
    'Segment Name',
]
# Text columns that are lower-cased and merged into a single field.
column_list = [
    'Family Name',
    'Class Name',
    'Commodity Name',
]

# Everything else in `codes` -- i.e. the columns that will be dropped.
[column for column in codes.columns if column not in column_list + index_columns]

['Family', 'Class', 'Commodity']

In [19]:
def concatenate_and_remove(df: pd.DataFrame, column_list: list, index_columns: list) -> pd.DataFrame:
    """Collapse several text columns into one lower-cased ``output_field`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; it is not modified.
    column_list : list
        Names of the string columns to lower-case and join, in the given
        order, with a single space between parts.
    index_columns : list
        Names of the columns to carry over unchanged.

    Returns
    -------
    pd.DataFrame
        A new frame holding ``index_columns`` followed by ``output_field``.
        All other columns of ``df`` are dropped.

    Notes
    -----
    The parts are joined with a separator rather than each being suffixed
    with a space, so ``output_field`` has no trailing space. A missing value
    in one of the text columns is skipped instead of turning the whole
    ``output_field`` into NaN; a row with no usable parts gets an empty string.
    """
    # Lower-case each text column once, up front (vectorised).
    lowered_columns = [df[column].str.lower() for column in column_list]

    if lowered_columns:
        # Walk the rows in parallel across the lowered columns. Anything that
        # is not a string (NaN / None from a missing name) is left out of the
        # join so it neither nulls the row nor leaves a doubled separator.
        output_field = [
            ' '.join(part for part in row_parts if isinstance(part, str))
            for row_parts in zip(*lowered_columns)
        ]
    else:
        # Nothing to join: still provide the column, filled with empty strings.
        output_field = ''

    # Selecting with a list and calling assign() both return new frames, so
    # the caller's frame is left untouched.
    return df[list(index_columns)].assign(output_field=output_field)
    
    
    

In [21]:
# Run the helper on the code table via .pipe: the columns in index_columns are
# kept and the columns in column_list are merged into a single lower-cased
# `output_field`.
cleaned_df = codes.pipe(concatenate_and_remove, column_list, index_columns)

In [22]:
# Spot-check the concatenated text field on the first rows.
cleaned_df.head()

Unnamed: 0,Segment,Segment Name,output_field
0,10000000,Live Plant and Animal Material and Accessories...,live animals livestock cats
1,10000000,Live Plant and Animal Material and Accessories...,live animals livestock dogs
2,10000000,Live Plant and Animal Material and Accessories...,live animals livestock mink
3,10000000,Live Plant and Animal Material and Accessories...,live animals livestock rats
4,10000000,Live Plant and Animal Material and Accessories...,live animals livestock horses


In [6]:
# Load one of the Australian contract files extracted from the archive.
au_1415 = pd.read_csv('data/australia/20142015fy.csv')

In [8]:
# Inspect which columns the contract file provides and how they were parsed.
au_1415.dtypes

Agency Name                         object
Parent Contract ID                  object
Contract ID                         object
Publish Date                        object
AmendmentDate                       object
Start Date                          object
End Date                            object
Value                              float64
Description                         object
Agency Ref ID                       object
UNSPSC Code                          int64
Title                               object
Procurement Method                  object
TenderNumber                        object
SON ID                              object
Confidentiality Contract Flag       object
Confidentiality Contract Reason     object
Confidentiality Outputs Flag        object
Confidentiality Outputs Reason      object
Consultancy Flag                    object
Consultancy Reason                  object
Amendment Reason                    object
Supplier Name                       object
SupplierAdd

In [9]:
# Look at the UNSPSC Code column, which was parsed as int64.
au_1415['UNSPSC Code']

0        82151500
1        82130000
2        81112000
3        25200000
4        80100000
           ...   
69231    80111600
69232    43230000
69233    81112200
69234    43210000
69235    84111600
Name: UNSPSC Code, Length: 69236, dtype: int64