In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import sagemaker

# Read dataset

In [2]:
%%sh
# Fetch the bank dataset archive from the UCI repository and unpack it into ./data.
# set -e: stop at the first failing command so unzip never runs on a missing/partial download.
set -e
# -N only re-downloads when the remote file is newer; -nv drops the progress-dot noise.
wget -nv -N -P data http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
# -o overwrites previously extracted files without prompting, so the cell can be re-run.
unzip -o data/bank.zip -d data

Archive:  data/bank.zip
  inflating: data/bank-full.csv      
  inflating: data/bank-names.txt     
  inflating: data/bank.csv           


--2020-04-15 13:39:56--  http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 579043 (565K) [application/x-httpd-php]
Saving to: ‘data/bank.zip’

     0K .......... .......... .......... .......... ..........  8%  288K 2s
    50K .......... .......... .......... .......... .......... 17%  798K 1s
   100K .......... .......... .......... .......... .......... 26%  144M 1s
   150K .......... .......... .......... .......... .......... 35%  297M 0s
   200K .......... .......... .......... .......... .......... 44%  835K 0s
   250K .......... .......... .......... .......... .......... 53% 59.6M 0s
   300K .......... .......... .......... .......... .......... 61%  867K 0s
   350K .......... .......... .......... .......... .......... 70% 72.5M 0s
   400K .....

In [3]:
# Directory prefix for every data file in this notebook (trailing slash included,
# because paths are built by plain string concatenation).
DATA_PATH = 'data/'

# The source CSV is semicolon-separated.
raw_csv_path = DATA_PATH + 'bank-full.csv'
df = pd.read_csv(raw_csv_path, sep=';')

# Preview the first rows.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Print a per-column overview of the frame (dtype and non-null count).
# NOTE(review): the output saved under this cell is a summary-statistics table,
# not info() text; the saved outputs of this cell and the next look swapped.
# Re-run the notebook to refresh them.
df.info()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the output saved under this cell is info() text (dtypes and
# non-null counts), not a describe() table; the saved outputs of this cell and
# the previous one look swapped. Re-run the notebook to refresh them.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [6]:
# Ordinal-encode only the non-numeric columns. Fitting the encoder on the whole
# frame also replaced numeric columns (age, balance, duration, ...) with the rank
# of each value among that column's unique values, discarding the real magnitudes.
categorical_cols = df.select_dtypes(exclude='number').columns

# NOTE: `enc` is now fitted on `categorical_cols` only; pass df[categorical_cols]
# (not the full frame) to any later enc.transform / enc.inverse_transform call.
enc = OrdinalEncoder()
encoded_df = df.copy()  # keep the raw frame untouched so this cell can be re-run
encoded_df[categorical_cols] = enc.fit_transform(df[categorical_cols])

# Same layout as before: a 2-D float array with columns in their original order.
data = encoded_df.to_numpy(dtype=float)

# Split dataset

In [7]:
# Hold out 20% of the rows for testing. The seed is fixed so that
# Restart Kernel -> Run All produces the same train/test split every time.
RANDOM_SEED = 42
train, test = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

In [8]:
# Report the row count of the full dataset and of each split.
split_sizes = [
    ('all:  ', len(data)),
    ('train:', len(train)),
    ('test: ', len(test)),
]
for label, n_rows in split_sizes:
    print(label, n_rows)

all:   45211
train: 36168
test:  9043


In [9]:
# Output locations for the two splits (train_file is reused by the upload step).
train_file = DATA_PATH + 'bank-train.csv'
test_file = DATA_PATH + 'bank-test.csv'

# Write each split as a comma-separated CSV with a header row and no index column.
for records, out_path in ((train, train_file), (test, test_file)):
    split_frame = pd.DataFrame.from_records(records)
    split_frame.to_csv(out_path, index=False, header=True, sep=',')

# Upload to S3

In [10]:
# Create a SageMaker session and upload the training CSV under the 'bank' key prefix.
# No bucket is passed, so the session chooses the destination bucket.
# NOTE(review): only train_file is uploaded in this cell; test_file is written
# locally but not uploaded here. Confirm whether that is intended.
session = sagemaker.Session()
uri = session.upload_data(path=train_file, key_prefix='bank')
# NOTE(review): the printed URI embeds the bucket name, which appears to include
# the AWS account ID. Consider clearing this output before sharing the notebook.
print(uri)

s3://sagemaker-us-east-2-384671335610/bank/bank-train.csv
