In [1]:
import pandas as pd
from sklearn.datasets import load_boston

In [2]:
# Download the data file
# A bare `import urllib` does not guarantee that the `urllib.request`
# submodule is loaded, so import the submodule explicitly.
import os
import urllib.request

DATURL = "https://archive.ics.uci.edu/ml/machine-learning-databases/audiology/audiology.standardized.data"
# urlretrieve does not create missing directories; make sure data/ exists first.
os.makedirs("data", exist_ok=True)
# Returns a (local_filename, headers) tuple, which becomes the cell output.
urllib.request.urlretrieve(DATURL, "data/audiology.csv")

('audiology.csv', <http.client.HTTPMessage at 0x1a10a8f828>)

In [3]:
# Invent 71 placeholder column names (attr0 ... attr70) and load the CSV with them
attrs = ['attr%d' % idx for idx in range(71)]
df = pd.read_csv('data/audiology.csv', names=attrs)

In [4]:
# Drop every row whose attr63 value is one of the placeholders '?' or 'unmeasured'
df = df[~df['attr63'].isin(['?', 'unmeasured'])]

In [5]:
# Report the dtype of attr63 before any conversion
print("1. attr63 is currently type", df['attr63'].dtype)

1. attr63 is currently type object


In [6]:
# Re-type the column as a pandas categorical
df['attr63'] = df['attr63'].astype('category')
# Report the dtype again, plus whether the categorical is flagged as ordered
print("2. attr63 is currently type", df['attr63'].dtype)
print("   ordered=", df['attr63'].dtype.ordered)

2. attr63 is currently type category
   ordered= False


In [7]:
# List the category labels of attr63
df['attr63'].dtype.categories

Index(['good', 'normal', 'poor', 'very_good', 'very_poor'], dtype='object')

In [8]:
# The column is categorical now, but it is still nominal (unordered).
# These labels have a definite ranking, so pandas must be told two things:
# that the categorical is ordinal, and what the correct order of the labels is.
df['attr63'] = df['attr63'].cat.set_categories(
    ['very_poor', 'poor', 'normal', 'good', 'very_good'], ordered=True)
# Report the dtype and the ordered flag once more
print("3. attr63 is currently type", df['attr63'].dtype)
print("   ordered=", df['attr63'].dtype.ordered)

3. attr63 is currently type category
   ordered= True
