# Fastai TabularPandas

In [358]:
from fastai.tabular.all import *

In [368]:
# Fetch fastai's ADULT_SAMPLE dataset; `path` points at the local folder
# whose contents are listed in the next cell.
path = untar_data(URLs.ADULT_SAMPLE)

In [369]:
# List the dataset folder; adult.csv is the file loaded below.
path.ls()

(#3) [Path('/root/.fastai/data/adult_sample/export.pkl'),Path('/root/.fastai/data/adult_sample/models'),Path('/root/.fastai/data/adult_sample/adult.csv')]

In [372]:
# Save a local copy of the dataset in the working directory.
# Read straight from `path` so this cell does not depend on `df`, which is
# only assigned in the following cell; on a fresh kernel (Restart & Run All)
# referencing `df` here would raise a NameError.
pd.read_csv(path/'adult.csv').to_csv('adult.csv', index=False)

In [370]:
# Load the raw table and eyeball a few random rows.
df = pd.read_csv(path/'adult.csv')
df.sample(4)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
19134,57,Self-emp-not-inc,79539,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,<50k
11768,36,Private,76845,HS-grad,9.0,Never-married,Other-service,Own-child,Black,Male,0,0,35,United-States,<50k
28697,19,Private,282698,7th-8th,4.0,Never-married,Adm-clerical,Own-child,White,Male,0,0,80,United-States,<50k
12488,27,Private,241607,Bachelors,13.0,Never-married,Tech-support,Other-relative,White,Male,0,0,50,United-States,<50k


## Preprocess
### Categorify is going to take every categorical variable and make a map from integer to unique categories, then replace the values by the corresponding index.
### FillMissing will fill the missing values in the continuous variables by the median of existing values (you can choose a specific value if you prefer)
### Normalize will normalize the continuous variables (subtract the mean and divide by the std)

In [66]:
# Column groups handed to TabularPandas below: the categorical inputs ...
cat_names = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship', 'race',
]
# ... and the continuous inputs.
cont_names = [
    'age', 'fnlwgt', 'education-num',
]

In [67]:
# Check the column dtypes: every name in cat_names is an object column,
# every name in cont_names is numeric.
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
salary             object
dtype: object

In [68]:
# Preprocessing steps for the first demo: category encoding plus
# missing-value filling (Normalize is demonstrated separately further down).
proc = [Categorify(), FillMissing()]

In [69]:
# Apply the procs to df; no target column or train/valid splits are given yet.
to = TabularPandas(df, proc, cat_names, cont_names)

In [75]:
# Inspect the category vocabulary built for one column; note the '#na#'
# placeholder in first position of the output.
cats = to.procs.categorify
cats['race']

['#na#', ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White']

In [72]:
# Decoded, human-readable preview. Note the extra education-num_na column,
# which flags rows whose education-num was missing.
to.show(max_n=4)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
0,Private,Assoc-acdm,Married-civ-spouse,#na#,Wife,White,False,49,101320,12.0
1,Private,Masters,Divorced,Exec-managerial,Not-in-family,White,False,44,236746,14.0
2,Private,HS-grad,Divorced,#na#,Unmarried,Black,True,38,96185,10.0
3,Self-emp-inc,Prof-school,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,False,38,112847,15.0


In [83]:
# Continuous columns after processing (the missing education-num in row 2
# has already been filled).
to.conts.head(3)

Unnamed: 0,age,fnlwgt,education-num
0,49,101320,12.0
1,44,236746,14.0
2,38,96185,10.0


In [85]:
to.cats.head()
# The categorical columns are stored as integer codes rather than strings.

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na
0,5,8,3,0,6,5,1
1,5,13,1,5,2,5,1
2,5,12,1,0,5,3,2
3,6,15,3,11,1,2,1
4,7,6,3,9,6,3,2


In [87]:
# show() displays the labels again instead of the integer codes seen above.
to.show(3)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
0,Private,Assoc-acdm,Married-civ-spouse,#na#,Wife,White,False,49,101320,12.0
1,Private,Masters,Divorced,Exec-managerial,Not-in-family,White,False,44,236746,14.0
2,Private,HS-grad,Divorced,#na#,Unmarried,Black,True,38,96185,10.0


# Normalize

In [89]:
# Normalize proc, demonstrated on its own in this section.
norm = Normalize()

In [90]:
# Reminder of the continuous columns that will be normalized.
cont_names

['age', 'fnlwgt', 'education-num']

In [91]:
# Only Normalize is applied here (no FillMissing), so missing education-num
# values remain missing in the output below.
to1 = TabularPandas(df, norm, cont_names = cont_names)

In [97]:
# Normalized values; the blanks are rows where education-num is missing.
to1.conts.head()

Unnamed: 0,age,fnlwgt,education-num
0,0.763796,-0.838084,0.746294
1,0.397233,0.444987,1.523609
2,-0.042642,-0.886734,
3,-0.042642,-0.728873,1.912267
4,0.250608,-1.018314,


In [99]:
# Per-column means stored by the Normalize proc.
to1.normalize.means

{'age': 38.58164675532078,
 'fnlwgt': 189778.36651208502,
 'education-num': 10.079815864562988}

In [100]:
# Per-column standard deviations stored by the Normalize proc.
to1.normalize.stds

{'age': 13.640223192304274,
 'fnlwgt': 105548.3568809908,
 'education-num': 2.572959046228027}

# FillMissing

In [108]:
# Fill missing continuous values with the column mean instead of the
# default median.
# NOTE(review): np.mean is passed as a custom strategy callable — confirm it
# matches the call signature fastai expects for fill_strategy.
fm = FillMissing(fill_strategy=np.mean)

In [109]:
# Apply mean-filling only. Rows 2 and 4 now show 10.079816, which matches
# the education-num mean reported by Normalize in the previous section.
to2 = TabularPandas(df, fm, cont_names=cont_names)
to2.conts.head()

Unnamed: 0,age,fnlwgt,education-num
0,49,101320,12.0
1,44,236746,14.0
2,38,96185,10.079816
3,38,112847,15.0
4,42,82297,10.079816


## The DataLoaders

In [119]:
# Randomly partition the row indices into (train, valid) index lists.
# A fixed seed makes the split reproducible across kernel restarts, so the
# model is always trained and validated on the same rows.
splits = RandomSplitter(seed=42)(range_of(df))
splits

((#26049) [4698,10812,32444,3933,9501,22889,4432,11082,22393,30368...],
 (#6512) [14491,7143,27550,29980,29655,29235,470,22314,21625,20483...])

In [120]:
# range_of(df) is the list of row positions (starting at 0) handed to the
# splitter; len(df) is the total row count.
range_of(df)[:4], len(df)

([0, 1, 2, 3], 32561)

In [121]:
# Reload a fresh copy of the raw data before building the final pipeline.
df = pd.read_csv(path/'adult.csv')

In [134]:
# Full preprocessing pipeline: encode categories, mean-fill missing
# continuous values, then normalize. Categorify and Normalize are passed as
# classes here, FillMissing as a configured instance.
preprocess = [Categorify, FillMissing(fill_strategy=np.mean), Normalize]

In [138]:
# Assemble the final tabular dataset: preprocessing pipeline, input column
# groups, the 'salary' target treated as a category, and the train/valid
# index splits computed earlier.
to = TabularPandas(
    df,
    procs=preprocess,
    cat_names=cat_names, cont_names=cont_names,
    y_names='salary', y_block=CategoryBlock(),
    splits=splits,
)

## Straight from `to.dataloaders()`

In [142]:
# Simplest route: let the TabularPandas object build the DataLoaders itself,
# then preview one decoded batch.
dls = to.dataloaders()
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,5th-6th,Never-married,Farming-fishing,Unmarried,White,False,27.0,491420.98834,3.0,<50k
1,Private,1st-4th,Never-married,Machine-op-inspct,Not-in-family,White,False,44.0,367749.004957,2.0,<50k
2,Private,HS-grad,Divorced,Machine-op-inspct,Not-in-family,White,False,52.999999,197491.999731,9.0,<50k
3,?,HS-grad,Widowed,?,Not-in-family,Black,False,64.999999,143731.999616,9.0,<50k
4,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,45.0,126141.000921,13.0,>=50k
5,Private,Some-college,Never-married,Sales,Other-relative,White,False,20.999999,265355.999511,10.0,<50k
6,Self-emp-not-inc,10th,Divorced,Farming-fishing,Unmarried,White,False,49.0,59612.001463,6.0,<50k
7,Private,HS-grad,Never-married,Machine-op-inspct,Own-child,White,False,22.999999,60330.998565,9.0,<50k
8,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,False,60.999999,231182.999545,9.0,<50k
9,Local-gov,Bachelors,Never-married,Prof-specialty,Not-in-family,White,False,32.0,250585.002877,13.0,<50k


In [143]:
# Alternative: build the train and valid DataLoaders by hand

In [167]:
# Hand-built loaders for the train and valid subsets.
# The training loader must shuffle explicitly: unlike `to.dataloaders()`,
# a bare TabDataLoader iterates in fixed order unless shuffle=True is passed.
# drop_last=True discards a final incomplete batch so every training batch
# has exactly `bs` rows.
train_dl = TabDataLoader(to.train, bs=64, shuffle=True, drop_last=True)
# Validation needs no shuffling and can use a larger batch size.
valid_dl = TabDataLoader(to.valid, bs=128)

In [168]:
# Wrap the two hand-built loaders in a DataLoaders object and preview a
# decoded batch.
dls = DataLoaders(train_dl, valid_dl)
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,Some-college,Never-married,Sales,Own-child,Black,False,22.999999,145650.9986,10.0,<50k
1,Self-emp-not-inc,9th,Divorced,Craft-repair,Not-in-family,White,False,38.0,357961.995262,5.0,<50k
2,Private,Preschool,Never-married,Machine-op-inspct,Not-in-family,White,False,58.999999,157304.999277,1.0,<50k
3,Private,Some-college,Married-civ-spouse,Transport-moving,Husband,White,False,24.0,233498.998613,10.0,>=50k
4,Self-emp-inc,Bachelors,Married-civ-spouse,Farming-fishing,Husband,White,False,62.0,56247.998968,13.0,>=50k
5,Private,10th,Divorced,Other-service,Not-in-family,White,False,39.0,192251.00001,6.0,<50k
6,Federal-gov,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,False,34.0,284703.001643,10.0,<50k
7,Local-gov,Some-college,Divorced,Adm-clerical,Own-child,White,False,35.0,226311.001032,10.0,<50k
8,Private,Some-college,Never-married,Other-service,Own-child,White,False,18.0,148951.999147,10.0,<50k
9,Private,Some-college,Never-married,Adm-clerical,Own-child,White,False,34.0,185215.999993,10.0,<50k


In [169]:
# Private attribute, inspected only out of curiosity: it reports the
# DataLoaders class associated with these tabular loaders.
dls._dbunch_type

fastai.tabular.data.TabularDataLoaders

In [170]:
# Pull a single batch for inspection.
one_batch = dls.one_batch()

In [173]:
# First element of the batch — presumably the categorical tensor: 64 rows
# (the training bs) by 7 columns (six cat_names plus education-num_na).
one_batch[0].shape

torch.Size([64, 7])

# TabularLearner