In [0]:
  # Colab-only setup: fetches the fastai course setup script and pipes it straight into bash.
  # NOTE(review): the remote script is executed unreviewed and unpinned — confirm this is acceptable.
  !curl -s https://course.fast.ai/setup/colab | bash

Updating fastai...
Done.


In [0]:
# (Re)load the autoreload extension and reload all modules before each cell executes,
# so edits to imported source files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [0]:
from fastai.tabular import *

In [0]:
from fastai import *

In [0]:
import sys
import numpy
# Raise the print threshold to the largest possible value so NumPy prints arrays
# in full instead of summarising long ones with "...".
numpy.set_printoptions(threshold=sys.maxsize)

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
import warnings
# Silence UserWarning messages that originate from torch.nn.functional.
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.functional")

In [0]:
# Fetch the Adult sample dataset; the value returned (displayed below) is its local directory.
# NOTE(review): presumably downloads and extracts only when the data is not already present — confirm.
untar_data(URLs.ADULT_SAMPLE)

PosixPath('/root/.fastai/data/adult_sample')

In [0]:
# Root directory fastai uses for datasets (the dataset above was placed under /root/.fastai/data here).
path = Config().data_path()

In [0]:
# Directory of the Adult sample dataset.
# Built from the data root rather than from the current value of `path`, so that
# re-running this cell does not keep appending 'adult_sample' to the path.
path = Config().data_path()/'adult_sample'

In [0]:
path.ls()

[PosixPath('/root/.fastai/data/adult_sample/adult.csv'),
 PosixPath('/root/.fastai/data/adult_sample/export.pkl'),
 PosixPath('/root/.fastai/data/adult_sample/models')]

In [0]:
# Load adult.csv (listed in the directory above) into a pandas DataFrame.
adult_df = pd.read_csv(path/'adult.csv')

In [0]:
# Tip: `del <dataframe>` deletes any dataframe that is no longer needed.

In [0]:
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [0]:
# The data looks like a table in a dataframe. This is a very simple use case.
# But in essence a dataframe of "anything" is still a table with rows and columns.

In [0]:
adult_df.iloc[0:6]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k
5,20,Private,63210,HS-grad,9.0,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,15,United-States,<50k


Below are the columns in the dataframe. We will use some of them as input columns: some as categorical variables (those with a countable set of discrete values) and some as continuous variables (numeric values that are not restricted to a discrete set and can fall anywhere in a range).

We are going to predict Salary here and hence that is the output or y variable or dependent variable. 

age	
workclass	
fnlwgt	
education	
education-num	
marital-status	
occupation	
relationship	
race	
sex	
capital-gain	
capital-loss	
hours-per-week	
native-country	
salary

In [0]:
adult_df.shape

(32561, 15)

This says that there are 32561 rows and 15 columns

In [0]:
# Column the model will predict (the dependent variable).
dep_var = 'salary'

# Input columns treated as categorical variables.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]

# Input columns treated as continuous variables.
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]

# Preprocessors to run on the data, in this order.
procs = [
    FillMissing,
    Categorify,
    Normalize,
]

procs are preprocessors: transformations that are applied once to the data, before it is loaded into the downstream data objects.

Here we are using FillMissing, Categorify and Normalize. There are many other preprocessors. 

In [0]:
# Wrap the dataframe in a fastai TabularList, recording which columns are
# categorical, which are continuous, and which preprocessors to apply.
data = TabularList.from_df(
    adult_df,
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
    procs=procs,
)

Below is the definition of TabularList class. It is a subclass of ItemList class. Both are fastai classes

TabularList(items:Iterator[T_co], cat_names:OptStrList=None, cont_names:OptStrList=None, procs=None, **kwargs) → TabularList :: ItemList

In [0]:
type(data)


fastai.tabular.data.TabularList

Fastai TabularList contains multiple types of objects. Below are the details

In [0]:
type(data.get(1))

pandas.core.series.Series

In [0]:
data.get(1)

age                             44
workclass                  Private
fnlwgt                      236746
education                  Masters
education-num                   14
marital-status            Divorced
occupation         Exec-managerial
relationship         Not-in-family
race                         White
sex                           Male
capital-gain                 10520
capital-loss                     0
hours-per-week                  45
native-country       United-States
salary                       >=50k
Name: 1, dtype: object

A pandas Series is a one-dimensional labeled array capable of holding data of any type; in this case it is a Series of objects.

`dtype: object` is the pandas data type used when a Series holds a mix of Python types rather than a single numeric type.
Here, age is a number, workclass is a string, and so on.

In [0]:
type(data.get(1).workclass)

str

In [0]:
type(data.get(1).age)

numpy.int64

In [0]:
type(data.items)

numpy.ndarray

In [0]:
data.items[1]

1

In [0]:
data.items.shape

(32561,)

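`items` is a NumPy array. NumPy arrays can be multidimensional, but in this case it is a one-dimensional array of row indices (for example, `data.items[1]` is 1, and `data.get(1)` returns that row of the dataframe).

The TabularList object also has two more lists: one with the names of the continuous variables and one with the names of the categorical variables.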

In [0]:
data.get(data.items[1])

age                             44
workclass                  Private
fnlwgt                      236746
education                  Masters
education-num                   14
marital-status            Divorced
occupation         Exec-managerial
relationship         Not-in-family
race                         White
sex                           Male
capital-gain                 10520
capital-loss                     0
hours-per-week                  45
native-country       United-States
salary                       >=50k
Name: 1, dtype: object

In [0]:
type(data.cat_names)

list

In [0]:
data.cat_names

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race']

In [0]:
type(data.cont_names)

list

In [0]:
data.cont_names

['age', 'fnlwgt', 'education-num']

In [0]:
# Hold out rows 800-999 (200 rows) as the validation set; all remaining rows form the training set.
data2 = data.split_by_idx(list(range(800,1000)))

In [0]:
type(data2)

fastai.data_block.ItemLists

In [0]:
type(data2.train)

fastai.tabular.data.TabularList

In [0]:
type(data2.valid)

fastai.tabular.data.TabularList

In [0]:
data2.valid.cat_names

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race']

In [0]:
data2.valid.cont_names

['age', 'fnlwgt', 'education-num']

In [0]:
data2.train.cat_names

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race']

In [0]:
data2.train.cont_names

['age', 'fnlwgt', 'education-num']

In [0]:
# NOTE(review): the recorded output for this cell (TabularLine) disagrees with the next cell,
# which displays a pandas Series for the same call. It looks like a stale output from
# out-of-order execution — re-run on a fresh kernel to confirm.
type(data2.train.get(1))

fastai.tabular.data.TabularLine

In [0]:
data2.train.get(1)

age                             44
workclass                  Private
fnlwgt                      236746
education                  Masters
education-num                   14
marital-status            Divorced
occupation         Exec-managerial
relationship         Not-in-family
race                         White
sex                           Male
capital-gain                 10520
capital-loss                     0
hours-per-week                  45
native-country       United-States
salary                       >=50k
Name: 1, dtype: object

In [0]:
data2.valid.get(1)

age                                46
workclass                Self-emp-inc
fnlwgt                         192779
education                 Prof-school
education-num                     NaN
marital-status     Married-civ-spouse
occupation             Prof-specialty
relationship                  Husband
race                            White
sex                              Male
capital-gain                    15024
capital-loss                        0
hours-per-week                     60
native-country          United-States
salary                          >=50k
Name: 801, dtype: object

In [0]:
# Attach labels to the train/valid split, taken from the dependent-variable column.
data3 = data2.label_from_df(cols=dep_var)

In [0]:
type(data3)

fastai.data_block.LabelLists

In [0]:
data3.lists

[LabelList (32361 items)
 x: TabularList
 workclass  Private; education  Assoc-acdm; marital-status  Married-civ-spouse; occupation #na#; relationship  Wife; race  White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; ,workclass  Private; education  Masters; marital-status  Divorced; occupation  Exec-managerial; relationship  Not-in-family; race  White; education-num_na False; age 0.3968; fnlwgt 0.4458; education-num 1.5334; ,workclass  Private; education  HS-grad; marital-status  Divorced; occupation #na#; relationship  Unmarried; race  Black; education-num_na True; age -0.0430; fnlwgt -0.8868; education-num -0.0312; ,workclass  Self-emp-inc; education  Prof-school; marital-status  Married-civ-spouse; occupation  Prof-specialty; relationship  Husband; race  Asian-Pac-Islander; education-num_na False; age -0.0430; fnlwgt -0.7288; education-num 1.9245; ,workclass  Self-emp-not-inc; education  7th-8th; marital-status  Married-civ-spouse; occupation  Other-servi

In [0]:
type(data3.train)

fastai.data_block.LabelList

In [0]:
type(data3.train.x)

fastai.tabular.data.TabularList

In [0]:
type(data3.train.y)

fastai.data_block.CategoryList

In [0]:
data3.train.x[0]

TabularLine workclass  Private; education  Assoc-acdm; marital-status  Married-civ-spouse; occupation #na#; relationship  Wife; race  White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; 

In [0]:
data3.train.y

CategoryList (32361 items)
>=50k,>=50k,<50k,>=50k,<50k
Path: /root/.fastai/data/adult_sample

In [0]:
data3.train.x[0]

TabularLine workclass  Private; education  Assoc-acdm; marital-status  Married-civ-spouse; occupation #na#; relationship  Wife; race  White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; 

In [0]:
# Convert the labelled train/valid lists into a DataBunch (a TabularDataBunch, per the type check that follows).
final_data = data3.databunch()

In [0]:
type(final_data)

fastai.tabular.data.TabularDataBunch

In [0]:
type(final_data.train_dl.x)

fastai.tabular.data.TabularList

In [0]:
type(final_data.train_dl.y)

fastai.data_block.CategoryList

In [0]:
final_data.train_dl.x

TabularList (32361 items)
workclass  Private; education  Assoc-acdm; marital-status  Married-civ-spouse; occupation #na#; relationship  Wife; race  White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; ,workclass  Private; education  Masters; marital-status  Divorced; occupation  Exec-managerial; relationship  Not-in-family; race  White; education-num_na False; age 0.3968; fnlwgt 0.4458; education-num 1.5334; ,workclass  Private; education  HS-grad; marital-status  Divorced; occupation #na#; relationship  Unmarried; race  Black; education-num_na True; age -0.0430; fnlwgt -0.8868; education-num -0.0312; ,workclass  Self-emp-inc; education  Prof-school; marital-status  Married-civ-spouse; occupation  Prof-specialty; relationship  Husband; race  Asian-Pac-Islander; education-num_na False; age -0.0430; fnlwgt -0.7288; education-num 1.9245; ,workclass  Self-emp-not-inc; education  7th-8th; marital-status  Married-civ-spouse; occupation  Other-service; relationship

In [0]:
final_data.train_dl.y

CategoryList (32361 items)
>=50k,>=50k,<50k,>=50k,<50k
Path: /root/.fastai/data/adult_sample

The below command shows one batch from the DataBunch as three tensors. Tensors are nothing but multi-dimensional arrays. The first two tensors hold the input features. We have a total of 10 input features.
Of those, 7 are categorical (the 6 columns in cat_names plus education-num_na, which FillMissing added) and 3 are continuous in this example. Therefore you see two tensors of those widths.
The third tensor holds the labels (the output, salary).

In [0]:
# Pull the first mini-batch from the training DataLoader to inspect the tensors it yields.
next(iter(final_data.train_dl))

[[tensor([[ 5, 12,  3,  4,  1,  5,  1],
          [ 5, 16,  5, 11,  2,  4,  1],
          [ 5, 16,  3,  8,  1,  5,  1],
          [ 5,  2,  3, 13,  1,  5,  1],
          [ 5, 10,  1,  0,  2,  5,  2],
          [ 5, 16,  5,  2,  2,  5,  1],
          [ 5, 10,  4, 11,  2,  2,  1],
          [ 5,  1,  5,  6,  4,  5,  1],
          [ 7, 12,  3,  5,  1,  5,  1],
          [ 5, 12,  5,  8,  2,  5,  1],
          [ 5,  8,  5,  2,  2,  5,  1],
          [ 5, 16,  5,  5,  4,  5,  1],
          [ 5, 10,  5,  2,  2,  5,  1],
          [ 7, 11,  3, 11,  1,  5,  1],
          [ 7, 12,  3,  6,  1,  5,  1],
          [ 5,  2,  3,  4,  1,  5,  1],
          [ 1, 16,  5,  1,  4,  5,  1],
          [ 5, 12,  3,  4,  1,  5,  1],
          [ 1, 12,  3,  1,  1,  5,  1],
          [ 7, 10,  3, 11,  6,  5,  1],
          [ 5, 16,  3, 13,  1,  5,  1],
          [ 5, 10,  3, 14,  1,  5,  1],
          [ 5, 12,  5,  8,  4,  5,  1],
          [ 5, 12,  3,  5,  1,  5,  1],
          [ 5,  9,  3,  8,  1,  5,  1],


In [0]:
# Persist the DataBunch to 'databunch.pkl'.
# NOTE(review): presumably written relative to the DataBunch's own path — confirm the location.
final_data.save('databunch.pkl')

In [0]:
# Display the whole training dataset (a LabelList). The recorded output below is the
# full-dataset repr; indexing a single item with [0] is done in the following cell.
final_data.train_ds

LabelList (32361 items)
x: TabularList
workclass  Private; education  Assoc-acdm; marital-status  Married-civ-spouse; occupation #na#; relationship  Wife; race  White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; ,workclass  Private; education  Masters; marital-status  Divorced; occupation  Exec-managerial; relationship  Not-in-family; race  White; education-num_na False; age 0.3968; fnlwgt 0.4458; education-num 1.5334; ,workclass  Private; education  HS-grad; marital-status  Divorced; occupation #na#; relationship  Unmarried; race  Black; education-num_na True; age -0.0430; fnlwgt -0.8868; education-num -0.0312; ,workclass  Self-emp-inc; education  Prof-school; marital-status  Married-civ-spouse; occupation  Prof-specialty; relationship  Husband; race  Asian-Pac-Islander; education-num_na False; age -0.0430; fnlwgt -0.7288; education-num 1.9245; ,workclass  Self-emp-not-inc; education  7th-8th; marital-status  Married-civ-spouse; occupation  Other-service;

In [0]:
final_data.train_ds[0]

(TabularLine workclass  Private; education  Assoc-acdm; marital-status  Married-civ-spouse; occupation #na#; relationship  Wife; race  White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; ,
 Category >=50k)

In [0]:
final_data.train_dl

DeviceDataLoader(dl=<torch.utils.data.dataloader.DataLoader object at 0x7fe93af0d748>, device=device(type='cuda'), tfms=[], collate_fn=<function data_collate at 0x7fe93ddc2a60>)

In [0]:
final_data.train_ds[0:10]

LabelList (10 items)
x: TabularList
age                            0.763248
workclass                       Private
fnlwgt                        -0.838107
education                    Assoc-acdm
education-num                  0.751083
marital-status       Married-civ-spouse
occupation                          NaN
relationship                       Wife
race                              White
sex                              Female
capital-gain                          0
capital-loss                       1902
hours-per-week                       40
native-country            United-States
salary                            >=50k
education-num_na                  False
Name: 0, dtype: object,age                         0.396752
workclass                    Private
fnlwgt                      0.445849
education                    Masters
education-num                1.53338
marital-status              Divorced
occupation           Exec-managerial
relationship           Not-in-family
race 

Below is the data that can be sent to a Learner. In fact, it is the data in the tensors that is sent to the Learner, because the model's equations operate on numbers, not on raw values such as category strings.

In [0]:
# Show 10 rows of a batch in human-readable (decoded) form.
final_data.show_batch(rows=10)

workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,target
?,HS-grad,Married-civ-spouse,?,Husband,Black,False,2.0093,1.1816,-0.4224,<50k
Federal-gov,HS-grad,Married-civ-spouse,Adm-clerical,Husband,White,False,0.7632,-0.2037,-0.4224,>=50k
Private,HS-grad,Never-married,Adm-clerical,Not-in-family,Black,False,-1.3624,-0.927,-0.4224,<50k
Self-emp-not-inc,Assoc-acdm,Married-civ-spouse,Sales,Husband,White,False,-0.7027,-0.1016,0.7511,<50k
?,Some-college,Never-married,?,Own-child,White,False,-1.2158,0.0514,-0.0312,<50k
Private,Bachelors,Married-civ-spouse,Tech-support,Husband,White,False,0.1036,0.5514,1.1422,>=50k
Private,Assoc-voc,Never-married,Prof-specialty,Unmarried,White,False,-0.4095,-0.1966,0.3599,<50k
Private,HS-grad,Married-civ-spouse,Craft-repair,Wife,Black,False,0.0303,2.4129,-0.4224,<50k
Private,Bachelors,Never-married,Sales,Not-in-family,White,False,0.1036,-1.467,1.1422,<50k
Self-emp-inc,Some-college,Divorced,Sales,Not-in-family,White,False,1.0564,0.5874,-0.0312,>=50k


In [0]:
# NOTE(review): no parentheses, so the method is not called. The output below is the repr of the
# bound method (which embeds the DataBunch repr). Use final_data.sanity_check() to actually run
# the check, or evaluate final_data on its own to display the DataBunch.
final_data.sanity_check

<bound method DataBunch.sanity_check of TabularDataBunch;

Train: LabelList (32361 items)
x: TabularList
workclass  Private; education  Assoc-acdm; marital-status  Married-civ-spouse; occupation #na#; relationship  Wife; race  White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; ,workclass  Private; education  Masters; marital-status  Divorced; occupation  Exec-managerial; relationship  Not-in-family; race  White; education-num_na False; age 0.3968; fnlwgt 0.4458; education-num 1.5334; ,workclass  Private; education  HS-grad; marital-status  Divorced; occupation #na#; relationship  Unmarried; race  Black; education-num_na True; age -0.0430; fnlwgt -0.8868; education-num -0.0312; ,workclass  Self-emp-inc; education  Prof-school; marital-status  Married-civ-spouse; occupation  Prof-specialty; relationship  Husband; race  Asian-Pac-Islander; education-num_na False; age -0.0430; fnlwgt -0.7288; education-num 1.9245; ,workclass  Self-emp-not-inc; education  7th-8

This data is now in a format that can be sent to a Learner Module. 