## Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show the installed pandas version so readers know which release produced the outputs below
pd.__version__

'1.3.5'

## Concepts

- Dataframes have two dimensions: rows (labeled by the Index) and Columns
- Each Column in a Dataframe is a Series
- Series must be homogeneous (a single dtype), but Dataframes can be heterogeneous (each column can have its own dtype)

## Creating Dataframes

Dataframes can be created by:
- Loading a Dictionary of Lists. **Key**: column name. **Value**: the list (all must be equal length) - Column-wise creation
- Loading a Dictionary of Tuples. **Key**: column name. **Value**: the tuple (all must be equal length) - Column-wise creation
- Loading a List of Dictionaries. Row-wise creation

In [3]:
# Sample data: three parallel lists of equal length, where position i in
# every list describes the same person.
names = [
    "John", "Jane", "Peter",
    "Mary", "Jack", "Daisy",
]
ages = [
    29, 32, 28,
    35, 33, 25,
]
married = [
    True, False, True,
    True, False, False,
]

In [4]:
# Series: a one-dimensional labeled array built from the `names` list.
# The `name` argument labels the Series (shown as "Name: names" in the output).
s = pd.Series(names, name='names')
s

0     John
1     Jane
2    Peter
3     Mary
4     Jack
5    Daisy
Name: names, dtype: object

In [5]:
# Dataframe: column-wise creation from a dictionary
# (key = column name, value = the list holding that column's values).
df = pd.DataFrame(
    dict(names=names, ages=ages, married=married)
)
df

Unnamed: 0,names,ages,married
0,John,29,True
1,Jane,32,False
2,Peter,28,True
3,Mary,35,True
4,Jack,33,False
5,Daisy,25,False


In [6]:
# Access a specific element of a Series by integer position
# (positions are 0-based, so 2 is the third element)
s.iloc[2]

'Peter'

In [7]:
# Access a specific element of a Dataframe by integer position: [row, column]
# (row 2, column 0 -> third row of the 'names' column)
df.iloc[2, 0]

'Peter'

In [8]:
# A Dataframe column is itself a Series: selecting 'names' from df gives
# an object equal to the standalone Series created earlier.
df['names'].equals(s)

True

In [9]:
# shape returns a (number of rows, number of columns) tuple
df.shape

(6, 3)

In [10]:
# dtypes returns the dtype of each column (each column is a Series) in the Dataframe
df.dtypes

names      object
ages        int64
married      bool
dtype: object

In [11]:
# info() prints a summary: index range, per-column non-null counts and dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   names    6 non-null      object
 1   ages     6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 230.0+ bytes


In [12]:
# Exact amount of memory used by the Dataframe:
# memory_usage='deep' also measures the contents of object columns, so the
# reported figure is no longer the "+" lower bound shown by plain info()
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   names    6 non-null      object
 1   ages     6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 550.0 bytes


## Reading Data into Dataframe

In [13]:
# Load the CSV with default settings: a new RangeIndex is created and the
# file's first column is read as a regular column named 'Unnamed: 0'
nutrition = pd.read_csv('../data/nutrition.csv')

In [14]:
# verbose=False prints only the summary (no per-column listing);
# memory_usage='deep' reports the real footprint including object columns
nutrition.info(verbose=False, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8789 entries, 0 to 8788
Columns: 77 entries, Unnamed: 0 to water
dtypes: int64(3), object(74)
memory usage: 39.2 MB


In [15]:
# Preview the first rows (head() shows 5 rows when called without arguments)
nutrition.head()

Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [16]:
# sample(n=5) returns 5 pseudo-randomly selected rows.
# No random_state is passed, so a different selection is drawn on each run.
nutrition.sample(n=5)

Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
402,402,"Cheese, limburger",100 g,327,27g,17g,90mg,800.00 mg,15.4 mg,58.00 mcg,...,27.25 g,16.746 g,8.606 g,0.495 g,90.00 mg,0.0 g,3.79 g,0.00 mg,0.00 mg,48.42 g
3124,3124,"Oil, bearded seal (Oogruk) (Alaska Native)",100 g,899,100g,11g,52mg,0.00 mg,0.1 mg,2.00 mcg,...,99.60 g,10.926 g,47.080 g,33.001 g,52.00 mg,0,0.00 g,0,0,0.02 g
7500,7500,"School Lunch, frozen, BIG DADDY'S LS 16"" 51% W...",100 g,248,9.3g,4g,0,450.00 mg,0,0,...,9.33 g,3.990 g,2.750 g,1.150 g,0,0,2.12 g,0,0,47.57 g
8501,8501,"Beef, grilled, cooked, select, trimmed to 0"" f...",100 g,149,3.4g,1.3g,81mg,65.00 mg,82.6 mg,5.00 mcg,...,3.43 g,1.313 g,1.476 g,0.273 g,81.00 mg,0.0 g,1.19 g,0.00 mg,0.00 mg,66.20 g
2969,2969,"Rice, cooked, enriched, medium-grain, white",100 g,130,0.2g,0.1g,0,0.00 mg,0,58.00 mcg,...,0.21 g,0.057 g,0.065 g,0.056 g,0.00 mg,0,0.21 g,0,0,68.61 g


In [17]:
# Checking the Axes from the Dataframe:
# axes is a list holding the row index first and the column index second

nutrition.axes

[RangeIndex(start=0, stop=8789, step=1),
 Index(['Unnamed: 0', 'name', 'serving_size', 'calories', 'total_fat',
        'saturated_fat', 'cholesterol', 'sodium', 'choline', 'folate',
        'folic_acid', 'niacin', 'pantothenic_acid', 'riboflavin', 'thiamin',
        'vitamin_a', 'vitamin_a_rae', 'carotene_alpha', 'carotene_beta',
        'cryptoxanthin_beta', 'lutein_zeaxanthin', 'lucopene', 'vitamin_b12',
        'vitamin_b6', 'vitamin_c', 'vitamin_d', 'vitamin_e', 'tocopherol_alpha',
        'vitamin_k', 'calcium', 'copper', 'irom', 'magnesium', 'manganese',
        'phosphorous', 'potassium', 'selenium', 'zink', 'protein', 'alanine',
        'arginine', 'aspartic_acid', 'cystine', 'glutamic_acid', 'glycine',
        'histidine', 'hydroxyproline', 'isoleucine', 'leucine', 'lysine',
        'methionine', 'phenylalanine', 'proline', 'serine', 'threonine',
        'tryptophan', 'tyrosine', 'valine', 'carbohydrate', 'fiber', 'sugars',
        'fructose', 'galactose', 'glucose', 'lactose

## Handling Duplicated or Unnecessary Indexes

In [18]:
# Option 1: discard the redundant index column that was loaded as 'Unnamed: 0'
nutrition.drop(columns='Unnamed: 0').head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [19]:
# Or you could set the additional column as the actual index
# (the index then carries the column's name, 'Unnamed: 0', as seen in the header below)

nutrition.set_index('Unnamed: 0').head()

Unnamed: 0_level_0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [20]:
# Or you could reload the dataframe setting the column as the index:
# index_col=[0] makes read_csv use the file's first column as the index.
# Note: this rebinds `nutrition`, so all following cells use the reloaded frame.

nutrition = pd.read_csv('../data/nutrition.csv', index_col=[0])
nutrition.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


## Changing the Index

In [21]:
# Inspect the current index: after the reload it is an integer index taken from the file
nutrition.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            8779, 8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788],
           dtype='int64', length=8789)

In [22]:
# You can set any column from the Dataframe as its index.
# The result is a new Dataframe; `nutrition` itself is not modified here
# (later cells still find its 'name' column).
nutrition.set_index('name').head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [23]:
# By default, the column is dropped once it becomes the index.
# drop=False keeps 'name' as a regular column in addition to using it as the index.
nutrition.set_index('name', drop=False).head()

Unnamed: 0_level_0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans","Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw","Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked","Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange","Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [24]:
# append=True keeps the existing index and adds 'name' as a further index level,
# producing a two-level (multi-dimensional) index
nutrition.set_index('name', drop=False, append=True).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
Unnamed: 0_level_1,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,Cornstarch,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans","Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw","Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked","Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange","Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [25]:
# You can check if the resulting index is unique with verify_integrity=True.
# Here the (original index, name) pairs are unique, so the call succeeds.
nutrition.set_index('name', drop=False, append=True, verify_integrity=True).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
Unnamed: 0_level_1,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,Cornstarch,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans","Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw","Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked","Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange","Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [26]:
# Verifying integrity raises a ValueError if the new index is not unique.
# The error is the point of this demonstration, so it is caught and displayed
# rather than left to abort a "Restart Kernel -> Run All" execution.
try:
    nutrition.set_index('serving_size', verify_integrity=True).head()
except ValueError as err:
    # Prints e.g. "ValueError: Index has duplicate keys: ..."
    print(f'{type(err).__name__}: {err}')

ValueError: Index has duplicate keys: Index(['100 g'], dtype='object', name='serving_size')

## Extract info from Dataset by Label (rows and columns)

In [27]:
# Make the food name the index for all label-based lookups below.
# Rebinding (instead of inplace=True) behind a guard keeps this cell idempotent:
# re-running it no longer fails with a KeyError once 'name' has become the index.
if 'name' in nutrition.columns:
    nutrition = nutrition.set_index('name')
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [29]:
# Label-based row lookup: .loc with a single index label returns that row
nutrition.loc['Eggplant, raw']

serving_size       100 g
calories              25
total_fat           0.2g
saturated_fat        NaN
cholesterol            0
                  ...   
alcohol            0.0 g
ash               0.66 g
caffeine         0.00 mg
theobromine      0.00 mg
water            92.30 g
Name: Eggplant, raw, Length: 75, dtype: object

In [31]:
# Each row is a Series: a single row selected with .loc comes back as a pandas Series
type(nutrition.loc['Eggplant, raw'])

pandas.core.series.Series

In [34]:
# Specific elements can be accessed in two ways (both print the same value):
# 1) row label and column label in a single .loc call
print(nutrition.loc['Eggplant, raw', 'calories'])
# 2) chained: .loc first returns the whole row as a Series, which is then indexed by 'calories'
print(nutrition.loc['Eggplant, raw']['calories'])

25
25


In [37]:
# .loc also accepts Slices as parameters.
# Label slices include BOTH endpoints: the result contains the
# 'Sherbet, orange' row and the 'cholesterol' column.
nutrition.loc['Eggplant, raw':'Sherbet, orange', 'calories':'cholesterol']

Unnamed: 0_level_0,calories,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Eggplant, raw",25,0.2g,,0
"Teff, uncooked",367,2.4g,0.4g,0
"Sherbet, orange",144,2g,1.2g,1mg


In [39]:
# Or you can use lists for non-consecutive elements.
# Rows and columns come back in the order they are listed
# (e.g. 'protein' before 'calories'), not in the Dataframe's own order.
nutrition.loc[
    ['Cornstarch', 'Eggplant, raw', 'Sherbet, orange'],
    ['protein', 'calories', 'vitamin_b6']
]

Unnamed: 0_level_0,protein,calories,vitamin_b6
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cornstarch,0.26 g,381,0.000 mg
"Eggplant, raw",0.98 g,25,0.084 mg
"Sherbet, orange",1.10 g,144,0.023 mg


## Extract info from Dataset by position (rows and columns)

In [41]:
# Position-based row lookup: select a row by its integer position instead of its label
nutrition.iloc[25]

serving_size       100 g
calories              25
total_fat             0g
saturated_fat        NaN
cholesterol            0
                  ...   
alcohol                0
ash               3.05 g
caffeine               0
theobromine            0
water            90.70 g
Name: PACE, Green Taco Sauce, Length: 75, dtype: object

In [56]:
# Single value by position: row position 12, column position 11
nutrition.iloc[12, 11]

'0.003 mg'

In [57]:
# Positional slices exclude the end position: rows 12-14 and columns 11-17
nutrition.iloc[12:15, 11:18]

Unnamed: 0_level_0,riboflavin,thiamin,vitamin_a,vitamin_a_rae,carotene_alpha,carotene_beta,cryptoxanthin_beta
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Mango nectar, canned",0.003 mg,0.003 mg,692.00 IU,35.00 mcg,0.00 mcg,402.00 mcg,26.00 mcg
"Crackers, rusk toast",0.399 mg,0.404 mg,41.00 IU,12.00 mcg,0,0,0
"Chicken, boiled, feet",0.200 mg,0.060 mg,100.00 IU,30.00 mcg,0.00 mcg,0.00 mcg,0.00 mcg


In [52]:
# Lists of integer positions select non-consecutive rows and columns,
# returned in the order the positions are listed
nutrition.iloc[
    [22, 65, 12], 
    [6, 23, 8, 11]
]

Unnamed: 0_level_0,choline,vitamin_d,folic_acid,riboflavin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Spices, ground, savory",0,0.00 IU,0,0
"Nuts, glazed, walnuts",0,0,0,0
"Mango nectar, canned",1.5 mg,0.00 IU,0.00 mcg,0.003 mg


## Single value extraction

In [62]:
# .at: label-based access to a single value (row label, column label)
nutrition.at['Mango nectar, canned', 'choline']

'1.5 mg'

In [66]:
# .iat: position-based access to a single value (row position, column position);
# row 12 / column 6 is the same cell as the .at lookup above
nutrition.iat[12, 6]

'1.5 mg'

In [71]:
# For single-value access, .at and .iat are faster than .loc and .iloc.
# Time the .loc lookup here and compare it with the .at timing in the next cell.
%timeit nutrition.loc['Mango nectar, canned', 'choline']

6.56 µs ± 76.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [72]:
# Same single-value lookup through .at, for comparison with the .loc timing above
%timeit nutrition.at['Mango nectar, canned', 'choline']

2.99 µs ± 67.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Get index or column number - get_loc

In [75]:
# The index now holds the food names
nutrition.index

Index(['Cornstarch', 'Nuts, pecans', 'Eggplant, raw', 'Teff, uncooked',
       'Sherbet, orange', 'Cauliflower, raw', 'Taro leaves, raw',
       'Lamb, raw, ground', 'Cheese, camembert', 'Vegetarian fillets',
       ...
       'Beef, braised, cooked, all grades, trimmed to 1/8" fat, separable lean and fat, flat half, brisket',
       'Beef, raw, select, trimmed to 1/8" fat, separable lean only, lip-on, boneless, rib eye steak/roast',
       'Beef, raw, choice, trimmed to 1/8" fat, separable lean only, lip-on, boneless, rib eye steak/roast',
       'Oil, uses similar to 95 degree hard butter, confection fat, palm kernel (hydrogenated), industrial',
       'Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round steak, round',
       'Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round',
       'Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand',
       'Lamb, raw

In [77]:
# get_loc translates an index label into its integer (0-based) position
nutrition.index.get_loc('Lamb, raw, ground')

7

In [78]:
# The column labels are stored in an Index object as well
nutrition.columns

Index(['serving_size', 'calories', 'total_fat', 'saturated_fat', 'cholesterol',
       'sodium', 'choline', 'folate', 'folic_acid', 'niacin',
       'pantothenic_acid', 'riboflavin', 'thiamin', 'vitamin_a',
       'vitamin_a_rae', 'carotene_alpha', 'carotene_beta',
       'cryptoxanthin_beta', 'lutein_zeaxanthin', 'lucopene', 'vitamin_b12',
       'vitamin_b6', 'vitamin_c', 'vitamin_d', 'vitamin_e', 'tocopherol_alpha',
       'vitamin_k', 'calcium', 'copper', 'irom', 'magnesium', 'manganese',
       'phosphorous', 'potassium', 'selenium', 'zink', 'protein', 'alanine',
       'arginine', 'aspartic_acid', 'cystine', 'glutamic_acid', 'glycine',
       'histidine', 'hydroxyproline', 'isoleucine', 'leucine', 'lysine',
       'methionine', 'phenylalanine', 'proline', 'serine', 'threonine',
       'tryptophan', 'tyrosine', 'valine', 'carbohydrate', 'fiber', 'sugars',
       'fructose', 'galactose', 'glucose', 'lactose', 'maltose', 'sucrose',
       'fat', 'saturated_fatty_acids', 'monounsatur

In [79]:
# get_loc works on columns too: integer (0-based) position of the 'carbohydrate' column
nutrition.columns.get_loc('carbohydrate')

56

# Challenge

Select 10 random food items and assign to new dataframe

In [81]:
# Draw 10 pseudo-random food items into a new dataframe.
# random_state pins the draw so that "Restart Kernel -> Run All" selects the
# same rows every time; the challenge cells below depend on this selection.
nutr_mini = nutrition.sample(n=10, random_state=42)
nutr_mini

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Pork, roasted, heated, separable lean and fat, bone-in, rump, ham and water product, cured",100 g,186,11g,3.8g,67mg,1181.00 mg,80.4 mg,2.00 mcg,0.00 mcg,3.901 mg,...,11.48 g,3.783 g,5.265 g,1.479 g,67.00 mg,0.0 g,3.30 g,0.00 mg,0.00 mg,64.60 g
"Poultry, raw, from backs and necks with skin, mechanically deboned",100 g,272,25g,7.5g,130mg,40.00 mg,0,6.00 mcg,0.00 mcg,4.630 mg,...,24.73 g,7.450 g,10.440 g,4.960 g,130.00 mg,0,0.96 g,0,0,62.66 g
"Nuts, with salt added, dry roasted, pistachio nuts",100 g,569,46g,5.6g,0,428.00 mg,71.4 mg,51.00 mcg,0.00 mcg,1.373 mg,...,45.82 g,5.645 g,24.534 g,13.346 g,0.00 mg,0.0 g,3.79 g,0.00 mg,0.00 mg,1.79 g
"Beef, raw, all grades, trimmed to 0"" fat, separable lean and fat, boneless, eye of round roast, round",100 g,124,3.4g,1.3g,60mg,53.00 mg,64.0 mg,4.00 mcg,0.00 mcg,6.687 mg,...,3.44 g,1.276 g,1.482 g,0.250 g,60.00 mg,0.0 g,1.09 g,0.00 mg,0.00 mg,73.09 g
"Oil, 80% diglycerides, ENOVA, cooking and salad",100 g,884,100g,4.6g,0,0.00 mg,0,0,0,0,...,100.00 g,4.630 g,37.016 g,53.370 g,0,0,0.00 g,0,0,0.00 g
"Game meat, pan-broiled, cooked, ground, deer",100 g,187,8.2g,4g,98mg,78.00 mg,101.9 mg,8.00 mcg,0.00 mcg,9.257 mg,...,8.22 g,3.993 g,1.939 g,0.444 g,98.00 mg,0.0 g,1.07 g,0.00 mg,0.00 mg,64.23 g
"Syrup, chocolate, NESTLE",100 g,269,0g,,0,150.00 mg,0.9 mg,3.00 mcg,0.00 mcg,0.173 mg,...,0.00 g,0.000 g,0.000 g,0.000 g,0.00 mg,0.0 g,0.87 g,18.00 mg,163.00 mg,31.92 g
"Beef, braised, cooked, separable lean and fat, oyster blade, imported, New Zealand",100 g,197,8.7g,3g,87mg,25.00 mg,0,0,0,1.645 mg,...,8.68 g,3.013 g,3.094 g,0.352 g,87.00 mg,0.0 g,0.63 g,0.00 mg,0.00 mg,62.23 g
"Cereals ready-to-eat, FRUITY PEBBLES, POST",100 g,402,4g,3.6g,0,531.00 mg,3.7 mg,370.00 mcg,366.00 mcg,18.520 mg,...,4.02 g,3.600 g,0.160 g,0.120 g,0.00 mg,0.0 g,2.50 g,0.00 mg,0.00 mg,3.00 g
"Beverages, diet, lemon, ready-to-drink, black, tea",100 g,1,0g,,0,17.00 mg,0.0 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.00 g,0.000 g,0.000 g,0.000 g,0.00 mg,0.0 g,0.07 g,2.00 mg,0.00 mg,99.72 g


Extract 'total_fat' and 'cholesterol' columns from this new dataframe

In [107]:
# A list of column labels inside [] selects those columns (for all rows) as a new Dataframe
nutr_mini[['total_fat', 'cholesterol']]

Unnamed: 0_level_0,total_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Pork, roasted, heated, separable lean and fat, bone-in, rump, ham and water product, cured",11g,67mg
"Poultry, raw, from backs and necks with skin, mechanically deboned",25g,130mg
"Nuts, with salt added, dry roasted, pistachio nuts",46g,0
"Beef, raw, all grades, trimmed to 0"" fat, separable lean and fat, boneless, eye of round roast, round",3.4g,60mg
"Oil, 80% diglycerides, ENOVA, cooking and salad",100g,0
"Game meat, pan-broiled, cooked, ground, deer",8.2g,98mg
"Syrup, chocolate, NESTLE",0g,0
"Beef, braised, cooked, separable lean and fat, oyster blade, imported, New Zealand",8.7g,87mg
"Cereals ready-to-eat, FRUITY PEBBLES, POST",4g,0
"Beverages, diet, lemon, ready-to-drink, black, tea",0g,0


Extract all columns from 'vitamin_b12' until the end for the first three rows

In [108]:
# Label-based solution: rows from the start up to AND INCLUDING the third label
# (nutr_mini.index[2]); columns from 'vitamin_b12' to the last column.
# NOTE(review): assumes the third label is not duplicated in nutr_mini's index — confirm.
nutr_mini.loc[:nutr_mini.index[2], 'vitamin_b12':]

Unnamed: 0_level_0,vitamin_b12,vitamin_b6,vitamin_c,vitamin_d,vitamin_e,tocopherol_alpha,vitamin_k,calcium,copper,irom,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Pork, roasted, heated, separable lean and fat, bone-in, rump, ham and water product, cured",0.47 mcg,0.316 mg,0.0 mg,0,0.23 mg,0.23 mg,0.0 mcg,10.00 mg,0.124 mg,0.82 mg,...,11.48 g,3.783 g,5.265 g,1.479 g,67.00 mg,0.0 g,3.30 g,0.00 mg,0.00 mg,64.60 g
"Poultry, raw, from backs and necks with skin, mechanically deboned",0.25 mcg,0.190 mg,1.5 mg,0,0,0,0,138.00 mg,0.065 mg,1.57 mg,...,24.73 g,7.450 g,10.440 g,4.960 g,130.00 mg,0,0.96 g,0,0,62.66 g
"Nuts, with salt added, dry roasted, pistachio nuts",0.00 mcg,1.122 mg,3.0 mg,0.00 IU,2.17 mg,2.17 mg,13.2 mcg,107.00 mg,1.293 mg,4.03 mg,...,45.82 g,5.645 g,24.534 g,13.346 g,0.00 mg,0.0 g,3.79 g,0.00 mg,0.00 mg,1.79 g


In [111]:
# Position-based solution: rows 0-2 (the end position 3 is excluded) and columns
# from the integer position of 'vitamin_b12' (looked up with get_loc) to the end
nutr_mini.iloc[:3, nutr_mini.columns.get_loc('vitamin_b12'):]

Unnamed: 0_level_0,vitamin_b12,vitamin_b6,vitamin_c,vitamin_d,vitamin_e,tocopherol_alpha,vitamin_k,calcium,copper,irom,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Pork, roasted, heated, separable lean and fat, bone-in, rump, ham and water product, cured",0.47 mcg,0.316 mg,0.0 mg,0,0.23 mg,0.23 mg,0.0 mcg,10.00 mg,0.124 mg,0.82 mg,...,11.48 g,3.783 g,5.265 g,1.479 g,67.00 mg,0.0 g,3.30 g,0.00 mg,0.00 mg,64.60 g
"Poultry, raw, from backs and necks with skin, mechanically deboned",0.25 mcg,0.190 mg,1.5 mg,0,0,0,0,138.00 mg,0.065 mg,1.57 mg,...,24.73 g,7.450 g,10.440 g,4.960 g,130.00 mg,0,0.96 g,0,0,62.66 g
"Nuts, with salt added, dry roasted, pistachio nuts",0.00 mcg,1.122 mg,3.0 mg,0.00 IU,2.17 mg,2.17 mg,13.2 mcg,107.00 mg,1.293 mg,4.03 mg,...,45.82 g,5.645 g,24.534 g,13.346 g,0.00 mg,0.0 g,3.79 g,0.00 mg,0.00 mg,1.79 g
