In [1]:
import pandas as pd
import numpy as np

In [2]:
# some python lists
names = ['Olga', 'Andrew', 'Brian', 'Telulah', 'Nicole', 'Tilda']
ages = [29, 21, 45, 23, 39, 46]
married = [False, True, True, True, False, True]

In [3]:
# pandas series
ser = pd.Series(names, name='name')

In [6]:
# pandas dataframe
df = pd.DataFrame({'name': names, 'age': ages, 'married': married})
df

Unnamed: 0,name,age,married
0,Olga,29,False
1,Andrew,21,True
2,Brian,45,True
3,Telulah,23,True
4,Nicole,39,False
5,Tilda,46,True


In [8]:
#df is a 2 dimensional object (rows, columns)
df.shape

(6, 3)

In [9]:
#each column in a df is a series with its own dtype
# so you need to call .dtypes NOT dtype
df.dtypes

name       object
age         int64
married      bool
dtype: object

In [12]:
# since we created 3 python lists above, we can create a df using a dict format

pd.DataFrame({'name':names, 'age': ages, 'married':married})

Unnamed: 0,name,age,married
0,Olga,29,False
1,Andrew,21,True
2,Brian,45,True
3,Telulah,23,True
4,Nicole,39,False
5,Tilda,46,True


In [13]:
# the length of all lists MUST be equal!!!

## Info
### can only use for a df

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     6 non-null      object
 1   age      6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 230.0+ bytes


In [16]:
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Columns: 3 entries, name to married
dtypes: bool(1), int64(1), object(1)
memory usage: 230.0+ bytes


In [17]:
df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     6 non-null      object
 1   age      6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 557.0 bytes


## Nutrition Data (from kaggle)
* we will us this for exercises following

In [18]:
dataurl = 'https://andybek.com/pandas-nutrition'

In [19]:
nutrition = pd.read_csv(dataurl)
nutrition.head()

Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [22]:
nutrition.info(verbose = False, memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8789 entries, 0 to 8788
Columns: 77 entries, Unnamed: 0 to water
dtypes: int64(3), object(74)
memory usage: 39.2 MB


---
### clean up and remove dups
* note the unnamed column

In [23]:
#drop the unnamed columns
nutrition.drop(columns=['Unnamed: 0'], inplace=True)
nutrition.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [25]:
# set the index to the name columns
nutrition.set_index(['name'], inplace=True)
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [27]:
## Taking random sample from a df

nutrition.sample(random_state=12)

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Thuringer, pork, beef, summer sausage, cervelat",100 g,362,30g,12g,74mg,1300.00 mg,78.9 mg,2.00 mcg,0.00 mcg,4.310 mg,...,30.43 g,11.510 g,12.970 g,1.200 g,74.00 mg,0.0 g,3.63 g,0.00 mg,0.00 mg,45.18 g


In [28]:
# if we set the random state parm across different users, we should get the same results
# if we want random each time:
nutrition.sample()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Bread, oat bran, reduced-calorie",100 g,201,3.2g,0.4g,0,459.00 mg,14.6 mg,81.00 mcg,47.00 mcg,3.763 mg,...,3.20 g,0.445 g,0.684 g,1.670 g,0.00 mg,0.0 g,1.50 g,0.00 mg,0.00 mg,46.00 g


In [29]:
# getting 3 random samples
nutrition.sample(n=3)

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, raw, Aust. marble score 4/5, separable lean and fat, boneless, tenderloin steak/roast, loin, Wagyu, imported, Australian",100 g,206,14g,6.1g,60mg,63.00 mg,0,0,0,0,...,14.09 g,6.105 g,6.167 g,0.531 g,60.00 mg,0.0 g,1.02 g,0.00 mg,0.00 mg,65.20 g
"Pork, roasted, heated, separable lean and fat, bone-in, shank, ham with natural juices, cured",100 g,191,11g,3.5g,74mg,801.00 mg,91.0 mg,3.00 mcg,0.00 mcg,7.190 mg,...,10.93 g,3.545 g,5.120 g,1.351 g,74.00 mg,0.0 g,2.98 g,0.00 mg,0.00 mg,62.94 g
"Cereals ready-to-eat, and Raisins, Honey, Wheat, QUAKER 100% Natural Granola with Oats, QUAKER",100 g,412,10g,1.1g,2mg,54.00 mg,31.8 mg,33.00 mcg,0.00 mcg,2.120 mg,...,10.38 g,1.120 g,5.870 g,2.390 g,2.00 mg,0.0 g,1.90 g,0.00 mg,0.00 mg,3.39 g


In [30]:
# if we wantedd to sample a % of the data use the frac parm
nutrition.sample(frac = 0.01) 
#this would be 1% of the data

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Soup, ready-to-serve, canned, chicken and vegetable",100 g,33,0.7g,0.1g,3mg,229.00 mg,7.7 mg,13.00 mcg,0.00 mcg,0.906 mg,...,0.73 g,0.121 g,0.158 g,0.134 g,3.00 mg,0.0 g,0.93 g,0.00 mg,0.00 mg,91.68 g
"Fast foods, and sausage, cheese, with egg, english muffin",100 g,286,18g,6.9g,163mg,548.00 mg,103.3 mg,57.00 mcg,41.00 mcg,3.192 mg,...,18.10 g,6.928 g,6.765 g,2.772 g,163.00 mg,0.0 g,2.26 g,0.00 mg,0.00 mg,48.82 g
"Margarine-like, soybean oil and butter, margarine-butter blend",100 g,727,80g,14g,12mg,719.00 mg,6.5 mg,2.00 mcg,0.00 mcg,0.022 mg,...,80.32 g,14.198 g,30.292 g,24.170 g,12.00 mg,0.0 g,1.53 g,0.00 mg,0.00 mg,17.07 g
"Mayonnaise, with olive oil, reduced fat",100 g,361,40g,5.4g,33mg,800.00 mg,9.0 mg,0.00 mcg,0.00 mcg,0.010 mg,...,40.00 g,5.370 g,29.542 g,4.006 g,33.00 mg,0.0 g,2.05 g,0.00 mg,0.00 mg,57.58 g
"Chicken, skin, BBQ, rotisserie, broiler",100 g,378,35g,9.3g,120mg,335.00 mg,39.3 mg,9.00 mcg,0.00 mcg,7.082 mg,...,35.15 g,9.333 g,14.464 g,4.018 g,120.00 mg,0.0 g,1.52 g,0.00 mg,0.00 mg,47.81 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wheat, hard red spring",100 g,329,1.9g,0.3g,0,2.00 mg,31.2 mg,43.00 mcg,0.00 mcg,5.710 mg,...,1.92 g,0.314 g,0.303 g,0.765 g,0.00 mg,0.0 g,1.89 g,0.00 mg,0.00 mg,12.76 g
"Waffle, microwaved, ready-to-heat, frozen, buttermilk",100 g,289,9.4g,2.1g,16mg,663.00 mg,0,64.00 mcg,50.00 mcg,6.730 mg,...,9.40 g,2.057 g,4.864 g,1.615 g,16.00 mg,0,3.04 g,0,0,36.48 g
"CAMPBELL'S CHUNKY Soups, Grilled Sirloin Steak with Hearty Vegetables Soup",100 g,51,0.8g,0.4g,4mg,363.00 mg,0,0,0,0,...,0.82 g,0.408 g,0,0,4.00 mg,0,1.16 g,0,0,87.00 g
"Snacks, taro chips",100 g,498,25g,6.4g,0,342.00 mg,44.3 mg,20.00 mcg,0.00 mcg,0.515 mg,...,24.90 g,6.430 g,4.430 g,12.880 g,0.00 mg,0.0 g,2.70 g,0.00 mg,0.00 mg,2.00 g


## Axis

### axis=0 is rows
### axis=1 is columns

In [31]:
nutrition.axes

[Index(['Cornstarch', 'Nuts, pecans', 'Eggplant, raw', 'Teff, uncooked',
        'Sherbet, orange', 'Cauliflower, raw', 'Taro leaves, raw',
        'Lamb, raw, ground', 'Cheese, camembert', 'Vegetarian fillets',
        ...
        'Beef, braised, cooked, all grades, trimmed to 1/8" fat, separable lean and fat, flat half, brisket',
        'Beef, raw, select, trimmed to 1/8" fat, separable lean only, lip-on, boneless, rib eye steak/roast',
        'Beef, raw, choice, trimmed to 1/8" fat, separable lean only, lip-on, boneless, rib eye steak/roast',
        'Oil, uses similar to 95 degree hard butter, confection fat, palm kernel (hydrogenated), industrial',
        'Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round steak, round',
        'Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round',
        'Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand',
      

In [33]:
nutrition.index[3]

'Teff, uncooked'

In [35]:
nutrition.columns[50]

'proline'

## Setting Index

In [36]:
# set the index to the name columns
# nutrition.set_index(['name'], inplace=True)
# nutrition.head()

# this was dome already above

In [37]:
# To check if there are any duplicates in the indes you can use the parm verify_integrity=True in set_index()
# the default is set to false so that you ARE allowed dups in the index
# if you set to false and it errors ... you know that you have dups in the index

## Extracting Data
## .loc

* can use BOTH rows and columns

In [38]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [39]:
nutrition.loc['Eggplant, raw']

serving_size       100 g
calories              25
total_fat           0.2g
saturated_fat        NaN
cholesterol            0
                  ...   
alcohol            0.0 g
ash               0.66 g
caffeine         0.00 mg
theobromine      0.00 mg
water            92.30 g
Name: Eggplant, raw, Length: 75, dtype: object

In [45]:
# note this returns a series, what if we want the whole row
nutrition.loc[['Eggplant, raw'], :]

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g


In [44]:
#just the calories
nutrition.loc['Eggplant, raw', 'calories']

25

In [46]:
nutrition.loc[['Eggplant, raw'], ['calories']]

Unnamed: 0_level_0,calories
name,Unnamed: 1_level_1
"Eggplant, raw",25


In [48]:
# Slicing rows and columns same time
nutrition.loc['Eggplant, raw':'Sherbet, orange', 'calories':'cholesterol']

Unnamed: 0_level_0,calories,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Eggplant, raw",25,0.2g,,0
"Teff, uncooked",367,2.4g,0.4g,0
"Sherbet, orange",144,2g,1.2g,1mg


In [51]:
# getting multiple values that ARE NOT consecutive --> double bracets --> passing a list in

# df.loc[['row or list of rows'], ['column1', 'column5', 'column8']]

nutrition.loc[
                ['Raspberries, raw', 'Blackberries, raw'],
                ['protein', 'vitamin_b6', 'water']
    
]

Unnamed: 0_level_0,protein,vitamin_b6,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Raspberries, raw",1.20 g,0.055 mg,85.75 g
"Blackberries, raw",1.39 g,0.030 mg,88.15 g


## Extracting Data
## .iloc

* 0 indexed
* can use BOTH rows and columns

In [52]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [53]:
# getting the 4th item
nutrition.iloc[3]

serving_size      100 g
calories            367
total_fat          2.4g
saturated_fat      0.4g
cholesterol           0
                  ...  
alcohol               0
ash              2.37 g
caffeine              0
theobromine           0
water            8.82 g
Name: Teff, uncooked, Length: 75, dtype: object

In [54]:
#if we want the 4th, 6th and 9th position
nutrition.iloc[[4, 6, 9], :]

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
"Taro leaves, raw",100 g,42,0.7g,0.2g,0,3.00 mg,12.8 mg,126.00 mcg,0.00 mcg,1.513 mg,...,0.74 g,0.151 g,0.060 g,0.307 g,0.00 mg,0.0 g,1.92 g,0.00 mg,0.00 mg,85.66 g
Vegetarian fillets,100 g,290,18g,2.8g,0,490.00 mg,82.0 mg,102.00 mcg,0.00 mcg,12.000 mg,...,18.00 g,2.849 g,4.376 g,9.332 g,0.00 mg,0.0 g,5.00 g,0.00 mg,0.00 mg,45.00 g


In [55]:
#if we wanted a certian slice of columns
nutrition.iloc[[4, 6, 9], 2:5]

Unnamed: 0_level_0,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Sherbet, orange",2g,1.2g,1mg
"Taro leaves, raw",0.7g,0.2g,0
Vegetarian fillets,18g,2.8g,0


In [57]:
# or certian individual columns
nutrition.iloc[[4, 6, 9], [2,5,8]]

Unnamed: 0_level_0,total_fat,sodium,folic_acid
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Sherbet, orange",2g,46.00 mg,0.00 mcg
"Taro leaves, raw",0.7g,3.00 mg,0.00 mcg
Vegetarian fillets,18g,490.00 mg,0.00 mcg


In [62]:
# if we wanted to see every other row

nutrition.iloc[
    [True if i%2 ==0 else False for i in range(len(nutrition))]
    
]

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
"Taro leaves, raw",100 g,42,0.7g,0.2g,0,3.00 mg,12.8 mg,126.00 mcg,0.00 mcg,1.513 mg,...,0.74 g,0.151 g,0.060 g,0.307 g,0.00 mg,0.0 g,1.92 g,0.00 mg,0.00 mg,85.66 g
"Cheese, camembert",100 g,300,24g,15g,72mg,842.00 mg,15.4 mg,62.00 mcg,0.00 mcg,0.630 mg,...,24.26 g,15.259 g,7.023 g,0.724 g,72.00 mg,0.0 g,3.68 g,0.00 mg,0.00 mg,51.80 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Beef, raw, select, trimmed to 1/8"" fat, separable lean only, lip-on, boneless, rib eye steak/roast",100 g,148,6.4g,2.3g,70mg,55.00 mg,49.4 mg,4.00 mcg,0.00 mcg,5.580 mg,...,6.41 g,2.313 g,2.830 g,0.396 g,70.00 mg,0.0 g,1.03 g,0.00 mg,0.00 mg,70.89 g
"Oil, uses similar to 95 degree hard butter, confection fat, palm kernel (hydrogenated), industrial",100 g,884,100g,94g,0,6.00 mg,0.2 mg,0.00 mcg,0.00 mcg,0.000 mg,...,100.00 g,93.701 g,0.257 g,0.000 g,0.00 mg,0.0 g,0.01 g,0.00 mg,0.00 mg,0.05 g
"Beef, raw, all grades, trimmed to 0"" fat, separable lean and fat, boneless, top round roast, round",100 g,125,3.5g,1.4g,62mg,54.00 mg,64.5 mg,4.00 mcg,0.00 mcg,6.422 mg,...,3.50 g,1.353 g,1.554 g,0.244 g,62.00 mg,0.0 g,1.11 g,0.00 mg,0.00 mg,72.51 g
"Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand",100 g,277,23g,12g,78mg,39.00 mg,0,1.00 mcg,0.00 mcg,6.550 mg,...,22.74 g,11.570 g,8.720 g,0.980 g,78.00 mg,0,0.92 g,0,0,59.80 g


## Single value access

* .at - by value (like loc)
* .iat - by position (like iloc)

In [63]:
nutrition.at['Nuts, pecans', 'calories']

691

In [64]:
nutrition.iat[2, 10]

'0.281 mg'

In [65]:
# at & iat are SAME as using loc & iloc BUT since its a single use function, its much faster

#we can time and compare

In [67]:
%timeit nutrition.loc['Nuts, pecans', 'calories']

7.22 µs ± 140 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [68]:
%timeit nutrition.at['Nuts, pecans', 'calories']

3.38 µs ± 25 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


## get_loc

-helps us get an integer location from the label

In [70]:
nutrition.columns.get_loc('vitamin_k')

26

In [71]:
# now we know that the column entitled vitamin_k is at position 26

---
# Practice Challenge
---

1. Randomly select 10 food items and assign the resulting dataframe to a new variable called *nutr_mini*.

In [72]:
nutr_mini = nutrition.sample(n=10)
nutr_mini

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, grilled, cooked, select, trimmed to 1/8"" fat, separable lean and fat, porterhouse steak, short loin",100 g,263,17g,6.8g,79mg,63.00 mg,59.5 mg,6.00 mcg,0.00 mcg,5.304 mg,...,16.75 g,6.843 g,7.469 g,0.737 g,79.00 mg,0.0 g,1.03 g,0.00 mg,0.00 mg,56.87 g
"WORTHINGTON FriPats, unprepared, frozen",100 g,209,9.1g,1.4g,2mg,517.00 mg,0,0,0,4.700 mg,...,9.10 g,1.400 g,2.200 g,5.200 g,2.00 mg,0,2.40 g,0,0,56.80 g
"Fish, raw, Atlantic, wolffish",100 g,96,2.4g,0.4g,46mg,85.00 mg,0,5.00 mcg,0.00 mcg,2.133 mg,...,2.39 g,0.365 g,0.837 g,0.845 g,46.00 mg,0,1.16 g,0,0,79.90 g
"Fish, liver (Alaska Native), chinook, king, salmon",100 g,156,8g,,0,0,0,0,0,5.000 mg,...,8.00 g,0,0,0,0,0,1.30 g,0,0,69.80 g
"Cereals ready-to-eat, KASHI 7 Whole Grain Puffs",100 g,336,2.3g,0.5g,0,9.00 mg,31.8 mg,27.00 mcg,0.00 mcg,3.200 mg,...,2.30 g,0.463 g,0.580 g,1.053 g,0.00 mg,0.0 g,2.10 g,0.00 mg,0.00 mg,4.00 g
"McDONALD'S, NEWMAN'S OWN Creamy Caesar Dressing",100 g,319,32g,5.9g,35mg,851.00 mg,0,4.00 mcg,0,0.020 mg,...,31.50 g,5.860 g,7.750 g,16.300 g,35.00 mg,0,3.19 g,0,0,54.90 g
"Pork, braised, cooked, separable lean and fat, boneless, Leg sirloin tip roast",100 g,156,2.6g,0.8g,84mg,43.00 mg,106.6 mg,1.00 mcg,0.00 mcg,7.685 mg,...,2.56 g,0.791 g,1.022 g,0.474 g,84.00 mg,0.0 g,1.14 g,0.00 mg,0.00 mg,66.99 g
"Nuts, with salt added, plain, almond butter",100 g,614,56g,6.6g,0,227.00 mg,52.1 mg,53.00 mcg,0.00 mcg,3.155 mg,...,55.50 g,6.550 g,32.445 g,13.613 g,0.00 mg,0.0 g,3.09 g,0.00 mg,0.00 mg,1.64 g
"Frozen novelties, with low calorie sweetener, pop, ice type",100 g,24,0g,,0,10.00 mg,0.0 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.00 g,0.000 g,0.000 g,0.000 g,0.00 mg,0.0 g,0.06 g,0.00 mg,0.00 mg,94.03 g
"Cookies, with added fiber, reduced fat, commercially prepared, brownies",100 g,345,9.7g,2.8g,0,290.00 mg,13.9 mg,42.00 mcg,34.00 mcg,1.441 mg,...,9.68 g,2.766 g,3.432 g,2.581 g,0.00 mg,0.0 g,1.14 g,10.00 mg,97.00 mg,24.83 g


---
2. From *nutr_mini*, extract the **total_fat** and **cholesterol** columns for all rows. 

In [74]:
nutr_mini.loc[:, ['total_fat', 'cholesterol']]

Unnamed: 0_level_0,total_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Beef, grilled, cooked, select, trimmed to 1/8"" fat, separable lean and fat, porterhouse steak, short loin",17g,79mg
"WORTHINGTON FriPats, unprepared, frozen",9.1g,2mg
"Fish, raw, Atlantic, wolffish",2.4g,46mg
"Fish, liver (Alaska Native), chinook, king, salmon",8g,0
"Cereals ready-to-eat, KASHI 7 Whole Grain Puffs",2.3g,0
"McDONALD'S, NEWMAN'S OWN Creamy Caesar Dressing",32g,35mg
"Pork, braised, cooked, separable lean and fat, boneless, Leg sirloin tip roast",2.6g,84mg
"Nuts, with salt added, plain, almond butter",56g,0
"Frozen novelties, with low calorie sweetener, pop, ice type",0g,0
"Cookies, with added fiber, reduced fat, commercially prepared, brownies",9.7g,0


---
3. Extract all the columns from **vitamin_b12** to the end, for the first, second, and third rows.

In [78]:
nutr_mini.columns.get_loc('vitamin_b12')

20

In [86]:
nutr_mini.iloc[0:3, 20:]

Unnamed: 0_level_0,vitamin_b12,vitamin_b6,vitamin_c,vitamin_d,vitamin_e,tocopherol_alpha,vitamin_k,calcium,copper,irom,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, grilled, cooked, select, trimmed to 1/8"" fat, separable lean and fat, porterhouse steak, short loin",2.07 mcg,0.572 mg,0.0 mg,5.00 IU,0.21 mg,0.21 mg,1.6 mcg,20.00 mg,0.068 mg,3.20 mg,...,16.75 g,6.843 g,7.469 g,0.737 g,79.00 mg,0.0 g,1.03 g,0.00 mg,0.00 mg,56.87 g
"WORTHINGTON FriPats, unprepared, frozen",1.90 mcg,0.900 mg,0.0 mg,0,0,0,0,97.00 mg,0,2.80 mg,...,9.10 g,1.400 g,2.200 g,5.200 g,2.00 mg,0,2.40 g,0,0,56.80 g
"Fish, raw, Atlantic, wolffish",2.03 mcg,0.400 mg,0.0 mg,0,0,0,0,6.00 mg,0.029 mg,0.09 mg,...,2.39 g,0.365 g,0.837 g,0.845 g,46.00 mg,0,1.16 g,0,0,79.90 g


---
4. Get the calories for the third food in *nutr_mini* using an attribute-based approach that is faster than .loc or .iloc. 

In [83]:
nutr_mini.at['Fish, raw, Atlantic, wolffish', 'calories']

96

## Numeric cleanup

### note alot of data has the units in cells

In [88]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


## these values are considered as strings
## we need to conver these to numeric values so can perform operations/analysis

## use the astype() method
* first need to remove the units from each value --> .replace()
* can change entire column or df

In [91]:
df_mini = nutrition.iloc[:6, :1]
df_mini

Unnamed: 0_level_0,serving_size
name,Unnamed: 1_level_1
Cornstarch,100 g
"Nuts, pecans",100 g
"Eggplant, raw",100 g
"Teff, uncooked",100 g
"Sherbet, orange",100 g
"Cauliflower, raw",100 g


In [92]:
df_mini.replace(to_replace='100 g', value=100)

Unnamed: 0_level_0,serving_size
name,Unnamed: 1_level_1
Cornstarch,100
"Nuts, pecans",100
"Eggplant, raw",100
"Teff, uncooked",100
"Sherbet, orange",100
"Cauliflower, raw",100


In [None]:
#this can also be written as:
df_mini.replace('100 g', 100)
#but this is very specific to 100g

## Regex

### Regular expression can use specifics on regex
* regex101.com

### \s will find white space

* so \sg will find all the with space and the letter g

In [93]:
# Putting that into the replace method

# .replace('pattern_to_be_replace', 'what_replace_with', regex=True)

df_mini.replace('\sg', '', regex=True)

Unnamed: 0_level_0,serving_size
name,Unnamed: 1_level_1
Cornstarch,100
"Nuts, pecans",100
"Eggplant, raw",100
"Teff, uncooked",100
"Sherbet, orange",100
"Cauliflower, raw",100


In [94]:
# this is great for single column that has same units and space
# we have many types of units with varying spaces

## Put the units in the header column labels

### 1. Remove the numerics so just left with the units labels

## the regex to grab the just the units is
* [a-zA-Z]
## to negate that and get everything BUT the units (all numbers), add a ^ before
*[^a-zA-Z]

In [97]:
nutrition.replace('[^a-zA-Z]', '', regex=True).head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,g,381,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Nuts, pecans",g,691,g,g,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Eggplant, raw",g,25,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Teff, uncooked",g,367,g,g,,mg,mg,,,mg,...,g,g,g,g,,,g,,,g
"Sherbet, orange",g,144,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [100]:
## Note that calories is still numeric because it didnt have any units initially
## It was an integer not a string intially
## cast everything as a sting first

units = nutrition.astype(str).replace('[^a-zA-Z]', '', regex=True)
units.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,g,,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Nuts, pecans",g,,g,g,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Eggplant, raw",g,,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Teff, uncooked",g,,g,g,,mg,mg,,,mg,...,g,g,g,g,,,g,,,g
"Sherbet, orange",g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


### 2. How to get the unit to column label header

### some NaN's, some empties, some mix of the two
* Cant pick a single row to use
* Need to find the mode of each column
    * Recall the mode is the value that appears the most

In [101]:
units.mode()

Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [102]:
headers = units.mode()
headers

Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


### dropna() quick overview
* df.dropna() --> will drop all rows with a single NaN value (default)
* the parms are df.dropna(how='any', axis=0) (default)
* changing any to all will only drop rows that are ALL NaN values
* Can add a subset to the parm as well:
    - df.dropna(how='any', axis=0, subset=['column_name'])
    - this will only use the column in the subset as the na filter

### 3. Merging the header units to column labels

### use rename method with map
* iterate over the newly created headers


In [105]:
# for k in headers:
#     print(k, headers[k])
    
#this will return a series for each column in the df --> each column is a series

#look at just col and units
for k in headers:
    print(k, headers[k].iloc[0])

serving_size g
calories 
total_fat g
saturated_fat g
cholesterol mg
sodium mg
choline mg
folate mcg
folic_acid mcg
niacin mg
pantothenic_acid mg
riboflavin mg
thiamin mg
vitamin_a IU
vitamin_a_rae mcg
carotene_alpha mcg
carotene_beta mcg
cryptoxanthin_beta mcg
lutein_zeaxanthin mcg
lucopene 
vitamin_b12 mcg
vitamin_b6 mg
vitamin_c mg
vitamin_d IU
vitamin_e mg
tocopherol_alpha mg
vitamin_k mcg
calcium mg
copper mg
irom mg
magnesium mg
manganese mg
phosphorous mg
potassium mg
selenium mcg
zink mg
protein g
alanine g
arginine g
aspartic_acid g
cystine g
glutamic_acid g
glycine g
histidine g
hydroxyproline 
isoleucine g
leucine g
lysine g
methionine g
phenylalanine g
proline g
serine g
threonine g
tryptophan g
tyrosine g
valine g
carbohydrate g
fiber g
sugars g
fructose 
galactose 
glucose 
lactose 
maltose 
sucrose 
fat g
saturated_fatty_acids g
monounsaturated_fatty_acids g
polyunsaturated_fatty_acids g
fatty_acids_total_trans mg
alcohol g
ash g
caffeine mg
theobromine mg
water g


In [111]:
# but we know that some names DONT have unit values, see sucrose just above
# we can replace these with an NaN

headers = headers.replace('', np.nan)
headers

Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [112]:
#then if we wanted to drop out the NaN columns

headers = headers.dropna(axis=1)
headers

Unnamed: 0,serving_size,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,pantothenic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,g,g,mg,mg,mg,mcg,mcg,mg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [117]:
#we can see that we dropped 9 columns from the dataFrame
# we drop them because when we map and combine, those columns in the final df will remain unchanged (original)

### create a mapper
* dict of key:value pairs
* {'old_col_namw' : 'new_name'}
* we can pass to rename()

In [115]:
#use a dict comprehension
mapper = {k: k + "_" + headers[k].iloc[0] for k in headers}

mapper

{'serving_size': 'serving_size_g',
 'total_fat': 'total_fat_g',
 'saturated_fat': 'saturated_fat_g',
 'cholesterol': 'cholesterol_mg',
 'sodium': 'sodium_mg',
 'choline': 'choline_mg',
 'folate': 'folate_mcg',
 'folic_acid': 'folic_acid_mcg',
 'niacin': 'niacin_mg',
 'pantothenic_acid': 'pantothenic_acid_mg',
 'riboflavin': 'riboflavin_mg',
 'thiamin': 'thiamin_mg',
 'vitamin_a': 'vitamin_a_IU',
 'vitamin_a_rae': 'vitamin_a_rae_mcg',
 'carotene_alpha': 'carotene_alpha_mcg',
 'carotene_beta': 'carotene_beta_mcg',
 'cryptoxanthin_beta': 'cryptoxanthin_beta_mcg',
 'lutein_zeaxanthin': 'lutein_zeaxanthin_mcg',
 'vitamin_b12': 'vitamin_b12_mcg',
 'vitamin_b6': 'vitamin_b6_mg',
 'vitamin_c': 'vitamin_c_mg',
 'vitamin_d': 'vitamin_d_IU',
 'vitamin_e': 'vitamin_e_mg',
 'tocopherol_alpha': 'tocopherol_alpha_mg',
 'vitamin_k': 'vitamin_k_mcg',
 'calcium': 'calcium_mg',
 'copper': 'copper_mg',
 'irom': 'irom_mg',
 'magnesium': 'magnesium_mg',
 'manganese': 'manganese_mg',
 'phosphorous': 'phospho

In [118]:
nutrition.rename(columns=mapper, inplace=True)
nutrition.head()

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


### 4. Final clean up of the units in the data fields
* we can use .replace() with a regex pattern



In [121]:
# replace( 'what to replace', 'what replace with', regex=True)
nutrition.replace('[a-z A-Z]', '', regex=True, inplace=True)
nutrition.head()

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100,381,0.1,,0,9.0,0.4,0.0,0.0,0.0,...,0.05,0.009,0.016,0.025,0.0,0.0,0.09,0.0,0.0,8.32
"Nuts, pecans",100,691,72.0,6.2,0,0.0,40.5,22.0,0.0,1.167,...,71.97,6.18,40.801,21.614,0.0,0.0,1.49,0.0,0.0,3.52
"Eggplant, raw",100,25,0.2,,0,2.0,6.9,22.0,0.0,0.649,...,0.18,0.034,0.016,0.076,0.0,0.0,0.66,0.0,0.0,92.3
"Teff, uncooked",100,367,2.4,0.4,0,12.0,13.1,0.0,0.0,3.363,...,2.38,0.449,0.589,1.071,0.0,0.0,2.37,0.0,0.0,8.82
"Sherbet, orange",100,144,2.0,1.2,1,46.0,7.7,4.0,0.0,0.063,...,2.0,1.16,0.53,0.08,1.0,0.0,0.4,0.0,0.0,66.1


In [124]:
nutrition.dtypes.value_counts()

object    73
int64      2
dtype: int64

In [125]:
#so 73 of the 75 columns are still objects ... we want integers
nutrition = nutrition.astype(float)

In [126]:
#now check
nutrition.dtypes.value_counts()

float64    75
dtype: int64

In [128]:
nutrition.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
serving_size_g,8789.0,100.000000,0.000000,100.0,100.00,100.00,100.00,100.0
calories,8789.0,226.283878,169.862001,0.0,91.00,191.00,337.00,902.0
total_fat_g,8789.0,10.556855,15.818247,0.0,1.00,5.10,14.00,100.0
saturated_fat_g,7199.0,4.192791,6.877009,0.1,0.70,2.20,5.00,96.0
cholesterol_mg,8789.0,38.723063,117.358944,0.0,0.00,2.00,65.00,3100.0
...,...,...,...,...,...,...,...,...
alcohol_g,8789.0,0.117522,1.600127,0.0,0.00,0.00,0.00,42.5
ash_g,8789.0,1.717408,2.967751,0.0,0.77,1.17,2.02,99.8
caffeine_mg,8789.0,2.608829,82.070233,0.0,0.00,0.00,0.00,5714.0
theobromine_mg,8789.0,4.695642,60.962608,0.0,0.00,0.00,0.00,2634.0


In [129]:
#test a calcualation
nutrition.calories.sum()

1988809.0

## Flitering in 2D

In [131]:
nutrition.shape

(8789, 75)

In [132]:
#we know that we have 8789 rows in the df
#what if we wanted to find octopus info??

nutrition.filter(like='Octopus', axis=0) #axis=0 because we want to look for rows

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Octopus (Alaska Native),100.0,56.0,0.8,0.2,41.0,0.0,0.0,0.0,0.0,2.0,...,0.8,0.2,0.0,0.2,41.0,0.0,1.5,0.0,0.0,84.0


In [133]:
# This is CASE SENSITIVE ..
nutrition.filter(like='octopus', axis=0)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Mollusks, raw, common, octopus",100.0,82.0,1.0,0.2,48.0,230.0,65.0,16.0,0.0,2.1,...,1.04,0.227,0.162,0.239,48.0,0.0,1.6,0.0,0.0,80.25
"Mollusks, moist heat, cooked, common, octopus",100.0,164.0,2.1,0.5,96.0,460.0,81.0,24.0,0.0,3.78,...,2.08,0.453,0.324,0.477,96.0,0.0,3.2,0.0,0.0,60.5


In [134]:
# We can filter with regex to get ALL octopus
nutrition.filter(regex='[Oo]ctopus', axis=0)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Octopus (Alaska Native),100.0,56.0,0.8,0.2,41.0,0.0,0.0,0.0,0.0,2.0,...,0.8,0.2,0.0,0.2,41.0,0.0,1.5,0.0,0.0,84.0
"Mollusks, raw, common, octopus",100.0,82.0,1.0,0.2,48.0,230.0,65.0,16.0,0.0,2.1,...,1.04,0.227,0.162,0.239,48.0,0.0,1.6,0.0,0.0,80.25
"Mollusks, moist heat, cooked, common, octopus",100.0,164.0,2.1,0.5,96.0,460.0,81.0,24.0,0.0,3.78,...,2.08,0.453,0.324,0.477,96.0,0.0,3.2,0.0,0.0,60.5


In [135]:
# we can use the regex case insensitive modifier --> (?i)
nutrition.filter(regex='(?i)Octopus', axis=0)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Octopus (Alaska Native),100.0,56.0,0.8,0.2,41.0,0.0,0.0,0.0,0.0,2.0,...,0.8,0.2,0.0,0.2,41.0,0.0,1.5,0.0,0.0,84.0
"Mollusks, raw, common, octopus",100.0,82.0,1.0,0.2,48.0,230.0,65.0,16.0,0.0,2.1,...,1.04,0.227,0.162,0.239,48.0,0.0,1.6,0.0,0.0,80.25
"Mollusks, moist heat, cooked, common, octopus",100.0,164.0,2.1,0.5,96.0,460.0,81.0,24.0,0.0,3.78,...,2.08,0.453,0.324,0.477,96.0,0.0,3.2,0.0,0.0,60.5


In [138]:
# We can filter along both rows AND columns
# we can also use the items func in filter to select certian columns of interest

nutrition.filter(regex='(?i)Octopus', axis=0).filter(items=['serving_size_g', 'total_fat_g', 'calories'], axis=1)

Unnamed: 0_level_0,serving_size_g,total_fat_g,calories
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Octopus (Alaska Native),100.0,0.8,56.0
"Mollusks, raw, common, octopus",100.0,1.0,82.0
"Mollusks, moist heat, cooked, common, octopus",100.0,2.1,164.0


In [142]:
# You could also use the .loc to add column filtering

nutrition.filter(regex='(?i)Octopus', axis=0).loc[:, ['serving_size_g', 'total_fat_g', 'calories']]

Unnamed: 0_level_0,serving_size_g,total_fat_g,calories
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Octopus (Alaska Native),100.0,0.8,56.0
"Mollusks, raw, common, octopus",100.0,1.0,82.0
"Mollusks, moist heat, cooked, common, octopus",100.0,2.1,164.0


## Sorting
* sort by column

In [144]:
nutrition.sort_values(by=['calories'], ascending=False).head()

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Fat, mutton tallow",100.0,902.0,100.0,47.0,102.0,0.0,79.8,0.0,0.0,0.0,...,100.0,47.3,40.6,7.8,102.0,0.0,0.0,0.0,0.0,0.0
"Fish oil, salmon",100.0,902.0,100.0,20.0,485.0,0.0,0.0,0.0,0.0,0.0,...,100.0,19.872,29.037,40.324,485.0,0.0,0.0,0.0,0.0,0.0
Lard,100.0,902.0,100.0,39.0,95.0,0.0,49.7,0.0,0.0,0.0,...,100.0,39.2,45.1,11.2,95.0,0.0,0.0,0.0,0.0,0.0
"Fat, beef tallow",100.0,902.0,100.0,50.0,109.0,0.0,79.8,0.0,0.0,0.0,...,100.0,49.8,41.8,4.0,109.0,0.0,0.0,0.0,0.0,0.0
"Fish oil, cod liver",100.0,902.0,100.0,23.0,570.0,0.0,0.0,0.0,0.0,0.0,...,100.0,22.608,46.711,22.541,570.0,0.0,0.0,0.0,0.0,0.0


In [146]:
#sorting by 2 columns
nutrition.sort_values(by=['cholesterol_mg', 'sodium_mg'], ascending=False).head()

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Veal, braised, cooked, brain, variety meats and by-products",100.0,136.0,9.6,2.2,3100.0,156.0,0.0,3.0,0.0,2.43,...,9.63,2.18,1.74,1.49,3100.0,0.0,1.4,0.0,0.0,76.89
"Beef, simmered, cooked, brain, variety meats and by-products",100.0,151.0,11.0,2.4,3100.0,108.0,490.9,5.0,0.0,3.62,...,10.53,2.394,1.882,1.632,3100.0,0.0,1.46,0.0,0.0,74.86
"Beef, raw, brain, variety meats and by-products",100.0,143.0,10.0,2.3,3010.0,126.0,0.0,3.0,0.0,3.55,...,10.3,2.3,1.89,1.586,3010.0,0.0,1.51,0.0,0.0,76.29
"Lamb, soaked and fried, cooked, brains, imported, New Zealand",100.0,154.0,11.0,1.4,2559.0,101.0,0.0,0.0,0.0,2.995,...,10.92,1.365,4.168,0.999,2559.0,0.0,3.39,0.0,0.0,73.11
"Pork, braised, cooked, brain, variety meats and by-products, fresh",100.0,138.0,9.5,2.2,2552.0,91.0,0.0,4.0,0.0,3.33,...,9.51,2.15,1.72,1.47,2552.0,0.0,1.4,0.0,0.0,75.88


In [147]:
# can also pass a py list to the ascending parm
# it will sort in order of the columns passed in list

nutrition.sort_values(by=['cholesterol_mg', 'sodium_mg'], ascending=[False, True]).head()

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, simmered, cooked, brain, variety meats and by-products",100.0,151.0,11.0,2.4,3100.0,108.0,490.9,5.0,0.0,3.62,...,10.53,2.394,1.882,1.632,3100.0,0.0,1.46,0.0,0.0,74.86
"Veal, braised, cooked, brain, variety meats and by-products",100.0,136.0,9.6,2.2,3100.0,156.0,0.0,3.0,0.0,2.43,...,9.63,2.18,1.74,1.49,3100.0,0.0,1.4,0.0,0.0,76.89
"Beef, raw, brain, variety meats and by-products",100.0,143.0,10.0,2.3,3010.0,126.0,0.0,3.0,0.0,3.55,...,10.3,2.3,1.89,1.586,3010.0,0.0,1.51,0.0,0.0,76.29
"Lamb, soaked and fried, cooked, brains, imported, New Zealand",100.0,154.0,11.0,1.4,2559.0,101.0,0.0,0.0,0.0,2.995,...,10.92,1.365,4.168,0.999,2559.0,0.0,3.39,0.0,0.0,73.11
"Pork, braised, cooked, brain, variety meats and by-products, fresh",100.0,138.0,9.5,2.2,2552.0,91.0,0.0,4.0,0.0,3.33,...,9.51,2.15,1.72,1.47,2552.0,0.0,1.4,0.0,0.0,75.88


## Between Method
* select a row or column
* then we can apply between criteria

In [148]:
# grab the calories between 20 and 60
nutrition.calories.between(20,60)

name
Cornstarch                                                                                            False
Nuts, pecans                                                                                          False
Eggplant, raw                                                                                          True
Teff, uncooked                                                                                        False
Sherbet, orange                                                                                       False
                                                                                                      ...  
Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round    False
Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Beef, raw, all grades, 

In [149]:
# Note it returns a bool where it meets the criteria

In [150]:
# If we now pass that as a Boolean Mask for the whole df
nutrition[nutrition.calories.between(20,60)]

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Eggplant, raw",100.0,25.0,0.2,,0.0,2.0,6.9,22.0,0.0,0.649,...,0.18,0.034,0.016,0.076,0.0,0.0,0.66,0.0,0.0,92.30
"Cauliflower, raw",100.0,25.0,0.3,0.1,0.0,30.0,44.3,57.0,0.0,0.507,...,0.28,0.130,0.034,0.031,0.0,0.0,0.76,0.0,0.0,92.07
"Taro leaves, raw",100.0,42.0,0.7,0.2,0.0,3.0,12.8,126.0,0.0,1.513,...,0.74,0.151,0.060,0.307,0.0,0.0,1.92,0.0,0.0,85.66
"PACE, Picante Sauce",100.0,25.0,0.0,,0.0,781.0,0.0,0.0,0.0,0.000,...,0.00,0.000,0.000,0.000,0.0,0.0,3.85,0.0,0.0,89.90
"Mango nectar, canned",100.0,51.0,0.1,,0.0,5.0,1.5,7.0,0.0,0.080,...,0.06,0.014,0.022,0.011,0.0,0.0,0.08,0.0,0.0,86.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Beverages, added calcium, high vitamin C, greater than 3% fruit juice, reduced sugar, Fruit flavored drink",100.0,29.0,0.4,,0.0,25.0,0.1,2.0,0.0,0.000,...,0.37,0.000,0.000,0.000,0.0,0.0,0.06,0.0,0.0,93.00
"Ruby Red grapefruit juice blend (grapefruit, grape, apple), with added vitamin C, bottled, OCEAN SPRAY",100.0,44.0,0.1,,0.0,8.0,0.0,0.0,0.0,0.000,...,0.10,0.008,0.008,0.014,0.0,0.0,0.27,0.0,0.0,88.60
"Beverages, prepared with water, frozen concentrate, with juice and pulp, breakfast type, Orange drink",100.0,45.0,0.0,,0.0,10.0,0.0,0.0,0.0,0.253,...,0.00,0.001,0.001,0.001,0.0,0.0,0.48,0.0,0.0,88.08
"Apple juice, diluted with 3 volume water without added ascorbic acid, unsweetened, frozen concentrate",100.0,47.0,0.1,,0.0,7.0,1.8,0.0,0.0,0.038,...,0.10,0.018,0.002,0.031,0.0,0.0,0.32,0.0,0.0,87.90


In [151]:
# this returns 1190 possibilities, if we wanted a sample of that
nutrition[nutrition.calories.between(20,60)].sample(5)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Pear nectar, without added ascorbic acid, canned",100.0,60.0,0.0,,0.0,4.0,2.0,1.0,0.0,0.128,...,0.01,0.001,0.003,0.003,0.0,0.0,0.1,0.0,0.0,84.01
"Broccoli, with salt, drained, boiled, cooked, chopped, frozen",100.0,28.0,0.1,,0.0,260.0,16.2,56.0,0.0,0.458,...,0.12,0.018,0.008,0.055,0.0,0.0,0.71,0.0,0.0,90.72
"Beverages, Mango Peach, V8 SPLASH Juice Drinks",100.0,33.0,0.0,,0.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,91.6
"Beverages, prepared with water, frozen concentrate, with juice and pulp, breakfast type, Orange drink",100.0,45.0,0.0,,0.0,10.0,0.0,0.0,0.0,0.253,...,0.0,0.001,0.001,0.001,0.0,0.0,0.48,0.0,0.0,88.08
"Tomato products, with mushrooms, sauce, canned",100.0,35.0,0.1,,0.0,452.0,0.0,9.0,0.0,1.265,...,0.13,0.017,0.01,0.05,0.0,0.0,2.02,0.0,0.0,87.97


## Min, Max & Idx[MinMax]


In [152]:
nutrition.max()

serving_size_g      100.0
calories            902.0
total_fat_g         100.0
saturated_fat_g      96.0
cholesterol_mg     3100.0
                    ...  
alcohol_g            42.5
ash_g                99.8
caffeine_mg        5714.0
theobromine_mg     2634.0
water_g             100.0
Length: 75, dtype: float64

In [154]:
#this gives a series with all the cols max values
# we can do the same for rows
nutrition.max(axis=1)

name
Cornstarch                                                                                            381.0
Nuts, pecans                                                                                          691.0
Eggplant, raw                                                                                         229.0
Teff, uncooked                                                                                        429.0
Sherbet, orange                                                                                       144.0
                                                                                                      ...  
Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round    311.0
Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand    246.0
Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand    277.0
Beef, raw, all grades, 

In [155]:
# What food has the most potassium?
nutrition.potassium_mg.max()

16500.0

In [156]:
#what does this correspond to?
nutrition.potassium_mg.idxmax()

'Leavening agents, cream of tartar'

In [158]:
# look at the potassium
nutrition.potassium_mg.sort_values(ascending=False)

name
Leavening agents, cream of tartar                              16500.0
Leavening agents, low-sodium, baking powder                    10100.0
Parsley, freeze-dried                                           6300.0
Beverages, unsweetened, decaffeinated, instant, tea             6040.0
Beverages, powder, unsweetened, instant, tea                    6040.0
                                                                ...   
CAMPBELL'S CHUNKY Soups, Beef with White and Wild Rice Soup        0.0
Alcoholic beverage, Pinot Gris (Grigio), white, table, wine        0.0
Cloudberries, raw (Alaska Native)                                  0.0
Oil, all purpose, soy ( partially hydrogenated), industrial        0.0
Oil, sunflower, mid-oleic, industrial                              0.0
Name: potassium_mg, Length: 8789, dtype: float64

In [161]:
# want to find the items with a potassium to sodium ratio of 16
# the replace(0, 1) allows to not have a divide/multiply by 0 error
pot_sod = ((nutrition.potassium_mg.replace(0,1)) / (nutrition.sodium_mg.replace(0,1))).sort_values(ascending=False)
pot_sod.head()

name
Peanut flour, low fat                                         1358.0
Nuts, raw, pistachio nuts                                     1025.0
Beverages, reduced calorie, with whitener, instant, coffee     909.0
Soybeans, raw, mature seeds                                    898.5
Soy meal, raw, defatted                                        830.0
dtype: float64

In [162]:
# Now if we want to look at foods with the ratio in a range between 14-18
pot_sod.between(14, 18)

name
Peanut flour, low fat                                         False
Nuts, raw, pistachio nuts                                     False
Beverages, reduced calorie, with whitener, instant, coffee    False
Soybeans, raw, mature seeds                                   False
Soy meal, raw, defatted                                       False
                                                              ...  
Seasoning mix, original, chili, dry                           False
Salt, table                                                   False
PACE, Dry Taco Seasoning Mix                                  False
Seasoning mix, coriander & annatto, sazon, dry                False
Leavening agents, baking soda                                 False
Length: 8789, dtype: bool

In [163]:
# this again gives us a Bool Mask we can apply to the entire series
pot_sod[pot_sod.between(14, 18)].sample(10)

name
Lima beans, without salt, drained, boiled, cooked, baby, frozen, immature seeds    14.172414
Juice, with added ascorbic acid and calcium, grape and pear blend, apple           17.800000
Asparagus, drained, boiled, cooked                                                 16.000000
Sapodilla, raw                                                                     16.083333
Turnip greens, unprepared, frozen                                                  15.333333
Cereals, without salt, prepared with water, plain, original, MALT-O-MEAL           14.000000
Nuts, canned (liquid expressed from grated meat and water), coconut milk           16.923077
Peppers, without salt, drained, boiled, chopped, frozen, green, sweet              18.000000
Potatoes, dry form, flakes without milk, dehydrated, mashed                        14.259740
Beverages, high vitamin C, greater than 3% juice, fruit juice drink                15.250000
dtype: float64

## nlargest & nsmallest

In [164]:
nutrition.nlargest(10, columns='potassium_mg')

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Leavening agents, cream of tartar",100.0,258.0,0.0,,0.0,52.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,36.8,0.0,0.0,1.7
"Leavening agents, low-sodium, baking powder",100.0,97.0,0.4,0.1,0.0,90.0,0.0,0.0,0.0,0.0,...,0.4,0.073,0.006,0.121,0.0,0.0,46.4,0.0,0.0,6.2
"Parsley, freeze-dried",100.0,271.0,5.2,,0.0,391.0,0.0,194.0,0.0,10.4,...,5.2,0.0,0.0,0.0,0.0,0.0,19.12,0.0,0.0,2.0
"Beverages, powder, unsweetened, instant, tea",100.0,315.0,0.0,,0.0,72.0,118.3,103.0,0.0,10.8,...,0.0,0.0,0.0,0.0,0.0,0.0,16.04,5714.0,71.0,5.09
"Beverages, unsweetened, decaffeinated, instant, tea",100.0,315.0,0.0,,0.0,72.0,118.3,103.0,0.0,10.8,...,0.0,0.0,0.0,0.0,0.0,0.0,16.04,169.0,11.0,5.09
"Spices, dried, chervil",100.0,237.0,3.9,0.2,0.0,83.0,0.0,274.0,0.0,5.4,...,3.9,0.169,1.399,1.8,0.0,0.0,16.6,0.0,0.0,7.2
"Spices, dried, coriander leaf",100.0,279.0,4.8,0.1,0.0,211.0,97.1,274.0,0.0,10.707,...,4.78,0.115,2.232,0.328,0.0,0.0,14.08,0.0,0.0,7.3
"Celery flakes, dried",100.0,319.0,2.1,0.6,0.0,1435.0,122.3,107.0,0.0,4.64,...,2.1,0.555,0.405,1.035,0.0,0.0,13.9,0.0,0.0,9.0
"Beverages, powder, regular, instant, coffee",100.0,353.0,0.5,0.2,0.0,37.0,101.9,0.0,0.0,28.173,...,0.5,0.197,0.041,0.196,0.0,0.0,8.8,3142.0,0.0,3.1
"Beverages, half the caffeine, regular, instant, coffee",100.0,352.0,0.5,0.2,0.0,37.0,101.9,0.0,0.0,28.173,...,0.5,0.197,0.041,0.196,0.0,0.0,8.8,1571.0,0.0,3.1


In [165]:
# this returns the entire data frame worth of columns, if we just wanted to see th posassium
nutrition.nlargest(10, columns='potassium_mg').potassium_mg

name
Leavening agents, cream of tartar                         16500.0
Leavening agents, low-sodium, baking powder               10100.0
Parsley, freeze-dried                                      6300.0
Beverages, powder, unsweetened, instant, tea               6040.0
Beverages, unsweetened, decaffeinated, instant, tea        6040.0
Spices, dried, chervil                                     4740.0
Spices, dried, coriander leaf                              4466.0
Celery flakes, dried                                       4388.0
Beverages, powder, regular, instant, coffee                3535.0
Beverages, half the caffeine, regular, instant, coffee     3535.0
Name: potassium_mg, dtype: float64

In [168]:
#can also just access the potassium column first
nutrition.potassium_mg.nlargest(10)

name
Leavening agents, cream of tartar                         16500.0
Leavening agents, low-sodium, baking powder               10100.0
Parsley, freeze-dried                                      6300.0
Beverages, powder, unsweetened, instant, tea               6040.0
Beverages, unsweetened, decaffeinated, instant, tea        6040.0
Spices, dried, chervil                                     4740.0
Spices, dried, coriander leaf                              4466.0
Celery flakes, dried                                       4388.0
Beverages, powder, regular, instant, coffee                3535.0
Beverages, half the caffeine, regular, instant, coffee     3535.0
Name: potassium_mg, dtype: float64

In [170]:
# can also sort with in the nlargest and nsmllest methods

nutrition.nsmallest(10, ['sodium_mg', 'calories', 'total_fat_g'])

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beverages, well, tap, water",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,99.9
"Water, NAYA, non-carbonated, bottled",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,100.0
"Beverages, decaffeinated, brewed, green, tea",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.93
"Beverages, EVIAN, non-carbonated, bottled, water",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,99.97
"Beverages, CALISTOGA, non-carbonated, bottled, water",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,100.0
"Beverages, DANNON, non-carbonated, bottled, water",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.98
"Sweetener, herbal extract powder from Stevia leaf",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Beverages, CRYSTAL GEYSER, non-carbonated, bottled, water",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,100.0
"Babyfood, without added fluoride., GERBER, bottled, water",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.9
"Beverages, AQUAFINA, PEPSI, non-carbonated, bottled, water",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.97


# Challenge exercise

Find the 10 foods that have the most Vitamin B12. What do they have in common?

In [171]:
nutrition.head(3)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100.0,381.0,0.1,,0.0,9.0,0.4,0.0,0.0,0.0,...,0.05,0.009,0.016,0.025,0.0,0.0,0.09,0.0,0.0,8.32
"Nuts, pecans",100.0,691.0,72.0,6.2,0.0,0.0,40.5,22.0,0.0,1.167,...,71.97,6.18,40.801,21.614,0.0,0.0,1.49,0.0,0.0,3.52
"Eggplant, raw",100.0,25.0,0.2,,0.0,2.0,6.9,22.0,0.0,0.649,...,0.18,0.034,0.016,0.076,0.0,0.0,0.66,0.0,0.0,92.3


In [None]:
nutrition.filter(like='vitamin', axis=1).head(3)

Unnamed: 0_level_0,vitamin_a_IU,vitamin_a_rae_mcg,vitamin_b12_mcg,vitamin_b6_mg,vitamin_c_mg,vitamin_d_IU,vitamin_e_mg,vitamin_k_mcg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Cornstarch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Nuts, pecans",56.0,3.0,0.0,0.21,1.1,0.0,1.4,3.5
"Eggplant, raw",23.0,1.0,0.0,0.084,2.2,0.0,0.3,3.5


In [174]:
nutrition.vitamin_b12_mcg.nlargest(10)
#nutrition.loc[:, vitamin_b12_mcg].nlargest(10)

name
Mollusks, moist heat, cooked, mixed species, clam                                   98.89
Beef, boiled, cooked, variety meats and by-products liver, imported, New Zealand    96.00
Lamb, raw, liver, variety meats and by-products                                     90.05
Lamb, pan-fried, cooked, liver, variety meats and by-products                       85.70
Veal, braised, cooked, liver, variety meats and by-products                         84.60
Beef, raw, liver, variety meats and by-products, imported, New Zealand              84.50
Beef, pan-fried, cooked, liver, variety meats and by-products                       83.13
Lamb, braised, cooked, kidneys, variety meats and by-products                       78.90
Lamb, braised, cooked, liver, variety meats and by-products                         76.50
Veal, pan-fried, cooked, liver, variety meats and by-products                       72.50
Name: vitamin_b12_mcg, dtype: float64

---
Isolate the foods in the dataset that contain, or are based on, eggplant. Which of them has the most sodium?

In [179]:
nutrition.filter(regex='(?i)eggplant', axis=0)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Eggplant, raw",100.0,25.0,0.2,,0.0,2.0,6.9,22.0,0.0,0.649,...,0.18,0.034,0.016,0.076,0.0,0.0,0.66,0.0,0.0,92.3
"Eggplant, pickled",100.0,49.0,0.7,0.1,0.0,1674.0,11.9,20.0,0.0,0.66,...,0.7,0.14,0.063,0.294,0.0,0.0,1.73,0.0,0.0,86.9
"Eggplant, with salt, drained, boiled, cooked",100.0,33.0,0.2,,0.0,239.0,9.4,14.0,0.0,0.6,...,0.23,0.044,0.02,0.093,0.0,0.0,1.13,0.0,0.0,89.67
"Eggplant, without salt, drained, boiled, cooked",100.0,35.0,0.2,,0.0,1.0,9.4,14.0,0.0,0.6,...,0.23,0.044,0.02,0.093,0.0,0.0,0.54,0.0,0.0,89.67


In [178]:
nutrition.filter(regex='(?i)eggplant', axis=0).sodium_mg.nlargest(1)

name
Eggplant, pickled    1674.0
Name: sodium_mg, dtype: float64

---
Select a slice of the dataframe that contains 4 random rows and 2 random columns.

In [180]:
nutrition.sample(4, axis=0).sample(2, axis=1)

Unnamed: 0_level_0,thiamin_mg,fatty_acids_total_trans_mg
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Crackers, reduced fat, cheese",1.087,0.0
"Flan, prepared with whole milk, dry mix, caramel custard",0.031,12.0
"Pancakes, prepared from recipe, blueberry",0.195,56.0
"Cheese, cream",0.023,101.0


# Challenge Exercise

Remove all the food items that contain at least one NaN. Do this in a way that modifies the dataframe, i.e. the changes stick.

How many food items remain after the exclusions?

In [182]:
nutrition_modified = nutrition.dropna()
nutrition_modified

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Nuts, pecans",100.0,691.0,72.0,6.2,0.0,0.0,40.5,22.0,0.0,1.167,...,71.97,6.180,40.801,21.614,0.0,0.0,1.49,0.0,0.0,3.52
"Teff, uncooked",100.0,367.0,2.4,0.4,0.0,12.0,13.1,0.0,0.0,3.363,...,2.38,0.449,0.589,1.071,0.0,0.0,2.37,0.0,0.0,8.82
"Sherbet, orange",100.0,144.0,2.0,1.2,1.0,46.0,7.7,4.0,0.0,0.063,...,2.00,1.160,0.530,0.080,1.0,0.0,0.40,0.0,0.0,66.10
"Cauliflower, raw",100.0,25.0,0.3,0.1,0.0,30.0,44.3,57.0,0.0,0.507,...,0.28,0.130,0.034,0.031,0.0,0.0,0.76,0.0,0.0,92.07
"Taro leaves, raw",100.0,42.0,0.7,0.2,0.0,3.0,12.8,126.0,0.0,1.513,...,0.74,0.151,0.060,0.307,0.0,0.0,1.92,0.0,0.0,85.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Beef, raw, all grades, trimmed to 0"" fat, separable lean and fat, boneless, top round roast, round",100.0,125.0,3.5,1.4,62.0,54.0,64.5,4.0,0.0,6.422,...,3.50,1.353,1.554,0.244,62.0,0.0,1.11,0.0,0.0,72.51
"Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand",100.0,206.0,8.9,3.9,109.0,50.0,0.0,0.0,0.0,7.680,...,8.86,3.860,3.480,0.520,109.0,0.0,1.60,0.0,0.0,59.95
"Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand",100.0,277.0,23.0,12.0,78.0,39.0,0.0,1.0,0.0,6.550,...,22.74,11.570,8.720,0.980,78.0,0.0,0.92,0.0,0.0,59.80
"Beef, raw, all grades, trimmed to 0"" fat, separable lean only, boneless, eye of round roast, round",100.0,121.0,3.0,1.1,60.0,53.0,64.2,4.0,0.0,6.720,...,3.04,1.086,1.266,0.233,60.0,0.0,1.10,0.0,0.0,73.43


From the remaining records, isolate those that have between 20 and 40 mg of Vitamin C per 100 g serving. Of these foods, which one is the least caloric, i.e. has the minimum calories? 

In [183]:
nutrition_modified.vitamin_c_mg.between(20,40)

name
Nuts, pecans                                                                                          False
Teff, uncooked                                                                                        False
Sherbet, orange                                                                                       False
Cauliflower, raw                                                                                      False
Taro leaves, raw                                                                                      False
                                                                                                      ...  
Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round    False
Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Beef, raw, all grades, 

In [184]:
nutrition_modified[nutrition_modified.vitamin_c_mg.between(20,40)]

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Broccoli, raw, chinese",100.0,30.0,0.8,0.1,0.0,7.0,26.5,104.0,0.0,0.459,...,0.76,0.116,0.053,0.347,0.0,0.0,0.83,0.0,0.0,92.55
"Broccoli raab, cooked",100.0,33.0,0.5,0.1,0.0,56.0,33.6,71.0,0.0,2.015,...,0.52,0.057,0.030,0.150,0.0,0.0,1.11,0.0,0.0,91.41
"Horseradish, prepared",100.0,48.0,0.7,0.1,0.0,420.0,6.5,57.0,0.0,0.386,...,0.69,0.090,0.130,0.339,0.0,0.0,1.76,0.0,0.0,85.08
"Spices, white, pepper",100.0,296.0,2.1,0.6,0.0,5.0,0.0,10.0,0.0,0.212,...,2.12,0.626,0.789,0.616,0.0,0.0,1.59,0.0,0.0,11.42
"Dandelion greens, raw",100.0,45.0,0.7,0.2,0.0,76.0,35.3,27.0,0.0,0.806,...,0.70,0.170,0.014,0.306,0.0,0.0,1.80,0.0,0.0,85.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Beverages, fortified, ready to drink, milk and soy based, chocolate drink",100.0,101.0,1.7,0.4,4.0,63.0,1.6,42.0,42.0,1.688,...,1.69,0.422,1.154,0.057,4.0,0.0,0.97,0.0,13.0,75.82
"Cereals ready-to-eat, Peanut Butter, Multi Grain CHEERIOS, GENERAL MILLS",100.0,390.0,6.2,0.9,0.0,447.0,0.0,714.0,0.0,17.890,...,6.19,0.890,2.700,2.290,0.0,0.0,3.09,0.0,0.0,2.00
"Infant formula, with ARA and DHA, ready-to-feed, ADVANCE, NATURAL CARE, SIMILAC, ABBOTT NUTRITION",100.0,78.0,4.2,2.5,2.0,34.0,6.0,29.0,29.0,3.902,...,4.24,2.544,0.402,0.842,2.0,0.0,0.75,0.0,0.0,85.07
"Cereals ready-to-eat, KELLOGG'S SMART START Strong Heart Antioxidants Cereal, KELLOGG",100.0,371.0,1.5,0.3,0.0,398.0,14.9,800.0,781.0,40.000,...,1.50,0.300,0.100,0.500,0.0,0.0,1.70,0.0,0.0,2.50


In [185]:
nutrition_modified[nutrition_modified.vitamin_c_mg.between(20,40)].calories.nsmallest(1)

name
Asparagus, with salt, drained, boiled, cooked, frozen    18.0
Name: calories, dtype: float64

---
How many food items in the dataframe have Vitamin C levels of between 2 and 3 standard deviations (inclusive) above the mean?

In [189]:
#Mean of vitamin c
m = nutrition_modified.vitamin_c_mg.mean()
m

5.553368523406029

In [190]:
# 2 & 3 stdev's above mean
m2sd = m + (nutrition_modified.vitamin_c_mg.mean()*2)
m3sd = m + (nutrition_modified.vitamin_c_mg.mean()*3)

In [191]:
print(m, m2sd, m3sd)

5.553368523406029 16.660105570218086 22.213474093624114


In [193]:
nutrition_modified.vitamin_c_mg.between(m2sd, m3sd)

name
Nuts, pecans                                                                                          False
Teff, uncooked                                                                                        False
Sherbet, orange                                                                                       False
Cauliflower, raw                                                                                      False
Taro leaves, raw                                                                                      False
                                                                                                      ...  
Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round    False
Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Beef, raw, all grades, 

In [194]:
nutrition_modified[nutrition_modified.vitamin_c_mg.between(m2sd, m3sd)]

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Spices, white, pepper",100.0,296.0,2.1,0.6,0.0,5.0,0.0,10.0,0.0,0.212,...,2.12,0.626,0.789,0.616,0.0,0.0,1.59,0.0,0.0,11.42
"Spices, caraway seed",100.0,333.0,15.0,0.6,0.0,17.0,24.7,10.0,0.0,3.606,...,14.59,0.620,7.125,3.272,0.0,0.0,5.87,0.0,0.0,9.87
"Spices, ground, mace",100.0,475.0,32.0,9.5,0.0,80.0,0.0,76.0,0.0,1.350,...,32.38,9.510,11.170,4.390,0.0,0.0,2.23,0.0,0.0,8.17
"Spices, celery seed",100.0,392.0,25.0,2.2,0.0,160.0,24.7,10.0,0.0,3.060,...,25.27,2.190,15.930,3.720,0.0,0.0,9.27,0.0,0.0,6.04
"Spices, fennel seed",100.0,345.0,15.0,0.5,0.0,88.0,0.0,0.0,0.0,6.050,...,14.87,0.480,9.910,1.690,0.0,0.0,8.22,0.0,0.0,8.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Cereals ready-to-eat, Peanut Butter, Multi Grain CHEERIOS, GENERAL MILLS",100.0,390.0,6.2,0.9,0.0,447.0,0.0,714.0,0.0,17.890,...,6.19,0.890,2.700,2.290,0.0,0.0,3.09,0.0,0.0,2.00
"Beans, with salt, drained, boiled, cooked, sprouted, mature seeds, navy",100.0,78.0,0.8,0.1,0.0,250.0,0.0,106.0,0.0,1.263,...,0.81,0.098,0.060,0.468,0.0,0.0,1.09,0.0,0.0,76.02
"Beverages, 3-2-1 plan, Ready-To-Drink, High Protein Shake, Meal replacement, SLIMFAST",100.0,58.0,2.9,0.5,4.0,72.0,0.0,39.0,41.0,2.390,...,2.88,0.450,2.100,0.290,4.0,0.0,0.61,0.0,0.0,88.33
"Potatoes, as purchased, frozen, salt not added in processing, all types, french fried",100.0,150.0,4.7,0.9,0.0,23.0,0.0,35.0,0.0,2.038,...,4.66,0.940,2.960,0.265,0.0,0.0,1.68,0.0,0.0,66.61


In [195]:
results = nutrition_modified[nutrition_modified.vitamin_c_mg.between(m2sd, m3sd)]
results.describe()

Unnamed: 0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,...,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,100.0,251.815068,5.30137,1.14726,14.041096,329.609589,12.006849,268.212329,197.171233,8.662021,...,5.299521,1.14663,2.259541,1.258144,14.041096,0.0,2.585342,0.253425,4.609589,36.77911
std,0.0,148.63002,6.698667,1.821905,58.909384,284.029779,19.955812,296.868047,291.280321,8.270251,...,6.717493,1.824336,3.421132,1.441806,58.909384,0.0,1.760547,1.27475,24.411804,37.463868
min,100.0,17.0,0.2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.15,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2
25%,100.0,91.25,0.7,0.1,0.0,37.25,0.0,24.5,0.0,1.184,...,0.7075,0.14375,0.0925,0.2,0.0,0.0,1.065,0.0,0.0,2.5225
50%,100.0,308.0,3.35,0.6,0.0,302.0,7.55,107.5,0.0,3.503,...,3.36,0.6,1.024,0.715,0.0,0.0,2.4,0.0,0.0,8.835
75%,100.0,385.75,5.85,1.0,0.0,537.5,15.975,373.75,337.0,16.7,...,5.82,1.00375,2.3545,1.6975,0.0,0.0,3.6925,0.0,0.0,76.275
max,100.0,532.0,34.0,9.7,400.0,1202.0,194.4,1333.0,1292.0,37.0,...,33.98,9.684,18.963,8.282,400.0,0.0,9.27,11.0,198.0,94.79


In [196]:
results.vitamin_c_mg.describe()

count    146.000000
mean      19.714384
std        1.572084
min       16.700000
25%       18.400000
50%       20.000000
75%       21.000000
max       22.200000
Name: vitamin_c_mg, dtype: float64