In [1]:
import pandas as pd
import numpy as np

In [2]:
# some python lists
names = ['Olga', 'Andrew', 'Brian', 'Telulah', 'Nicole', 'Tilda']
ages = [29, 21, 45, 23, 39, 46]
married = [False, True, True, True, False, True]

In [3]:
# pandas series
ser = pd.Series(names, name='name')

In [6]:
# pandas dataframe
df = pd.DataFrame({'name': names, 'age': ages, 'married': married})
df

Unnamed: 0,name,age,married
0,Olga,29,False
1,Andrew,21,True
2,Brian,45,True
3,Telulah,23,True
4,Nicole,39,False
5,Tilda,46,True


In [8]:
#df is a 2 dimensional object (rows, columns)
df.shape

(6, 3)

In [9]:
#each column in a df is a series with its own dtype
# so you need to call .dtypes NOT dtype
df.dtypes

name       object
age         int64
married      bool
dtype: object

In [12]:
# since we created 3 python lists above, we can create a df using a dict format

pd.DataFrame({'name':names, 'age': ages, 'married':married})

Unnamed: 0,name,age,married
0,Olga,29,False
1,Andrew,21,True
2,Brian,45,True
3,Telulah,23,True
4,Nicole,39,False
5,Tilda,46,True


In [13]:
# the length of all lists MUST be equal!!!

## Info
### historically DataFrame-only (`Series.info()` was only added in pandas 1.4)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     6 non-null      object
 1   age      6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 230.0+ bytes


In [16]:
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Columns: 3 entries, name to married
dtypes: bool(1), int64(1), object(1)
memory usage: 230.0+ bytes


In [17]:
df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     6 non-null      object
 1   age      6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 557.0 bytes


## Nutrition Data (from kaggle)
* we will use this for the exercises that follow

In [18]:
dataurl = 'https://andybek.com/pandas-nutrition'

In [19]:
nutrition = pd.read_csv(dataurl)
nutrition.head()

Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [22]:
nutrition.info(verbose = False, memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8789 entries, 0 to 8788
Columns: 77 entries, Unnamed: 0 to water
dtypes: int64(3), object(74)
memory usage: 39.2 MB


---
### clean up and remove dups
* note the unnamed column

In [23]:
# Drop the leftover 'Unnamed: 0' column (it just repeats the row numbers).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run is a no-op rather than a KeyError.
nutrition = nutrition.drop(columns=['Unnamed: 0'], errors='ignore')
nutrition.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [25]:
# Use the food name as the row index.
# Guarded and reassigned (no inplace=True) so the cell can be re-run: once
# 'name' has been moved into the index it is no longer a column, and calling
# set_index('name') again would raise KeyError.
if 'name' in nutrition.columns:
    nutrition = nutrition.set_index('name')
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [27]:
## Taking random sample from a df

nutrition.sample(random_state=12)

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Thuringer, pork, beef, summer sausage, cervelat",100 g,362,30g,12g,74mg,1300.00 mg,78.9 mg,2.00 mcg,0.00 mcg,4.310 mg,...,30.43 g,11.510 g,12.970 g,1.200 g,74.00 mg,0.0 g,3.63 g,0.00 mg,0.00 mg,45.18 g


In [28]:
# if we set the random state parm across different users, we should get the same results
# if we want random each time:
nutrition.sample()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Bread, oat bran, reduced-calorie",100 g,201,3.2g,0.4g,0,459.00 mg,14.6 mg,81.00 mcg,47.00 mcg,3.763 mg,...,3.20 g,0.445 g,0.684 g,1.670 g,0.00 mg,0.0 g,1.50 g,0.00 mg,0.00 mg,46.00 g


In [29]:
# getting 3 random samples
nutrition.sample(n=3)

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, raw, Aust. marble score 4/5, separable lean and fat, boneless, tenderloin steak/roast, loin, Wagyu, imported, Australian",100 g,206,14g,6.1g,60mg,63.00 mg,0,0,0,0,...,14.09 g,6.105 g,6.167 g,0.531 g,60.00 mg,0.0 g,1.02 g,0.00 mg,0.00 mg,65.20 g
"Pork, roasted, heated, separable lean and fat, bone-in, shank, ham with natural juices, cured",100 g,191,11g,3.5g,74mg,801.00 mg,91.0 mg,3.00 mcg,0.00 mcg,7.190 mg,...,10.93 g,3.545 g,5.120 g,1.351 g,74.00 mg,0.0 g,2.98 g,0.00 mg,0.00 mg,62.94 g
"Cereals ready-to-eat, and Raisins, Honey, Wheat, QUAKER 100% Natural Granola with Oats, QUAKER",100 g,412,10g,1.1g,2mg,54.00 mg,31.8 mg,33.00 mcg,0.00 mcg,2.120 mg,...,10.38 g,1.120 g,5.870 g,2.390 g,2.00 mg,0.0 g,1.90 g,0.00 mg,0.00 mg,3.39 g


In [30]:
# if we wanted to sample a % of the data, use the frac parm
nutrition.sample(frac = 0.01) 
#this would be 1% of the data

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Soup, ready-to-serve, canned, chicken and vegetable",100 g,33,0.7g,0.1g,3mg,229.00 mg,7.7 mg,13.00 mcg,0.00 mcg,0.906 mg,...,0.73 g,0.121 g,0.158 g,0.134 g,3.00 mg,0.0 g,0.93 g,0.00 mg,0.00 mg,91.68 g
"Fast foods, and sausage, cheese, with egg, english muffin",100 g,286,18g,6.9g,163mg,548.00 mg,103.3 mg,57.00 mcg,41.00 mcg,3.192 mg,...,18.10 g,6.928 g,6.765 g,2.772 g,163.00 mg,0.0 g,2.26 g,0.00 mg,0.00 mg,48.82 g
"Margarine-like, soybean oil and butter, margarine-butter blend",100 g,727,80g,14g,12mg,719.00 mg,6.5 mg,2.00 mcg,0.00 mcg,0.022 mg,...,80.32 g,14.198 g,30.292 g,24.170 g,12.00 mg,0.0 g,1.53 g,0.00 mg,0.00 mg,17.07 g
"Mayonnaise, with olive oil, reduced fat",100 g,361,40g,5.4g,33mg,800.00 mg,9.0 mg,0.00 mcg,0.00 mcg,0.010 mg,...,40.00 g,5.370 g,29.542 g,4.006 g,33.00 mg,0.0 g,2.05 g,0.00 mg,0.00 mg,57.58 g
"Chicken, skin, BBQ, rotisserie, broiler",100 g,378,35g,9.3g,120mg,335.00 mg,39.3 mg,9.00 mcg,0.00 mcg,7.082 mg,...,35.15 g,9.333 g,14.464 g,4.018 g,120.00 mg,0.0 g,1.52 g,0.00 mg,0.00 mg,47.81 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wheat, hard red spring",100 g,329,1.9g,0.3g,0,2.00 mg,31.2 mg,43.00 mcg,0.00 mcg,5.710 mg,...,1.92 g,0.314 g,0.303 g,0.765 g,0.00 mg,0.0 g,1.89 g,0.00 mg,0.00 mg,12.76 g
"Waffle, microwaved, ready-to-heat, frozen, buttermilk",100 g,289,9.4g,2.1g,16mg,663.00 mg,0,64.00 mcg,50.00 mcg,6.730 mg,...,9.40 g,2.057 g,4.864 g,1.615 g,16.00 mg,0,3.04 g,0,0,36.48 g
"CAMPBELL'S CHUNKY Soups, Grilled Sirloin Steak with Hearty Vegetables Soup",100 g,51,0.8g,0.4g,4mg,363.00 mg,0,0,0,0,...,0.82 g,0.408 g,0,0,4.00 mg,0,1.16 g,0,0,87.00 g
"Snacks, taro chips",100 g,498,25g,6.4g,0,342.00 mg,44.3 mg,20.00 mcg,0.00 mcg,0.515 mg,...,24.90 g,6.430 g,4.430 g,12.880 g,0.00 mg,0.0 g,2.70 g,0.00 mg,0.00 mg,2.00 g


## Axis

### axis=0 is rows
### axis=1 is columns

In [31]:
nutrition.axes

[Index(['Cornstarch', 'Nuts, pecans', 'Eggplant, raw', 'Teff, uncooked',
        'Sherbet, orange', 'Cauliflower, raw', 'Taro leaves, raw',
        'Lamb, raw, ground', 'Cheese, camembert', 'Vegetarian fillets',
        ...
        'Beef, braised, cooked, all grades, trimmed to 1/8" fat, separable lean and fat, flat half, brisket',
        'Beef, raw, select, trimmed to 1/8" fat, separable lean only, lip-on, boneless, rib eye steak/roast',
        'Beef, raw, choice, trimmed to 1/8" fat, separable lean only, lip-on, boneless, rib eye steak/roast',
        'Oil, uses similar to 95 degree hard butter, confection fat, palm kernel (hydrogenated), industrial',
        'Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round steak, round',
        'Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round',
        'Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand',
      

In [33]:
nutrition.index[3]

'Teff, uncooked'

In [35]:
nutrition.columns[50]

'proline'

## Setting Index

In [36]:
# set the index to the name columns
# nutrition.set_index(['name'], inplace=True)
# nutrition.head()

# this was done already above

In [37]:
# To check if there are any duplicates in the index you can use the parm verify_integrity=True in set_index()
# the default is False, so you ARE allowed dups in the index
# if you set it to True and it errors ... you know that you have dups in the index

## Extracting Data
## .loc

* can use BOTH rows and columns

In [38]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [39]:
nutrition.loc['Eggplant, raw']

serving_size       100 g
calories              25
total_fat           0.2g
saturated_fat        NaN
cholesterol            0
                  ...   
alcohol            0.0 g
ash               0.66 g
caffeine         0.00 mg
theobromine      0.00 mg
water            92.30 g
Name: Eggplant, raw, Length: 75, dtype: object

In [45]:
# note this returns a series, what if we want the whole row
nutrition.loc[['Eggplant, raw'], :]

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g


In [44]:
#just the calories
nutrition.loc['Eggplant, raw', 'calories']

25

In [46]:
nutrition.loc[['Eggplant, raw'], ['calories']]

Unnamed: 0_level_0,calories
name,Unnamed: 1_level_1
"Eggplant, raw",25


In [48]:
# Slicing rows and columns same time
nutrition.loc['Eggplant, raw':'Sherbet, orange', 'calories':'cholesterol']

Unnamed: 0_level_0,calories,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Eggplant, raw",25,0.2g,,0
"Teff, uncooked",367,2.4g,0.4g,0
"Sherbet, orange",144,2g,1.2g,1mg


In [51]:
# getting multiple values that ARE NOT consecutive --> double brackets --> passing a list in

# df.loc[['row or list of rows'], ['column1', 'column5', 'column8']]

nutrition.loc[
                ['Raspberries, raw', 'Blackberries, raw'],
                ['protein', 'vitamin_b6', 'water']
    
]

Unnamed: 0_level_0,protein,vitamin_b6,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Raspberries, raw",1.20 g,0.055 mg,85.75 g
"Blackberries, raw",1.39 g,0.030 mg,88.15 g


## Extracting Data
## .iloc

* 0 indexed
* can use BOTH rows and columns

In [52]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [53]:
# getting the 4th item
nutrition.iloc[3]

serving_size      100 g
calories            367
total_fat          2.4g
saturated_fat      0.4g
cholesterol           0
                  ...  
alcohol               0
ash              2.37 g
caffeine              0
theobromine           0
water            8.82 g
Name: Teff, uncooked, Length: 75, dtype: object

In [54]:
# if we want the rows at integer positions 4, 6 and 9
# (iloc is 0-indexed, so these are the 5th, 7th and 10th rows)
nutrition.iloc[[4, 6, 9], :]

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
"Taro leaves, raw",100 g,42,0.7g,0.2g,0,3.00 mg,12.8 mg,126.00 mcg,0.00 mcg,1.513 mg,...,0.74 g,0.151 g,0.060 g,0.307 g,0.00 mg,0.0 g,1.92 g,0.00 mg,0.00 mg,85.66 g
Vegetarian fillets,100 g,290,18g,2.8g,0,490.00 mg,82.0 mg,102.00 mcg,0.00 mcg,12.000 mg,...,18.00 g,2.849 g,4.376 g,9.332 g,0.00 mg,0.0 g,5.00 g,0.00 mg,0.00 mg,45.00 g


In [55]:
#if we wanted a certain slice of columns
nutrition.iloc[[4, 6, 9], 2:5]

Unnamed: 0_level_0,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Sherbet, orange",2g,1.2g,1mg
"Taro leaves, raw",0.7g,0.2g,0
Vegetarian fillets,18g,2.8g,0


In [57]:
# or certain individual columns
nutrition.iloc[[4, 6, 9], [2,5,8]]

Unnamed: 0_level_0,total_fat,sodium,folic_acid
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Sherbet, orange",2g,46.00 mg,0.00 mcg
"Taro leaves, raw",0.7g,3.00 mg,0.00 mcg
Vegetarian fillets,18g,490.00 mg,0.00 mcg


In [62]:
# if we wanted to see every other row

# A slice with step 2 picks positions 0, 2, 4, ... -- the same rows the
# hand-built True/False mask selected, without building a Python list as
# long as the frame.
nutrition.iloc[::2]

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
"Taro leaves, raw",100 g,42,0.7g,0.2g,0,3.00 mg,12.8 mg,126.00 mcg,0.00 mcg,1.513 mg,...,0.74 g,0.151 g,0.060 g,0.307 g,0.00 mg,0.0 g,1.92 g,0.00 mg,0.00 mg,85.66 g
"Cheese, camembert",100 g,300,24g,15g,72mg,842.00 mg,15.4 mg,62.00 mcg,0.00 mcg,0.630 mg,...,24.26 g,15.259 g,7.023 g,0.724 g,72.00 mg,0.0 g,3.68 g,0.00 mg,0.00 mg,51.80 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Beef, raw, select, trimmed to 1/8"" fat, separable lean only, lip-on, boneless, rib eye steak/roast",100 g,148,6.4g,2.3g,70mg,55.00 mg,49.4 mg,4.00 mcg,0.00 mcg,5.580 mg,...,6.41 g,2.313 g,2.830 g,0.396 g,70.00 mg,0.0 g,1.03 g,0.00 mg,0.00 mg,70.89 g
"Oil, uses similar to 95 degree hard butter, confection fat, palm kernel (hydrogenated), industrial",100 g,884,100g,94g,0,6.00 mg,0.2 mg,0.00 mcg,0.00 mcg,0.000 mg,...,100.00 g,93.701 g,0.257 g,0.000 g,0.00 mg,0.0 g,0.01 g,0.00 mg,0.00 mg,0.05 g
"Beef, raw, all grades, trimmed to 0"" fat, separable lean and fat, boneless, top round roast, round",100 g,125,3.5g,1.4g,62mg,54.00 mg,64.5 mg,4.00 mcg,0.00 mcg,6.422 mg,...,3.50 g,1.353 g,1.554 g,0.244 g,62.00 mg,0.0 g,1.11 g,0.00 mg,0.00 mg,72.51 g
"Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand",100 g,277,23g,12g,78mg,39.00 mg,0,1.00 mcg,0.00 mcg,6.550 mg,...,22.74 g,11.570 g,8.720 g,0.980 g,78.00 mg,0,0.92 g,0,0,59.80 g


## Single value access

* .at - by label (like loc)
* .iat - by position (like iloc)

In [63]:
nutrition.at['Nuts, pecans', 'calories']

691

In [64]:
nutrition.iat[2, 10]

'0.281 mg'

In [65]:
# .at & .iat do the SAME lookup as .loc & .iloc, BUT because they only ever access a single value, they are much faster

#we can time and compare

In [67]:
%timeit nutrition.loc['Nuts, pecans', 'calories']

7.22 µs ± 140 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [68]:
%timeit nutrition.at['Nuts, pecans', 'calories']

3.38 µs ± 25 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


## get_loc

-helps us get an integer location from the label

In [70]:
nutrition.columns.get_loc('vitamin_k')

26

In [71]:
# now we know that the column entitled vitamin_k is at position 26

---
# Practice Challenge
---

1. Randomly select 10 food items and assign the resulting dataframe to a new variable called *nutr_mini*.

In [72]:
# Draw 10 rows at random. No random_state is passed, so a different set of
# foods is selected every time this cell runs.
nutr_mini = nutrition.sample(n=10)
nutr_mini

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, grilled, cooked, select, trimmed to 1/8"" fat, separable lean and fat, porterhouse steak, short loin",100 g,263,17g,6.8g,79mg,63.00 mg,59.5 mg,6.00 mcg,0.00 mcg,5.304 mg,...,16.75 g,6.843 g,7.469 g,0.737 g,79.00 mg,0.0 g,1.03 g,0.00 mg,0.00 mg,56.87 g
"WORTHINGTON FriPats, unprepared, frozen",100 g,209,9.1g,1.4g,2mg,517.00 mg,0,0,0,4.700 mg,...,9.10 g,1.400 g,2.200 g,5.200 g,2.00 mg,0,2.40 g,0,0,56.80 g
"Fish, raw, Atlantic, wolffish",100 g,96,2.4g,0.4g,46mg,85.00 mg,0,5.00 mcg,0.00 mcg,2.133 mg,...,2.39 g,0.365 g,0.837 g,0.845 g,46.00 mg,0,1.16 g,0,0,79.90 g
"Fish, liver (Alaska Native), chinook, king, salmon",100 g,156,8g,,0,0,0,0,0,5.000 mg,...,8.00 g,0,0,0,0,0,1.30 g,0,0,69.80 g
"Cereals ready-to-eat, KASHI 7 Whole Grain Puffs",100 g,336,2.3g,0.5g,0,9.00 mg,31.8 mg,27.00 mcg,0.00 mcg,3.200 mg,...,2.30 g,0.463 g,0.580 g,1.053 g,0.00 mg,0.0 g,2.10 g,0.00 mg,0.00 mg,4.00 g
"McDONALD'S, NEWMAN'S OWN Creamy Caesar Dressing",100 g,319,32g,5.9g,35mg,851.00 mg,0,4.00 mcg,0,0.020 mg,...,31.50 g,5.860 g,7.750 g,16.300 g,35.00 mg,0,3.19 g,0,0,54.90 g
"Pork, braised, cooked, separable lean and fat, boneless, Leg sirloin tip roast",100 g,156,2.6g,0.8g,84mg,43.00 mg,106.6 mg,1.00 mcg,0.00 mcg,7.685 mg,...,2.56 g,0.791 g,1.022 g,0.474 g,84.00 mg,0.0 g,1.14 g,0.00 mg,0.00 mg,66.99 g
"Nuts, with salt added, plain, almond butter",100 g,614,56g,6.6g,0,227.00 mg,52.1 mg,53.00 mcg,0.00 mcg,3.155 mg,...,55.50 g,6.550 g,32.445 g,13.613 g,0.00 mg,0.0 g,3.09 g,0.00 mg,0.00 mg,1.64 g
"Frozen novelties, with low calorie sweetener, pop, ice type",100 g,24,0g,,0,10.00 mg,0.0 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.00 g,0.000 g,0.000 g,0.000 g,0.00 mg,0.0 g,0.06 g,0.00 mg,0.00 mg,94.03 g
"Cookies, with added fiber, reduced fat, commercially prepared, brownies",100 g,345,9.7g,2.8g,0,290.00 mg,13.9 mg,42.00 mcg,34.00 mcg,1.441 mg,...,9.68 g,2.766 g,3.432 g,2.581 g,0.00 mg,0.0 g,1.14 g,10.00 mg,97.00 mg,24.83 g


---
2. From *nutr_mini*, extract the **total_fat** and **cholesterol** columns for all rows. 

In [74]:
nutr_mini.loc[:, ['total_fat', 'cholesterol']]

Unnamed: 0_level_0,total_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Beef, grilled, cooked, select, trimmed to 1/8"" fat, separable lean and fat, porterhouse steak, short loin",17g,79mg
"WORTHINGTON FriPats, unprepared, frozen",9.1g,2mg
"Fish, raw, Atlantic, wolffish",2.4g,46mg
"Fish, liver (Alaska Native), chinook, king, salmon",8g,0
"Cereals ready-to-eat, KASHI 7 Whole Grain Puffs",2.3g,0
"McDONALD'S, NEWMAN'S OWN Creamy Caesar Dressing",32g,35mg
"Pork, braised, cooked, separable lean and fat, boneless, Leg sirloin tip roast",2.6g,84mg
"Nuts, with salt added, plain, almond butter",56g,0
"Frozen novelties, with low calorie sweetener, pop, ice type",0g,0
"Cookies, with added fiber, reduced fat, commercially prepared, brownies",9.7g,0


---
3. Extract all the columns from **vitamin_b12** to the end, for the first, second, and third rows.

In [78]:
nutr_mini.columns.get_loc('vitamin_b12')

20

In [86]:
# First three rows, every column from vitamin_b12 to the end.
# The start column is looked up by name instead of hard-coding its position
# (20), so the slice stays correct if the column order ever changes.
nutr_mini.iloc[:3, nutr_mini.columns.get_loc('vitamin_b12'):]

Unnamed: 0_level_0,vitamin_b12,vitamin_b6,vitamin_c,vitamin_d,vitamin_e,tocopherol_alpha,vitamin_k,calcium,copper,irom,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, grilled, cooked, select, trimmed to 1/8"" fat, separable lean and fat, porterhouse steak, short loin",2.07 mcg,0.572 mg,0.0 mg,5.00 IU,0.21 mg,0.21 mg,1.6 mcg,20.00 mg,0.068 mg,3.20 mg,...,16.75 g,6.843 g,7.469 g,0.737 g,79.00 mg,0.0 g,1.03 g,0.00 mg,0.00 mg,56.87 g
"WORTHINGTON FriPats, unprepared, frozen",1.90 mcg,0.900 mg,0.0 mg,0,0,0,0,97.00 mg,0,2.80 mg,...,9.10 g,1.400 g,2.200 g,5.200 g,2.00 mg,0,2.40 g,0,0,56.80 g
"Fish, raw, Atlantic, wolffish",2.03 mcg,0.400 mg,0.0 mg,0,0,0,0,6.00 mg,0.029 mg,0.09 mg,...,2.39 g,0.365 g,0.837 g,0.845 g,46.00 mg,0,1.16 g,0,0,79.90 g


---
4. Get the calories for the third food in *nutr_mini* using an attribute-based approach that is faster than .loc or .iloc. 

In [83]:
# Address the third food by position (row 2) rather than by name: nutr_mini
# comes from an unseeded random sample, so a hard-coded label raises KeyError
# as soon as the sample changes. .iat is the fast single-value positional
# accessor; the column position is resolved from its label.
nutr_mini.iat[2, nutr_mini.columns.get_loc('calories')]

96

## Numeric cleanup

### note a lot of the data has the units in the cells

In [88]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


## these values are considered as strings
## we need to convert these to numeric values so we can perform operations/analysis

## use the astype() method
* first need to remove the units from each value --> .replace()
* can change entire column or df

In [91]:
df_mini = nutrition.iloc[:6, :1]
df_mini

Unnamed: 0_level_0,serving_size
name,Unnamed: 1_level_1
Cornstarch,100 g
"Nuts, pecans",100 g
"Eggplant, raw",100 g
"Teff, uncooked",100 g
"Sherbet, orange",100 g
"Cauliflower, raw",100 g


In [92]:
df_mini.replace(to_replace='100 g', value=100)

Unnamed: 0_level_0,serving_size
name,Unnamed: 1_level_1
Cornstarch,100
"Nuts, pecans",100
"Eggplant, raw",100
"Teff, uncooked",100
"Sherbet, orange",100
"Cauliflower, raw",100


In [None]:
# The same call can also be written with positional arguments: replace(to_replace, value)
df_mini.replace('100 g', 100)
# Limitation: this only matches the exact string '100 g', so it does not generalise to other amounts or units

## Regex

### Regular expressions (regex): you can look up and test the specifics of regex syntax at
* regex101.com

### \s will find white space

* so `\sg` will find a whitespace character followed by the letter g

In [93]:
# Putting that into the replace method

# .replace('pattern_to_be_replaced', 'what_to_replace_with', regex=True)

# Raw string (r'...') so that \s reaches the regex engine untouched; in a plain
# string literal '\s' is an invalid escape sequence and Python warns about it.
df_mini.replace(r'\sg', '', regex=True)

Unnamed: 0_level_0,serving_size
name,Unnamed: 1_level_1
Cornstarch,100
"Nuts, pecans",100
"Eggplant, raw",100
"Teff, uncooked",100
"Sherbet, orange",100
"Cauliflower, raw",100


In [94]:
# This works well for a single column whose values all share the same unit and spacing.
# The full nutrition table mixes several units (g, mg, mcg) with inconsistent spacing.

## Put the units in the header column labels

### 1. Remove the numerics so we are left with just the unit labels

## the regex to grab just the units (the letters) is
* `[a-zA-Z]`
## to negate that and match everything BUT the units (all the numbers), add a ^ inside the brackets
* `[^a-zA-Z]`

In [97]:
# Strip every non-letter character so only the unit text is left in each string cell.
# Only a preview is displayed here, so take head() first and run the regex on
# five rows instead of the whole table; the displayed result is the same.
nutrition.head().replace('[^a-zA-Z]', '', regex=True)

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,g,381,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Nuts, pecans",g,691,g,g,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Eggplant, raw",g,25,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Teff, uncooked",g,367,g,g,,mg,mg,,,mg,...,g,g,g,g,,,g,,,g
"Sherbet, orange",g,144,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [100]:
## Note that calories is still numeric because it didn't have any units initially
## It was an integer, not a string, initially
## so cast everything to a string first

# astype(str) can turn missing values into the literal text 'nan' (pandas-version
# dependent), which the letters-only regex would keep and which could then be
# mistaken for a unit. Masking with the original notna() keeps them as real NaN.
units = (
    nutrition
    .astype(str)
    .replace('[^a-zA-Z]', '', regex=True)
    .where(nutrition.notna())
)
units.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,g,,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Nuts, pecans",g,,g,g,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Eggplant, raw",g,,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Teff, uncooked",g,,g,g,,mg,mg,,,mg,...,g,g,g,g,,,g,,,g
"Sherbet, orange",g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


### 2. How to get the unit to column label header

### some NaN's, some empties, some mix of the two
* Can't pick a single row to use
* Need to find the mode of each column
    * Recall the mode is the value that appears the most

In [101]:
# Most frequent value in each column (column-wise, missing values ignored);
# the arguments below are just the defaults spelled out.
units.mode(axis=0, dropna=True)

Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [102]:
# Keep the per-column most frequent unit string so it can be used for the header labels.
headers = units.mode(axis=0, dropna=True)
headers

Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
