# 1. Using `PCA` to reduce the dimensionality of the data

Using the USDA National Nutrient Database (`nndb_flat.csv`), we will use PCA to reduce the dimensionality of the data. We will then train a model on the reduced data and compare its performance with that of a model trained on the original data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Read USDA National Nutrient Database for Standard Reference
# The relative path is resolved against the kernel's current working directory.
nndb = pd.read_csv("nndb_flat.csv")

In [2]:
# Preview the first rows to check that the file was parsed as expected
nndb.head()

Unnamed: 0,ID,FoodGroup,ShortDescrip,Descrip,CommonName,MfgName,ScientificName,Energy_kcal,Protein_g,Fat_g,...,Folate_USRDA,Niacin_USRDA,Riboflavin_USRDA,Thiamin_USRDA,Calcium_USRDA,Copper_USRDA,Magnesium_USRDA,Phosphorus_USRDA,Selenium_USRDA,Zinc_USRDA
0,1001,Dairy and Egg Products,"BUTTER,WITH SALT","Butter, salted",,,,717.0,0.85,81.11,...,0.0075,0.002625,0.026154,0.004167,0.02,0.0,0.004762,0.034286,0.018182,0.008182
1,1002,Dairy and Egg Products,"BUTTER,WHIPPED,WITH SALT","Butter, whipped, with salt",,,,717.0,0.85,81.11,...,0.0075,0.002625,0.026154,0.004167,0.02,1.8e-05,0.004762,0.032857,0.018182,0.004545
2,1003,Dairy and Egg Products,"BUTTER OIL,ANHYDROUS","Butter oil, anhydrous",,,,876.0,0.28,99.48,...,0.0,0.000188,0.003846,0.000833,0.003333,1e-06,0.0,0.004286,0.0,0.000909
3,1004,Dairy and Egg Products,"CHEESE,BLUE","Cheese, blue",,,,353.0,21.4,28.74,...,0.09,0.0635,0.293846,0.024167,0.44,4.4e-05,0.054762,0.552857,0.263636,0.241818
4,1005,Dairy and Egg Products,"CHEESE,BRICK","Cheese, brick",,,,371.0,23.24,29.68,...,0.05,0.007375,0.27,0.011667,0.561667,2.7e-05,0.057143,0.644286,0.263636,0.236364


In [3]:
# (number of rows, number of columns) of the loaded table
nndb.shape

(8618, 45)

In [5]:
# List every column name; as the cell's last expression it is displayed
# without needing print().
nndb.columns

Index(['ID', 'FoodGroup', 'ShortDescrip', 'Descrip', 'CommonName', 'MfgName',
       'ScientificName', 'Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g',
       'Sugar_g', 'Fiber_g', 'VitA_mcg', 'VitB6_mg', 'VitB12_mcg', 'VitC_mg',
       'VitE_mg', 'Folate_mcg', 'Niacin_mg', 'Riboflavin_mg', 'Thiamin_mg',
       'Calcium_mg', 'Copper_mcg', 'Iron_mg', 'Magnesium_mg', 'Manganese_mg',
       'Phosphorus_mg', 'Selenium_mcg', 'Zinc_mg', 'VitA_USRDA', 'VitB6_USRDA',
       'VitB12_USRDA', 'VitC_USRDA', 'VitE_USRDA', 'Folate_USRDA',
       'Niacin_USRDA', 'Riboflavin_USRDA', 'Thiamin_USRDA', 'Calcium_USRDA',
       'Copper_USRDA', 'Magnesium_USRDA', 'Phosphorus_USRDA', 'Selenium_USRDA',
       'Zinc_USRDA'],
      dtype='object')


In [4]:
# Distinct food-group labels, in order of first appearance
nndb.loc[:, "FoodGroup"].unique()

array(['Dairy and Egg Products', 'Spices and Herbs', 'Baby Foods',
       'Fats and Oils', 'Poultry Products', 'Soups, Sauces, and Gravies',
       'Sausages and Luncheon Meats', 'Breakfast Cereals', 'Snacks',
       'Fruits and Fruit Juices', 'Pork Products',
       'Vegetables and Vegetable Products', 'Nut and Seed Products',
       'Beef Products', 'Beverages', 'Finfish and Shellfish Products',
       'Legumes and Legume Products', 'Lamb, Veal, and Game Products',
       'Baked Products', 'Sweets', 'Cereal Grains and Pasta',
       'Fast Foods', 'Meals, Entrees, and Side Dishes',
       'American Indian/Alaska Native Foods', 'Restaurant Foods'],
      dtype=object)

In [9]:
# Names of the columns that contain at least one missing value:
# reduce the NaN mask to one flag per column, then keep the flagged names.
[name for name, has_missing in nndb.isna().any().items() if has_missing]

['CommonName', 'MfgName', 'ScientificName']

Let's create our binary target `y`:
* `y = 1` if `FoodGroup` is processed food
* `y = 0` otherwise

### Prepare the data for PCA

In [15]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

# Restrict the table to the nutrient measurement columns used for PCA;
# ID, FoodGroup and the descriptive text columns are left out.
df = nndb.loc[:, [
    # energy and macronutrients
    "Energy_kcal", "Protein_g", "Fat_g", "Carb_g", "Sugar_g", "Fiber_g",
    # vitamins
    "VitA_mcg", "VitB6_mg", "VitB12_mcg", "VitC_mg", "VitE_mg",
    "Folate_mcg", "Niacin_mg", "Riboflavin_mg", "Thiamin_mg",
    # minerals
    "Calcium_mg", "Copper_mcg", "Iron_mg", "Magnesium_mg", "Manganese_mg",
    "Phosphorus_mg", "Selenium_mcg", "Zinc_mg",
    # *_USRDA columns
    "VitA_USRDA", "VitB6_USRDA", "VitB12_USRDA", "VitC_USRDA", "VitE_USRDA",
    "Folate_USRDA", "Niacin_USRDA", "Riboflavin_USRDA", "Thiamin_USRDA",
    "Calcium_USRDA", "Copper_USRDA", "Magnesium_USRDA", "Phosphorus_USRDA",
    "Selenium_USRDA", "Zinc_USRDA",
]]

# check correlation between features
# (pairwise correlation of every selected column with every other one)
corr = df.corr()

# plot correlation matrix with plotly
fig = go.Figure(data=go.Heatmap(
    z=corr.values,
    x=corr.columns,
    y=corr.columns,
    colorscale='Viridis',
    # Pin the colour range to the full correlation interval so the
    # -1 ... 1 ticks below always sit on the colour bar.
    zmin=-1,
    zmax=1,
    colorbar=dict(
        # `titleside` is a deprecated attribute that newer plotly releases
        # reject; the side is set on the title object instead.
        title=dict(text="Correlation", side="right"),
        tickmode="array",
        tickvals=[-1, -0.5, 0, 0.5, 1],
        ticktext=["-1", "-0.5", "0", "0.5", "1"]
    )
))
fig.update_layout(
    title="Correlation Matrix",
    xaxis_title="Features",
    yaxis_title="Features",
    # square canvas so the matrix cells are drawn square
    width=800,
    height=800
)
fig.show()

In [14]:
# scale the data
# Standardise each feature so that columns measured in large units
# do not dominate the principal components.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# apply PCA
# No n_components is given, so every component is kept and the full
# variance spectrum can be inspected below.
pca = PCA()
pca.fit(df_scaled)

# plotly chart of explained variance
variance_ratio = pca.explained_variance_ratio_
# 1-based component numbers, shared by both traces
component_numbers = np.arange(1, len(variance_ratio) + 1)

fig = go.Figure()
# share of the total variance carried by each individual component
fig.add_trace(go.Scatter(
    x=component_numbers,
    y=variance_ratio,
    mode="lines+markers",
    name="Explained Variance"
))
# running total: variance retained when keeping the first k components
fig.add_trace(go.Scatter(
    x=component_numbers,
    y=np.cumsum(variance_ratio),
    mode="lines+markers",
    name="Cumulative Explained Variance"
))
fig.update_layout(title="Explained Variance", xaxis_title="Number of Principal Components", yaxis_title="Explained Variance")
fig.show()

In [28]:
# how are PCA and old features related

# Loadings table: one row per original feature, one column per principal
# component. pca.components_ is laid out as (n_components, n_features), so it
# is transposed to put the features on the rows. The column labels are sized
# from the number of fitted components, which keeps this cell working when
# PCA is fit with fewer components than there are features.
components = pd.DataFrame(
    pca.components_.T,
    index=df.columns,
    columns=[f"PCA{i + 1}" for i in range(pca.components_.shape[0])],
)

components

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA29,PCA30,PCA31,PCA32,PCA33,PCA34,PCA35,PCA36,PCA37,PCA38
Energy_kcal,0.09326,-0.107176,0.14546,-0.149726,0.368909,-0.05759,0.103243,0.29062,-0.092876,-0.356498,...,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0
Protein_g,0.110906,0.121813,0.149082,0.346201,0.034279,0.032767,-0.027833,-0.002977,0.109394,-0.20787,...,-1.890335e-16,3.163934e-17,-2.024497e-16,-2.9496310000000004e-17,-1.4561050000000002e-17,-1.299961e-16,-9.966250000000001e-17,8.332522000000001e-17,-8.091156000000001e-17,1.138998e-16
Fat_g,0.013142,-0.017774,0.136413,-0.076802,0.439537,0.074793,0.176196,0.200649,0.097304,-0.32427,...,-7.082541000000001e-17,5.1641040000000003e-17,-5.0685850000000005e-17,-2.697306e-17,2.675011e-18,-1.269275e-16,-2.0327790000000002e-17,-5.2306040000000004e-17,-5.836623e-17,4.511126e-17
Carb_g,0.098943,-0.207495,0.009074,-0.29374,-0.013825,-0.196217,-0.08455,0.189213,-0.314505,-0.046345,...,4.9824050000000004e-18,-6.472271e-17,4.127141e-17,5.193363e-18,-1.040506e-16,1.703335e-16,-1.012492e-16,-5.719756e-17,1.089477e-16,-2.597007e-16
Sugar_g,0.038806,-0.136635,-0.037849,-0.25127,0.016922,-0.109763,0.066932,0.173473,-0.485823,-0.081555,...,-1.399315e-16,6.266639e-17,-1.645957e-16,1.203319e-16,1.050764e-16,-5.2370970000000005e-17,4.7247600000000004e-17,1.116632e-16,-1.09803e-16,1.224727e-16
Fiber_g,0.112393,-0.141664,0.138136,-0.205899,-0.025736,-0.081862,-0.317901,-0.060312,0.007412,0.12416,...,-2.545333e-16,2.315262e-17,1.679275e-16,-1.284863e-17,-1.266821e-16,-1.922042e-16,-1.784844e-16,-1.229228e-16,1.569477e-16,-4.964511e-17
VitA_mcg,0.109097,0.338184,-0.063989,-0.247484,-0.036248,0.020855,0.086161,0.056885,0.055642,0.043153,...,0.2386066,-0.02272781,0.4128512,0.1181811,0.1130265,0.07935807,-0.1617691,-0.07458257,0.1203731,-0.3458059
VitB6_mg,0.240479,-0.083039,-0.123038,0.067578,0.009669,0.164281,-0.031202,-0.113926,-0.103492,0.058795,...,0.1838569,0.04574826,-0.2091523,-0.01665581,0.0266606,-0.09884051,-0.03856949,0.08843021,-0.4649074,-0.2420292
VitB12_mcg,0.147756,0.359417,-0.066177,-0.029753,0.014009,-0.018807,0.100115,-0.030154,-0.104117,0.031206,...,-0.01564267,-0.04547069,0.08845955,0.01366776,0.2354201,0.3123825,0.4257001,0.3899986,-0.03453583,-0.03002042
VitC_mg,0.070302,-0.05178,-0.086169,-0.118719,-0.218295,0.535416,-0.129593,0.179636,-0.01615,-0.145443,...,0.001018973,-0.002154311,-0.0001125124,0.01253663,0.00983595,-0.002001862,0.006980025,-0.006077908,0.001936782,0.001529845


In [33]:
# evaluate transformed data
# Project the scaled rows onto the principal components. Column labels follow
# the number of fitted components (not the number of input features), and the
# row labels of `df` are kept so the scores can be lined up with `nndb` later.
df_pca = pd.DataFrame(
    pca.transform(df_scaled),
    index=df.index,
    columns=[f"PCA{i + 1}" for i in range(pca.n_components_)],
)
df_pca

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA29,PCA30,PCA31,PCA32,PCA33,PCA34,PCA35,PCA36,PCA37,PCA38
0,-1.811468,0.244360,0.358148,-1.705050,3.521139,0.552613,1.680265,1.475408,0.708498,-1.883046,...,7.374200e-17,1.194560e-16,-1.670709e-16,4.711125e-17,2.105577e-16,-6.604401e-16,1.779894e-16,-4.974839e-16,-3.455326e-17,1.138719e-16
1,-1.806892,0.251895,0.361289,-1.722863,3.519338,0.545058,1.667116,1.477944,0.715387,-1.882137,...,1.188027e-16,1.107266e-16,-1.525643e-16,1.048057e-16,2.519964e-16,-6.678213e-16,1.344299e-16,-4.467314e-16,-1.315672e-16,1.328545e-16
2,-1.788169,0.230042,0.503071,-2.111154,4.540671,0.657578,1.953727,1.945396,0.788197,-2.520843,...,8.059669e-17,4.852664e-18,7.704801e-17,-3.371002e-16,1.269018e-16,-8.001026e-16,7.005380e-17,-5.066218e-16,-2.431743e-17,2.298996e-16
3,0.404461,0.116966,2.611367,0.508961,-0.618369,0.435269,2.658160,0.128548,-0.055579,-0.370366,...,-4.379706e-18,2.069746e-16,-3.855313e-17,-6.005457e-16,-7.509730e-17,-3.509069e-16,6.949657e-16,-1.236622e-16,-2.708181e-16,3.451692e-17
4,0.443959,0.118445,3.476585,0.350402,-0.985125,0.472345,3.301049,0.167361,-0.116149,-0.293110,...,-4.948216e-17,1.992606e-16,-9.482261e-17,-9.132349e-16,-4.026773e-16,-3.087704e-16,7.827304e-16,-8.572031e-17,-6.258026e-16,-9.276519e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8613,1.043757,2.526914,1.181932,1.661541,1.329422,0.652453,0.109771,1.632799,-0.502331,1.254252,...,-9.878928e-17,9.346454e-16,3.967358e-17,2.551977e-16,1.978661e-16,-4.088124e-16,-3.124614e-16,7.598761e-16,3.756268e-16,6.154342e-17
8614,-0.819215,0.819225,1.250838,1.359015,-0.718280,0.087358,0.093824,-0.152917,0.414429,0.289715,...,1.209364e-16,-1.559622e-16,-3.265877e-16,-1.676175e-16,-2.891745e-17,3.125738e-16,-4.580690e-17,1.623216e-16,-5.669426e-16,3.401538e-17
8615,-1.696311,-1.327648,-0.934428,-2.552526,-0.101841,-1.316227,0.172453,1.181518,-3.083094,-0.295371,...,-4.960412e-16,-2.794399e-17,-6.630556e-16,4.563741e-16,3.365243e-16,1.710148e-16,2.745209e-16,4.177815e-16,-2.729727e-16,-2.026512e-16
8616,0.699677,0.080787,3.054021,-0.119478,0.448425,0.499595,-3.114522,-0.731586,1.882250,1.269459,...,1.028739e-15,-1.781433e-16,-2.744409e-16,2.916365e-16,5.778587e-16,3.614657e-16,6.199224e-16,1.465224e-16,2.428175e-16,-2.767416e-17


### Using PCA-transformed data to train a model

We can train several models with different numbers of features and evaluate the performance of each.

Let's use 5, 10 and 20 PCs to predict our target.

# 2. Using `SelectKBest` to select the best features

In [None]:
# https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
from sklearn.feature_selection import SelectKBest, f_classif

# 3. Using `feature_importances_` to select the best features
