In [7]:
# Silence noisy library warnings so that cell output stays readable.
import warnings

# FutureWarning: notices about upcoming API changes in the libraries used here.
warnings.simplefilter(action='ignore', category=FutureWarning)

# SettingWithCopyWarning: pandas' chained-assignment warning.
# The class lives in the public `pandas.errors` module on pandas >= 1.5;
# older releases only exposed it through the private `pandas.core.common`
# path, which no longer works on current pandas. Try the public location first.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # Neither location provides the class in this pandas build,
        # so there is nothing to silence.
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import pandas as pd
import numpy as np
import seaborn as sns
import sklearn

# Set global seed for consistency.
# Kept as a named constant so later cells can pass the same value to
# `random_state=` parameters instead of repeating the literal.
SEED = 42
np.random.seed(SEED)

# Import support functions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Import eval functions
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# matplotlib defaults
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.cluster import KMeans
from sklearn.svm import SVC

# Print multiple outputs per cell
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

## Data Load and Inspection

In [10]:
# Read the food CSV from the working directory into a DataFrame.
# NOTE(review): the 'unicode_escape' codec decodes bytes as Latin-1 and also
# interprets backslash escape sequences found in the data. If the file is
# really cp1252/latin-1 text, naming that codec directly would be safer.
# Confirm against the source file before changing.
data = pd.read_csv(
    "food.csv",
    encoding="unicode_escape",
)

In [11]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Public Food Key      1449 non-null   object 
 1   Food Profile ID      1449 non-null   int64  
 2   Derivation           1449 non-null   object 
 3   Food Name            1449 non-null   object 
 4   Food Description     1449 non-null   object 
 5   Sampling Details     1449 non-null   object 
 6   Nitrogen Factor      1449 non-null   float64
 7   Fat Factor           1449 non-null   float64
 8   Specific Gravity     1449 non-null   float64
 9   Analysed Portion     1449 non-null   object 
 10  Unanalysed Portion   1449 non-null   object 
 11  Classification       1449 non-null   int64  
 12  Classification Name  1449 non-null   object 
dtypes: float64(3), int64(2), object(8)
memory usage: 147.3+ KB


In [14]:
# Peek at five randomly drawn rows
data.sample(n=5)

Unnamed: 0,Public Food Key,Food Profile ID,Derivation,Food Name,Food Description,Sampling Details,Nitrogen Factor,Fat Factor,Specific Gravity,Analysed Portion,Unanalysed Portion,Classification,Classification Name
538,F006333,6000,Analysed,"Passionfruit, raw",Pulp and seeds of purple skinned passionfruit.,The majority of nutrient data were derived fro...,6.25,0.8,0.0,41% (pulp & seeds),59% (skin & stalk),16504,"Other tropical and subtropical fruit, inedible..."
754,F009434,8018,Recipe,"Veal, steak, boneless, leg, untrimmed, grilled...",Cooked veal from the hind portion of the carca...,These nutrient data were derived using a recip...,0.0,0.0,0.0,"99.2% (lean meat 97.0%, internal/external fat ...",0.9% (bone/gristle),18104,Veal
49,F009582,87,Analysed,"Wine, white, chardonnay",Alcoholic beverage produced by fermentation of...,"Proximates, organic acids and iodine were deri...",6.25,0.8,0.99,100%,0%,29202,"Wines, white (including sparkling varieties)"
1380,F006250,13287,Analysed,"Onion, spring, fresh, raw",Small white immature onion with long thin gree...,The majority of nutrient data were derived fro...,6.25,0.8,0.0,"78% (bulb, stem)","22% (tops, base, roots)",24802,"Onion, leek and garlic"
141,F001548,1961,Recipe,"Bread, organic, toasted","Commercially prepared, sliced or unsliced brea...",These nutrient data were derived using a recip...,0.0,0.0,0.0,100%,0%,12203,"Breads, and bread rolls, white, not stated as ..."


In [16]:
# Count missing values in each column
data.isna().sum(axis=0)

Public Food Key        0
Food Profile ID        0
Derivation             0
Food Name              0
Food Description       0
Sampling Details       0
Nitrogen Factor        0
Fat Factor             0
Specific Gravity       0
Analysed Portion       0
Unanalysed Portion     0
Classification         0
Classification Name    0
dtype: int64