In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor



In [2]:
# Load the input table; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/raw_data_cleaned.csv')

In [3]:
# The first column is loaded under the name 'Unnamed: 0'; give it a readable name.
# NOTE(review): presumably this column holds the food names - confirm against the CSV.
df = df.rename(columns={'Unnamed: 0': 'Food Name'})

In [4]:
# Display the column names to verify the rename and see which fields are available.
df.columns

Index(['Food Name', 'Water (g)', 'Energy (kal)', 'Protein (g)', 'lipid (g)',
       'Carbohydrate (g)', 'Fiber (g)', 'Sugars (g)', 'Ash (g)', 'Ca (mg)',
       'Fe (mg)', 'Mg (mg)', 'P (mg)', 'K (mg)', 'Na (mg)', 'Zn (mg)',
       'Se (µg)', 'Cu (mg)', 'Mn (mg)', 'I (µg)', 'Vc (mg)', 'Thiamin (mg)',
       'Riboflavin (mg)', 'Niacin (mg)', 'B6 (mg)', 'Folate,DFE (µg)',
       'B12 (µg)', 'Va,RAE (µg)', 'Ve (mg)', 'Vd (IU)', 'Vk (µg)',
       'saturated (g)', 'monounsaturated (g)', 'polyunsaturated (g)',
       'trans (g)', 'Cholesterol (mg)', 'Caffeine (mg)', 'phenolics (mg)',
       'pH', 'Plain Occurences', 'Cool Occurences', 'Warm Occurences',
       'Cold Occurences', 'Heavy Cold Occurences', 'Heavy Warm Occurences',
       'Hot Occurences', 'Heavy Hot Occurences'],
      dtype='object')

In [5]:
# For every "... Occurences" column, print how many rows hold each count value,
# most frequent value first.
occurrence_columns = [
    'Plain Occurences',
    'Cool Occurences',
    'Warm Occurences',
    'Cold Occurences',
    'Heavy Cold Occurences',
    'Heavy Warm Occurences',
    'Hot Occurences',
    'Heavy Hot Occurences',
]
for col in occurrence_columns:
    counts = df[col].value_counts()
    print(counts.sort_values(ascending=False))

0    140
1     41
2     37
3     23
Name: Plain Occurences, dtype: int64
0    170
1     43
2     28
Name: Cool Occurences, dtype: int64
0    142
1     62
2     37
Name: Warm Occurences, dtype: int64
0    166
1     58
2     17
Name: Cold Occurences, dtype: int64
0    225
1     16
Name: Heavy Cold Occurences, dtype: int64
0    236
1      5
Name: Heavy Warm Occurences, dtype: int64
0    230
1     11
Name: Hot Occurences, dtype: int64
0    235
1      6
Name: Heavy Hot Occurences, dtype: int64


In [6]:
#df.isnull().sum().sort_values(ascending=False)

In [7]:
#df.describe()

In [8]:
#df.info()

## Data Processing

### Normalization

In [9]:
def occurrences(df):
    """Add a "% <label>" share column for each "<label> Occurences" count column.

    Every share is the row's count for that label divided by the row's total
    count over all eight labels, so the eight shares of a row sum to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the eight "<label> Occurences" columns named in
        ``labels`` below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the eight "% <label>" columns are added to it
        in place, in the order of ``labels``.

    Notes
    -----
    A row whose eight counts are all zero has a zero total, so its shares
    come out as NaN (0 / 0) rather than raising.
    """
    labels = (
        "Plain",
        "Cool",
        "Warm",
        "Cold",
        "Heavy Cold",
        "Heavy Warm",
        "Hot",
        "Heavy Hot",
    )

    # The denominator is the same for every share, so build it once.
    # Explicit "+" (rather than DataFrame.sum) keeps NaN counts propagating
    # into the total instead of being silently skipped.
    total = df[labels[0] + " Occurences"]
    for label in labels[1:]:
        total = total + df[label + " Occurences"]

    for label in labels:
        df["% " + label] = df[label + " Occurences"] / total

    return df

In [10]:
def linear_scale_v1(df):
    """Add a "hot_cold_scale" column as a weighted sum of the "% <label>" shares.

    The weights are evenly spaced sevenths: Heavy Cold 0/7, Cold 1/7,
    Cool 2/7, Plain 3/7, Warm 4/7, Heavy Warm 5/7, Hot 6/7, Heavy Hot 7/7.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain the eight "% <label>" columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with "hot_cold_scale" written in place.
    """
    # (share column, weight) pairs, listed in the order they are summed.
    weighted_columns = [
        ("% Plain", 3.0 / 7.0),
        ("% Cool", 2.0 / 7.0),
        ("% Warm", 4.0 / 7.0),
        ("% Cold", 1.0 / 7.0),
        ("% Heavy Cold", 0.0 / 7.0),
        ("% Heavy Warm", 5.0 / 7.0),
        ("% Hot", 6.0 / 7.0),
        ("% Heavy Hot", 7.0 / 7.0),
    ]

    first_column, first_weight = weighted_columns[0]
    score = df[first_column] * first_weight
    for column, weight in weighted_columns[1:]:
        score = score + (df[column] * weight)

    df["hot_cold_scale"] = score
    return df

In [11]:
def linear_scale(df):
    """Add a "hot_cold_scale" column as a weighted sum of the "% <label>" shares.

    The weights are expressed in sixths: Heavy Cold 0/6, Cold 1/6, Cool 2/6,
    Plain 3/6, Warm 4/6, Heavy Warm 4.5/6, Hot 5/6, Heavy Hot 6/6. Note that
    Heavy Warm sits a half step (4.5/6) between Warm and Hot.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain the eight "% <label>" columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with "hot_cold_scale" written in place.
    """
    # (share column, weight) pairs, listed in the order they are summed.
    weighted_columns = [
        ("% Plain", 3.0 / 6.0),
        ("% Cool", 2.0 / 6.0),
        ("% Warm", 4.0 / 6.0),
        ("% Cold", 1.0 / 6.0),
        ("% Heavy Cold", 0.0 / 6.0),
        ("% Heavy Warm", 4.5 / 6.0),
        ("% Hot", 5.0 / 6.0),
        ("% Heavy Hot", 6.0 / 6.0),
    ]

    first_column, first_weight = weighted_columns[0]
    score = df[first_column] * first_weight
    for column, weight in weighted_columns[1:]:
        score = score + (df[column] * weight)

    df["hot_cold_scale"] = score
    return df

In [12]:
# Add the "% <label>" share columns computed from the raw occurrence counts.
df = occurrences(df)

In [13]:
# Add "hot_cold_scale" from the share columns, using linear_scale (not linear_scale_v1).
df = linear_scale(df)

### NaN Values

Missing values are filled in with iterative imputation. Reference:
https://machinelearningmastery.com/iterative-imputation-for-missing-values-in-machine-learning/

In [14]:
# df.info()

In [15]:
def iterative_imputation(df):
    """Fill missing values in the feature columns using iterative imputation.

    The frame is split by position: column 0 is passed through unchanged,
    columns 1-38 are imputed with an ``IterativeImputer`` driven by an
    ``ExtraTreesRegressor``, and columns 39 onward are passed through
    unchanged. The three parts are then joined back in the original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns 1-38 are the numeric features to impute.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, column order and index as ``df``;
        columns 1-38 hold the imputed values.
    """
    seed = 1121218
    # Seed the tree ensemble as well as the imputer so the regressor itself
    # is deterministic and repeated runs give the same imputed values.
    ii_imp = IterativeImputer(
        estimator=ExtraTreesRegressor(random_state=seed),
        max_iter=10,
        random_state=seed,
    )

    name = df.iloc[:, 0:1]
    inputs = df.iloc[:, 1:39]
    outputs = df.iloc[:, 39:]

    imputed = ii_imp.fit_transform(inputs)

    # fit_transform returns a bare array, so restore the labels from the slice
    # itself instead of a hand-maintained list. Reusing the original index
    # keeps the axis=1 concat row-aligned even when df does not carry a
    # default RangeIndex.
    # NOTE(review): if a feature column is entirely missing the imputer may
    # drop it, making the array narrower than inputs.columns - confirm.
    new_inputs = pd.DataFrame(imputed, columns=inputs.columns, index=inputs.index)

    return pd.concat([name, new_inputs, outputs], axis=1)

In [20]:
# Impute the missing feature values. This runs after the derived columns were
# appended, so they fall in the pass-through part (columns 39 onward).
df = iterative_imputation(df)
# df.isnull().sum().sort_values(ascending=False)

## Save Data

In [18]:
# Inspect non-null counts and dtypes of the processed frame before saving it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 56 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Food Name              241 non-null    object 
 1   Water (g)              241 non-null    float64
 2   Energy (kal)           241 non-null    float64
 3   Protein (g)            241 non-null    float64
 4   lipid (g)              241 non-null    float64
 5   Carbohydrate (g)       241 non-null    float64
 6   Fiber (g)              241 non-null    float64
 7   Sugars (g)             241 non-null    float64
 8   Ash (g)                241 non-null    float64
 9   Ca (mg)                241 non-null    float64
 10  Fe (mg)                241 non-null    float64
 11  Mg (mg)                241 non-null    float64
 12  P (mg)                 241 non-null    float64
 13  K (mg)                 241 non-null    float64
 14  Na (mg)                241 non-null    float64
 15  Zn (mg

In [19]:
# Save the processed table. The file is overwritten on every run: adjust the
# processing code above, re-run, then use the file in the pipeline to see the results.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default - confirm the downstream reader expects that.
df.to_csv('./data/data_processed.csv')