In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import preprocessing



In [2]:
# Load the CSV at ./data/raw_data_cleaned.csv (path relative to the notebook) into `df`.
df = pd.read_csv('./data/raw_data_cleaned.csv')

In [3]:
# The food-name column is labelled 'Unnamed: 0' on load; give it a descriptive name.
df = df.rename(columns={'Unnamed: 0': 'Food Name'})

In [4]:
# Display the column labels after the rename.
df.columns

Index(['Food Name', 'Water (g)', 'Energy (kal)', 'Protein (g)', 'lipid (g)',
       'Carbohydrate (g)', 'Fiber (g)', 'Sugars (g)', 'Ash (g)', 'Ca (mg)',
       'Fe (mg)', 'Mg (mg)', 'P (mg)', 'K (mg)', 'Na (mg)', 'Zn (mg)',
       'Se (µg)', 'Cu (mg)', 'Mn (mg)', 'I (µg)', 'Vc (mg)', 'Thiamin (mg)',
       'Riboflavin (mg)', 'Niacin (mg)', 'B6 (mg)', 'Folate,DFE (µg)',
       'B12 (µg)', 'Va,RAE (µg)', 'Ve (mg)', 'Vd (IU)', 'Vk (µg)',
       'saturated (g)', 'monounsaturated (g)', 'polyunsaturated (g)',
       'trans (g)', 'Cholesterol (mg)', 'Caffeine (mg)', 'phenolics (mg)',
       'pH', 'Plain Occurences', 'Cool Occurences', 'Warm Occurences',
       'Cold Occurences', 'Heavy Cold Occurences', 'Heavy Warm Occurences',
       'Hot Occurences', 'Heavy Hot Occurences'],
      dtype='object')

In [5]:
# For each occurrence column, print how many rows hold each distinct count,
# most frequent count first.
for col in (
    'Plain Occurences',
    'Cool Occurences',
    'Warm Occurences',
    'Cold Occurences',
    'Heavy Cold Occurences',
    'Heavy Warm Occurences',
    'Hot Occurences',
    'Heavy Hot Occurences',
):
    counts = df[col].value_counts()
    print(counts.sort_values(ascending=False))

0    140
1     41
2     37
3     23
Name: Plain Occurences, dtype: int64
0    170
1     43
2     28
Name: Cool Occurences, dtype: int64
0    142
1     62
2     37
Name: Warm Occurences, dtype: int64
0    166
1     58
2     17
Name: Cold Occurences, dtype: int64
0    225
1     16
Name: Heavy Cold Occurences, dtype: int64
0    236
1      5
Name: Heavy Warm Occurences, dtype: int64
0    230
1     11
Name: Hot Occurences, dtype: int64
0    235
1      6
Name: Heavy Hot Occurences, dtype: int64


In [6]:
# Number of missing values per column, largest first.
df.isnull().sum().sort_values(ascending=False)

I (µg)                   167
Vk (µg)                   99
Vd (IU)                   91
Sugars (g)                73
trans (g)                 71
pH                        63
Folate,DFE (µg)           57
B12 (µg)                  56
B6 (mg)                   54
Se (µg)                   51
Cholesterol (mg)          48
Mn (mg)                   46
Ve (mg)                   45
polyunsaturated (g)       43
Cu (mg)                   41
monounsaturated (g)       39
saturated (g)             39
Vc (mg)                   31
phenolics (mg)            26
Va,RAE (µg)               23
Fiber (g)                 20
Caffeine (mg)             18
Ash (g)                   14
Niacin (mg)                9
Thiamin (mg)               7
Mg (mg)                    5
Riboflavin (mg)            5
K (mg)                     4
Ca (mg)                    4
Na (mg)                    3
P (mg)                     3
Fe (mg)                    3
Zn (mg)                    2
lipid (g)                  2
Protein (g)   

In [7]:
#df.describe()

In [8]:
#df.info()

## Data Process

### Normalization

In [9]:
# Column labels again: the cells below slice the columns from 'Water (g)' through 'pH'
# as inputs and treat everything after 'pH' as outputs.
df.columns

Index(['Food Name', 'Water (g)', 'Energy (kal)', 'Protein (g)', 'lipid (g)',
       'Carbohydrate (g)', 'Fiber (g)', 'Sugars (g)', 'Ash (g)', 'Ca (mg)',
       'Fe (mg)', 'Mg (mg)', 'P (mg)', 'K (mg)', 'Na (mg)', 'Zn (mg)',
       'Se (µg)', 'Cu (mg)', 'Mn (mg)', 'I (µg)', 'Vc (mg)', 'Thiamin (mg)',
       'Riboflavin (mg)', 'Niacin (mg)', 'B6 (mg)', 'Folate,DFE (µg)',
       'B12 (µg)', 'Va,RAE (µg)', 'Ve (mg)', 'Vd (IU)', 'Vk (µg)',
       'saturated (g)', 'monounsaturated (g)', 'polyunsaturated (g)',
       'trans (g)', 'Cholesterol (mg)', 'Caffeine (mg)', 'phenolics (mg)',
       'pH', 'Plain Occurences', 'Cool Occurences', 'Warm Occurences',
       'Cold Occurences', 'Heavy Cold Occurences', 'Heavy Warm Occurences',
       'Hot Occurences', 'Heavy Hot Occurences'],
      dtype='object')

In [10]:
def normalize(df):
    """Min-max scale the nutrient columns of ``df`` onto [0, 1].

    The frame is split by column into three parts: the first column (kept
    as-is), the columns from 'Water (g)' through 'pH' inclusive (cast to
    float and scaled), and every column after 'pH' (kept as-is). The parts
    are then concatenated back in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Water (g)' and 'pH' columns; every column between them
        (inclusive) must be convertible to float.

    Returns
    -------
    pandas.DataFrame
        A new frame with the scaled inputs. The input frame is not modified.

    Notes
    -----
    A column whose values are all equal has a zero range, so its scaled
    values are NaN (0/0).
    """
    name = df.iloc[:, 0:1]
    inputs = df.loc[:, 'Water (g)':'pH'].astype(float)
    # Slice from 'pH' onward and drop 'pH' itself: the columns after the inputs.
    outputs = df.loc[:, 'pH':].drop(columns='pH')

    # No z-score step is applied beforehand: min-max scaling is unchanged by
    # any per-column shift and positive rescale, so it would be redundant.
    col_min = inputs.min()
    col_range = inputs.max() - col_min
    normalized_inputs = (inputs - col_min) / col_range

    return pd.concat([name, normalized_inputs, outputs], axis=1)
    

In [11]:
# Z-score the nutrient columns ('Water (g)' through 'pH'). The leading ':' is
# required so that .loc slices columns; without it the labels are looked up
# in the row index instead.
nutrients = df.loc[:, "Water (g)":"pH"].astype(float)
df[nutrients.columns] = (nutrients - nutrients.mean()) / nutrients.std()

In [12]:
# Split the frame by column: the first column, the inputs ('Water (g)' through
# 'pH', cast to float), and the remaining columns after 'pH'.
name = df.iloc[:, 0:1]
inputs = df.loc[:, 'Water (g)':'pH'].astype(float)
outputs = df.loc[:, 'pH':].drop(columns='pH')

# Disabled alternative: z-score standardisation.
# normalized_inputs = (inputs-inputs.mean())/inputs.std()
# Active choice: min-max scaling of each input column onto [0, 1].
normalized_inputs=(inputs-inputs.min())/(inputs.max()-inputs.min())

In [13]:
# Reassemble the frame column-wise: first column, scaled inputs, then the untouched outputs.
df = pd.concat([name, normalized_inputs, outputs], axis=1)

### Y Feature Engineering

In [14]:
def occurrences(df):
    """Add a "% <label>" column for each "<label> Occurences" column.

    Each new column is that label's occurrence count divided by the row's
    total across all eight occurrence columns. The columns are written onto
    ``df`` itself, and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the eight "<label> Occurences" columns.

    Returns
    -------
    pandas.DataFrame
        ``df``, with the eight "% <label>" columns added.
    """
    labels = (
        "Plain", "Cool", "Warm", "Cold",
        "Heavy Cold", "Heavy Warm", "Hot", "Heavy Hot",
    )

    # Row total, accumulated left to right in the order listed above.
    total = df[labels[0] + " Occurences"]
    for label in labels[1:]:
        total = total + df[label + " Occurences"]

    for label in labels:
        df["% " + label] = df[label + " Occurences"] / total

    return df

In [15]:
def linear_scale_v1(df):
    """Add a 'hot_cold_scale' column as a weighted sum of the "% <label>" columns.

    Each label is weighted by a step out of 7: Heavy Cold 0, Cold 1, Cool 2,
    Plain 3, Warm 4, Heavy Warm 5, Hot 6, Heavy Hot 7. The column is written
    onto ``df`` itself, and the same frame is returned.
    """
    steps = (
        ("% Plain", 3.0),
        ("% Cool", 2.0),
        ("% Warm", 4.0),
        ("% Cold", 1.0),
        ("% Heavy Cold", 0.0),
        ("% Heavy Warm", 5.0),
        ("% Hot", 6.0),
        ("% Heavy Hot", 7.0),
    )

    score = None
    for column, step in steps:
        term = df[column] * (step / 7.0)
        score = term if score is None else score + term

    df["hot_cold_scale"] = score
    return df

In [16]:
def linear_scale(df):
    """Add a 'hot_cold_scale' column as a weighted sum of the "% <label>" columns.

    Weights are steps out of 6: Heavy Cold 0, Cold 1, Cool 2, Plain 3,
    Warm 4, Heavy Warm 4.5, Hot 5, Heavy Hot 6. This replaces the evenly
    spaced 0..7 weighting used by ``linear_scale_v1``. The column is written
    onto ``df`` itself, and the same frame is returned.
    """
    columns = (
        "% Plain", "% Cool", "% Warm", "% Cold",
        "% Heavy Cold", "% Heavy Warm", "% Hot", "% Heavy Hot",
    )
    steps = (3.0, 2.0, 4.0, 1.0, 0.0, 4.5, 5.0, 6.0)

    # Accumulate the weighted terms left to right, in column order.
    weighted = [df[column] * (step / 6.0) for column, step in zip(columns, steps)]
    score = weighted[0]
    for term in weighted[1:]:
        score = score + term

    df["hot_cold_scale"] = score
    return df

In [17]:
# Add the "% <label>" share columns to the working frame.
df = occurrences(df)

In [18]:
def mode_old(df):
    """Add a 'Mode' column naming the occurrence label with the highest count.

    For each row, the "<label> Occurences" column with the largest value is
    found (the first one wins on ties) and its "<label>" part is stored in
    'Mode'. The column is written onto ``df`` itself, and the same frame is
    returned.

    The occurrence columns are selected by name rather than by position, so
    the result does not depend on where they sit in the frame.
    """
    suffix = " Occurences"
    occurrence_cols = [col for col in df.columns if str(col).endswith(suffix)]

    # str.rstrip("Occurences") would strip a *set of characters* and leave the
    # separating space behind ("Plain "); remove the literal suffix instead.
    df["Mode"] = (
        df[occurrence_cols]
        .idxmax(axis=1)
        .str.replace(suffix, "", regex=False)
    )
    return df

def mode(df):
    """Add 'Plain', 'Cold', 'Hot' totals and a 'Mode' column naming the largest.

    The eight occurrence columns are cast to int and summed into three groups:
    'Plain' (Plain), 'Cold' (Cool + Cold + Heavy Cold) and 'Hot' (Warm +
    Heavy Warm + Hot + Heavy Hot). 'Mode' holds the name of the group with
    the highest total per row; on ties the first of Plain, Cold, Hot wins.
    All four columns are written onto ``df`` itself, which is returned.
    """
    groups = {
        'Plain': ('Plain Occurences',),
        'Cold': ('Cool Occurences', 'Cold Occurences', 'Heavy Cold Occurences'),
        'Hot': ('Warm Occurences', 'Heavy Warm Occurences',
                'Hot Occurences', 'Heavy Hot Occurences'),
    }

    for group, members in groups.items():
        tally = df[members[0]].astype(int)
        for member in members[1:]:
            tally = tally + df[member].astype(int)
        df[group] = tally

    df["Mode"] = df[list(groups)].idxmax(axis=1)
    return df

In [19]:
# Add the 'hot_cold_scale' column (requires the "% <label>" columns from occurrences()).
df = linear_scale(df)

In [20]:
# Add the grouped 'Plain'/'Cold'/'Hot' totals and the 'Mode' column.
df = mode(df)

In [21]:
# Preview the first 20 rows with the engineered columns appended.
df.head(20)

Unnamed: 0,Food Name,Water (g),Energy (kal),Protein (g),lipid (g),Carbohydrate (g),Fiber (g),Sugars (g),Ash (g),Ca (mg),...,% Cold,% Heavy Cold,% Heavy Warm,% Hot,% Heavy Hot,hot_cold_scale,Plain,Cold,Hot,Mode
0,Alfalfa,0.903084,0.036199,0.092251,0.007,0.029565,0.035714,0.002002,0.012024,0.093645,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
1,dolichos sinensis,0.902082,0.020362,0.04059,0.003,0.074421,0.080827,,0.007014,0.051839,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
2,hazelnuts,0.053164,0.710407,0.27583,0.6075,0.170252,0.182331,0.043443,0.041082,0.095318,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
3,"Beans, kidney",0.117641,0.376697,0.435055,0.0083,0.611785,0.468045,0.022322,0.009018,0.119565,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
4,"Peanuts,",0.065078,0.641403,0.476015,0.4924,0.164441,0.159774,0.047247,0.032064,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
5,"Peas, green",0.789547,0.091629,0.1,0.004,0.147314,0.107143,0.056757,0.011022,0.020903,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
6,"Rice, white",0.129055,0.40724,0.121956,0.0058,0.808849,0.007519,,0.004008,0.007525,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
7,vigna umbellata(Chi xiao dou),0.126151,0.004412,0.372694,0.006,0.646345,0.144737,,0.032064,0.061873,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
8,"ginkgo nuts, dried",0.124149,0.393665,0.190959,0.02,0.738607,0.174812,,0.03006,0.016722,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
9,Butter fish,0.728875,0.158371,0.341328,0.073,0.0,0.0,0.0,0.014028,0.038462,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain


### NaN Values

Missing values are filled by iterative imputation (scikit-learn's `IterativeImputer`). Background reading:
https://machinelearningmastery.com/iterative-imputation-for-missing-values-in-machine-learning/

In [22]:
def nanValuesRid(df, max_null_fraction=0.3):
    """Drop columns whose share of missing values is too high.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    max_null_fraction : float, default 0.3
        A column is kept only if its fraction of null entries is strictly
        below this value.

    Returns
    -------
    pandas.DataFrame
        A frame holding only the retained columns, in their original order.
        The input frame is not modified.
    """
    keep = df.isnull().mean() < max_null_fraction
    return df.loc[:, keep]

In [23]:
# Per-column non-null counts and dtypes before imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 60 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Food Name              241 non-null    object 
 1   Water (g)              241 non-null    float64
 2   Energy (kal)           241 non-null    float64
 3   Protein (g)            239 non-null    float64
 4   lipid (g)              239 non-null    float64
 5   Carbohydrate (g)       240 non-null    float64
 6   Fiber (g)              221 non-null    float64
 7   Sugars (g)             168 non-null    float64
 8   Ash (g)                227 non-null    float64
 9   Ca (mg)                237 non-null    float64
 10  Fe (mg)                238 non-null    float64
 11  Mg (mg)                236 non-null    float64
 12  P (mg)                 238 non-null    float64
 13  K (mg)                 237 non-null    float64
 14  Na (mg)                238 non-null    float64
 15  Zn (mg

In [32]:
def iterative_imputation(df):
    """Impute missing nutrient values with an ExtraTrees-based IterativeImputer.

    The frame is split by column into the first column, the inputs
    ('Water (g)' through 'pH') and the columns after 'pH'. Input columns
    with 30% or more missing values are dropped; the remaining inputs are
    imputed and the three parts are concatenated back in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Water (g)' and 'pH' columns with numeric columns
        between them.

    Returns
    -------
    pandas.DataFrame
        A new frame whose retained input columns contain no missing values.
    """
    max_null_fraction = 0.3

    name = df.iloc[:, 0:1]
    inputs = df.loc[:, 'Water (g)':'pH']
    outputs = df.loc[:, 'pH':].drop(columns='pH')

    # Filter the sparse columns out *before* imputing, so the array returned
    # by the imputer has exactly one column per retained label.
    inputs = inputs.loc[:, inputs.isnull().mean() < max_null_fraction]

    ii_imp = IterativeImputer(estimator=ExtraTreesRegressor(), max_iter=10, random_state=1121218)
    imputed = ii_imp.fit_transform(inputs)

    # Keep the original row index so the concat below aligns row for row.
    new_inputs = pd.DataFrame(imputed, columns=inputs.columns, index=inputs.index)

    return pd.concat([name, new_inputs, outputs], axis=1)

In [26]:
def label_encoder(df):
    """Replace 'hot_cold_scale' with integer codes from scikit-learn's LabelEncoder.

    The encoder is fitted on the column's own values and the encoded result
    overwrites the column on ``df`` itself; the same frame is returned.
    """
    # Named `encoder` so the local does not shadow this function's own name.
    encoder = preprocessing.LabelEncoder()

    df['hot_cold_scale'] = encoder.fit_transform(df['hot_cold_scale'])

    return df
    

---

In [27]:
# Reload the raw CSV so the full pipeline below starts from unprocessed data.
# NOTE(review): the 'Unnamed: 0' -> 'Food Name' rename from the earlier cell is not
# re-applied here, so the processed frame keeps 'Unnamed: 0' as its first column.
df = pd.read_csv('./data/raw_data_cleaned.csv')

In [28]:
# Preview the first rows of the freshly loaded, unprocessed frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Water (g),Energy (kal),Protein (g),lipid (g),Carbohydrate (g),Fiber (g),Sugars (g),Ash (g),Ca (mg),...,phenolics (mg),pH,Plain Occurences,Cool Occurences,Warm Occurences,Cold Occurences,Heavy Cold Occurences,Heavy Warm Occurences,Hot Occurences,Heavy Hot Occurences
0,Alfalfa,90.2,32.0,5.0,0.7,2.9,1.9,0.2,1.2,112.0,...,378.0,,3,0,0,0,0,0,0,0
1,dolichos sinensis,90.1,18.0,2.2,0.3,7.3,4.3,,0.7,62.0,...,39.2,,3,0,0,0,0,0,0,0
2,hazelnuts,5.31,628.0,14.95,60.75,16.7,9.7,4.34,4.1,114.0,...,274.0,,3,0,0,0,0,0,0,0
3,"Beans, kidney",11.75,333.0,23.58,0.83,60.01,24.9,2.23,0.9,143.0,...,27.4,5.7,3,0,0,0,0,0,0,0
4,"Peanuts,",6.5,567.0,25.8,49.24,16.13,8.5,4.72,3.2,92.0,...,356.7,7.5,3,0,0,0,0,0,0,0


### Run All

In [29]:
def data_process(df):
    """Run every processing stage on ``df`` in order and return the result.

    Stages, in order: normalize, occurrences, linear_scale, mode,
    iterative_imputation, label_encoder. Each stage receives the frame
    returned by the previous one.
    """
    stages = (
        normalize,
        occurrences,
        linear_scale,
        mode,
        iterative_imputation,
        label_encoder,
    )

    for stage in stages:
        df = stage(df)

    return df


In [33]:
# Run the whole pipeline on the freshly loaded frame.
df = data_process(df)



## Save Data

In [34]:
# Per-column non-null counts and dtypes of the processed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 56 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             241 non-null    object 
 1   Water (g)              241 non-null    float64
 2   Energy (kal)           241 non-null    float64
 3   Protein (g)            241 non-null    float64
 4   lipid (g)              241 non-null    float64
 5   Carbohydrate (g)       241 non-null    float64
 6   Fiber (g)              241 non-null    float64
 7   Ash (g)                241 non-null    float64
 8   Ca (mg)                241 non-null    float64
 9   Fe (mg)                241 non-null    float64
 10  Mg (mg)                241 non-null    float64
 11  P (mg)                 241 non-null    float64
 12  K (mg)                 241 non-null    float64
 13  Na (mg)                241 non-null    float64
 14  Zn (mg)                241 non-null    float64
 15  Se (µg

In [35]:
# Workflow: adjust the code above, re-run, and overwrite these files; the
# downstream pipeline then reads them to show the results.
# NOTE(review): both calls write the same `df`, so the two CSVs are identical;
# confirm whether 'data_processed.csv' was meant to hold a different variant.

df.to_csv('./data/data_processed_normalized.csv') 
df.to_csv('./data/data_processed.csv')