In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn import svm
import sklearn.model_selection as model_selection
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.utils import shuffle

In [2]:
# df = pd.read_csv('./data/data_processed.csv', index_col=0)
df = pd.read_csv('./data/data_processed_normalized.csv', index_col=0)

In [3]:
df.columns

Index(['Food Name', 'Water (g)', 'Energy (kal)', 'Protein (g)', 'lipid (g)',
       'Carbohydrate (g)', 'Fiber (g)', 'Ash (g)', 'Ca (mg)', 'Fe (mg)',
       'Mg (mg)', 'P (mg)', 'K (mg)', 'Na (mg)', 'Zn (mg)', 'Se (µg)',
       'Cu (mg)', 'Mn (mg)', 'Vc (mg)', 'Thiamin (mg)', 'Riboflavin (mg)',
       'Niacin (mg)', 'B6 (mg)', 'Folate,DFE (µg)', 'B12 (µg)', 'Va,RAE (µg)',
       'Ve (mg)', 'saturated (g)', 'monounsaturated (g)',
       'polyunsaturated (g)', 'trans (g)', 'Cholesterol (mg)', 'Caffeine (mg)',
       'phenolics (mg)', 'pH', 'Plain Occurences', 'Cool Occurences',
       'Warm Occurences', 'Cold Occurences', 'Heavy Cold Occurences',
       'Heavy Warm Occurences', 'Hot Occurences', 'Heavy Hot Occurences',
       '% Plain', '% Cool', '% Warm', '% Cold', '% Heavy Cold', '% Heavy Warm',
       '% Hot', '% Heavy Hot', 'hot_cold_scale', 'Plain', 'Cold', 'Hot',
       'Mode'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,Food Name,Water (g),Energy (kal),Protein (g),lipid (g),Carbohydrate (g),Fiber (g),Ash (g),Ca (mg),Fe (mg),...,% Cold,% Heavy Cold,% Heavy Warm,% Hot,% Heavy Hot,hot_cold_scale,Plain,Cold,Hot,Mode
0,Alfalfa,0.903084,0.036199,0.092251,0.007,0.029565,0.035714,0.012024,0.093645,0.028747,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
1,dolichos sinensis,0.902082,0.020362,0.04059,0.003,0.074421,0.080827,0.007014,0.051839,0.008214,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
2,hazelnuts,0.053164,0.710407,0.27583,0.6075,0.170252,0.182331,0.041082,0.095318,0.048255,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
3,"Beans, kidney",0.117641,0.376697,0.435055,0.0083,0.611785,0.468045,0.009018,0.119565,0.084189,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain
4,"Peanuts,",0.065078,0.641403,0.476015,0.4924,0.164441,0.159774,0.032064,0.076923,0.047023,...,0.0,0.0,0.0,0.0,0.0,0.5,3,0,0,Plain


In [5]:
# TO DO label encoder, temp solution
df['hot_cold_scale'] = df['hot_cold_scale'] * 10
df['hot_cold_scale'] = df['hot_cold_scale'].astype(int)

### Logistic Regression

In [6]:
def logistic_regression(df, y_col_name):
    """
    Apply sample logistic regression model to dataframe

    df : (dataframe)
    x : (compounds column as int)
    y : (classification column as int)
    """
    # values of each column
    x = df.iloc[:, 1:39] # df.iloc[2:, 1:39]??
    y = df[y_col_name][:]

    # split data into training and test set
    # 75 training, 25 testing
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 0)

    # standardize and scale data
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)

    # modeling + analysis
    model = LogisticRegression(random_state=0)
    model.fit(xtrain, ytrain.ravel())
    y_pred = model.predict(xtest)

    # confusion matrix of test size
    conf_m = confusion_matrix(ytest, y_pred)
    print("Confusion Matrix : ", conf_m)

    # accuracy score of test size
    print ("Accuracy : ", accuracy_score(ytest, y_pred))

In [7]:
logistic_regression(df, "hot_cold_scale")

Confusion Matrix :  [[ 2  1  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 0  0  6  3  0  0  0  0  0]
 [ 0  0  1 12  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0 14  0  0  0]
 [ 0  0  0  0  0  0  5  0  0]
 [ 0  0  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  0  0  1]]
Accuracy :  0.8163265306122449


### KNN

In [8]:
def knn(df, y_col_name):
    x = df.iloc[:, 1:39]
    y = df[y_col_name][:]
    
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    knn = KNeighborsClassifier(n_neighbors = 5)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    
    print ("Accuracy : ", accuracy_score(y_test, y_pred))

In [9]:
knn(df, "hot_cold_scale")

Accuracy :  0.8979591836734694




more processing for following

In [10]:
def label_mode(row):
    if row['Mode'] == 'Plain':
        return 0
    elif row['Mode'] == 'Cold':
        return 1
    elif row['Mode'] == 'Hot':
        return 2
    
df['mode_code'] = df.apply (lambda row: label_mode(row), axis=1)
df = df.drop(columns='Food Name')

### Random Forest

In [11]:
def random_forest(df):
    X = df.drop(["Plain Occurences", "Cool Occurences", "Warm Occurences", "Cold Occurences", "Heavy Cold Occurences", "Heavy Warm Occurences","Hot Occurences", "Heavy Hot Occurences", "mode_code", "Mode"], axis = 1)
    Y = df["mode_code"].astype('int')
    factor = pd.factorize(df['Mode'])
    df.Mode = factor[0]
    definitions = factor[1]
    print(definitions)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 21)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    reversefactor = dict(zip(range(3),definitions))
    y_test = np.vectorize(reversefactor.get)(y_test)
    y_pred = np.vectorize(reversefactor.get)(y_pred)
    print(pd.crosstab(y_test, y_pred, rownames=['Actual Temp'], colnames=['Predicted Temp']))

In [12]:
random_forest(df)

Index(['Plain', 'Cold', 'Hot'], dtype='object')
Predicted Temp  Cold  Hot  Plain
Actual Temp                     
Cold              25    0      0
Hot                0   15      0
Plain              0    0     21


### SVM

In [13]:
def svm(df):
    X = df.drop(["Plain Occurences", "Cool Occurences", "Warm Occurences", "Cold Occurences", "Heavy Cold Occurences", "Heavy Warm Occurences","Hot Occurences", "Heavy Hot Occurences", "mode_code", "Mode"], axis = 1)
    y = df["mode_code"]
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=101)
    rbf = SVC(kernel='rbf', gamma=0.5, C=0.1).fit(X_train, y_train)
    poly = SVC(kernel='poly', degree=3, C=1).fit(X_train, y_train)
    poly_pred = poly.predict(X_test)
    rbf_pred = rbf.predict(X_test)
    poly_accuracy = accuracy_score(y_test, poly_pred)
    poly_f1 = f1_score(y_test, poly_pred, average='weighted')
    print('Accuracy (Polynomial Kernel): ', "%.2f" % (poly_accuracy*100))
    print('F1 (Polynomial Kernel): ', "%.2f" % (poly_f1*100))
    rbf_accuracy = accuracy_score(y_test, rbf_pred)
    rbf_f1 = f1_score(y_test, rbf_pred, average='weighted')
    print('Accuracy (RBF Kernel): ', "%.2f" % (rbf_accuracy*100))
    print('F1 (RBF Kernel): ', "%.2f" % (rbf_f1*100))

In [14]:
svm(df)

Accuracy (Polynomial Kernel):  95.92
F1 (Polynomial Kernel):  95.84
Accuracy (RBF Kernel):  91.84
F1 (RBF Kernel):  91.55
