In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn import svm
import sklearn.model_selection as model_selection
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.utils import shuffle

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import preprocessing


In [None]:
# Load the working dataset from ./data/data_processed_normalized.csv, using the
# first CSV column as the index. The commented-out lines are alternative input
# files that can be swapped in instead.
# df = pd.read_csv('./data/data_processed.csv', index_col=0)
df = pd.read_csv('./data/data_processed_normalized.csv', index_col=0)
# df = pd.read_csv('./data/raw_data_cleaned.csv')

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the loaded frame.
df.head()

### Logistic Regression

In [None]:
def logistic_regression(df, y_col_name):
    """
    Fit a logistic regression classifier on a train/test split and print results.

    The frame is split 80/20, each partition is passed through ``data_process``
    on its own, the features are standardized (scaler fitted on the training
    partition only), and the confusion matrix and accuracy on the test
    partition are printed.

    df : (dataframe) input data; column 0 is skipped when building features
    y_col_name : (str) name of the classification column in ``df``

    Returns None; results are printed.
    """
    # Every column after the first, minus the target itself: the target is
    # re-attached to each partition below, and leaving it in the feature frame
    # would produce a duplicated column after the concat.
    x = df.iloc[:, 1:].drop(columns=y_col_name, errors="ignore")
    y = df[y_col_name]

    # split data into training and test set
    # 80 training, 20 testing
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

    # Features and target are joined column-wise (axis=1) so that each row keeps
    # its own label while the partition goes through data_process.
    # NOTE(review): data_process is not defined in the visible part of this
    # notebook; it has to be in scope before this function is called.
    # NOTE(review): the 1:39 slice assumes the layout of data_process's output
    # (first column skipped, next 38 columns are features) - confirm.
    train = data_process(pd.concat([xtrain, ytrain], axis=1))
    xtrain = train.iloc[:, 1:39]
    ytrain = train[y_col_name]

    test = data_process(pd.concat([xtest, ytest], axis=1))
    xtest = test.iloc[:, 1:39]
    ytest = test[y_col_name]

    # standardize and scale data; the scaler is fitted on the training features
    # only and then reused for the test features
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)

    # modeling + analysis
    model = LogisticRegression(random_state=0)
    # np.ravel gives the flat 1-D target without relying on Series.ravel()
    model.fit(xtrain, np.ravel(ytrain))
    y_pred = model.predict(xtest)

    # confusion matrix of test size
    conf_m = confusion_matrix(ytest, y_pred)
    print("Confusion Matrix : ", conf_m)

    # accuracy score of test size
    print ("Accuracy : ", accuracy_score(ytest, y_pred))

In [None]:
# Run the logistic regression experiment with "hot_cold_scale" as the target column.
logistic_regression(df, "hot_cold_scale")

### KNN

In [None]:
def knn(df, y_col_name):
    """
    Fit a 5-nearest-neighbours classifier and print its test accuracy.

    df : (dataframe) input data; columns at positions 1..38 are the features
    y_col_name : (str) name of the classification column in ``df``

    The data is split 80/20 with a fixed random state. Returns None.
    """
    features = df.iloc[:, 1:39]
    target = df.loc[:, y_col_name]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=0
    )

    # Named `classifier` so the local does not shadow this function's own name.
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    print ("Accuracy : ", accuracy_score(y_test, predicted))

In [None]:
# Run the k-nearest-neighbours experiment with "hot_cold_scale" as the target column.
knn(df, "hot_cold_scale")

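Additional preprocessing for the models below: encode the `Mode` column as an integer label (`mode_code`) and drop the `Food Name` column.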

In [None]:
def label_mode(row):
    """
    Translate a row's 'Mode' value into its integer class code.

    'Plain' -> 0, 'Cold' -> 1, 'Hot' -> 2. Any other value returns None.
    """
    mode = row['Mode']
    # The position in this tuple is the class code.
    for code, name in enumerate(('Plain', 'Cold', 'Hot')):
        if mode == name:
            return code
    return None
    
# Encode each row's textual Mode as an integer class label via label_mode.
df['mode_code'] = df.apply(label_mode, axis=1)
# Drop the 'Food Name' column. errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
# NOTE(review): cells above select features by position (iloc[:, 1:39]);
# dropping a column here shifts those positions if they are re-run afterwards.
df = df.drop(columns='Food Name', errors='ignore')

### Random Forest

In [None]:
def random_forest(df):
    """
    Train a random forest on the temperature mode and print a confusion table.

    Features are every column of ``df`` except the occurrence-count columns and
    the two label columns; the target is the integer ``mode_code`` column. The
    data is split 75/25, standardized (scaler fitted on the training part
    only), and a 10-tree entropy-criterion forest is fitted. The available
    labels and an actual-vs-predicted crosstab are printed.

    df : (dataframe) must contain the columns dropped below plus ``mode_code``
         and ``Mode``. The frame is not modified.

    Returns None; results are printed.
    """
    excluded = ["Plain Occurences", "Cool Occurences", "Warm Occurences", "Cold Occurences", "Heavy Cold Occurences", "Heavy Warm Occurences","Hot Occurences", "Heavy Hot Occurences", "mode_code", "Mode"]
    X = df.drop(excluded, axis = 1)
    Y = df["mode_code"].astype('int')

    # Build the code -> label lookup from the (mode_code, Mode) pairs actually
    # present in the data. pd.factorize numbers labels by order of first
    # appearance, which need not match the codes stored in mode_code, and
    # writing its codes back into df.Mode would alter the caller's frame.
    pairs = df[["mode_code", "Mode"]].drop_duplicates()
    code_to_label = dict(zip(pairs["mode_code"].astype('int'), pairs["Mode"]))
    definitions = pd.Index([code_to_label[code] for code in sorted(code_to_label)])
    print(definitions)

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 21)

    # Fit the scaler on the training features only, then reuse it for the test set.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Translate integer codes back to their text labels for a readable table.
    to_label = np.vectorize(code_to_label.get)
    y_test = to_label(y_test)
    y_pred = to_label(y_pred)
    print(pd.crosstab(y_test, y_pred, rownames=['Actual Temp'], colnames=['Predicted Temp']))

In [None]:
# Run the random forest experiment; requires the mode_code column added above.
random_forest(df)

### SVM

In [None]:
def svm(df):
    """
    Compare an RBF-kernel and a degree-3 polynomial-kernel SVC on ``mode_code``.

    Features are every column of ``df`` except the occurrence-count columns and
    the two label columns. The data is split 80/20 with a fixed random state,
    both classifiers are fitted on the training part, and accuracy and weighted
    F1 (as percentages) are printed for the polynomial kernel, then the RBF one.

    Note: once this cell runs, the name ``svm`` refers to this function rather
    than the ``sklearn.svm`` module imported at the top of the notebook.

    Returns None; results are printed.
    """
    excluded = ["Plain Occurences", "Cool Occurences", "Warm Occurences", "Cold Occurences", "Heavy Cold Occurences", "Heavy Warm Occurences","Hot Occurences", "Heavy Hot Occurences", "mode_code", "Mode"]
    features = df.drop(excluded, axis = 1)
    target = df["mode_code"]

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features, target, train_size=0.80, test_size=0.20, random_state=101
    )

    rbf_model = SVC(kernel='rbf', gamma=0.5, C=0.1)
    rbf_model.fit(X_train, y_train)
    poly_model = SVC(kernel='poly', degree=3, C=1)
    poly_model.fit(X_train, y_train)

    poly_pred = poly_model.predict(X_test)
    rbf_pred = rbf_model.predict(X_test)

    # (predictions, accuracy caption, F1 caption), in the order they are printed
    reports = (
        (poly_pred, 'Accuracy (Polynomial Kernel): ', 'F1 (Polynomial Kernel): '),
        (rbf_pred, 'Accuracy (RBF Kernel): ', 'F1 (RBF Kernel): '),
    )
    for predictions, accuracy_caption, f1_caption in reports:
        accuracy = accuracy_score(y_test, predictions)
        weighted_f1 = f1_score(y_test, predictions, average='weighted')
        print(accuracy_caption, "%.2f" % (accuracy*100))
        print(f1_caption, "%.2f" % (weighted_f1*100))

In [None]:
# Run the SVM kernel comparison; requires the mode_code column added above.
svm(df)