In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import time

import csv
import os.path

import json

In [2]:
# Prepare data set: keep the raw table in `dummy` and derive `df` from it
# without the 'Item' and 'Serving Size' columns.
dummy = pd.read_csv('../../data/menu.csv')
df = dummy.drop(columns=['Item', 'Serving Size'])

In [3]:
# Turn the 'Category' labels into integer codes and store them in a new
# 'y' column on a copy of df (df itself is left unchanged).
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
y = class_le.fit_transform(df['Category'].values)
df2 = df.assign(y=y)

In [4]:
# Exponent bounds (base 10) for a 10-point logarithmic grid.
c_start, c_end = -1, 5
np.logspace(c_start, c_end, num=10)

array([1.00000000e-01, 4.64158883e-01, 2.15443469e+00, 1.00000000e+01,
       4.64158883e+01, 2.15443469e+02, 1.00000000e+03, 4.64158883e+03,
       2.15443469e+04, 1.00000000e+05])

In [9]:
# Build model
def build_model(itera, size, norm, random, param_c):
    """
    Fit logistic-regression models over a grid of C values and summarise them.

    The module-level ``df`` is split into train/test sets ``itera`` times.
    On every split a ``LogisticRegression`` is fitted for each candidate C
    up to and including ``param_c``; accuracies and wall-clock timings are
    then averaged over the splits.

    Parameters
    ----------
    itera : int
        Number of train/test splits to average over.
    size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    norm : str
        Forwarded to ``LogisticRegression`` as ``penalty`` (e.g. ``"l1"``).
        NOTE(review): which penalties are accepted depends on the default
        solver of the installed scikit-learn -- confirm for your version.
    random : str
        ``'Yes'`` uses the iteration index as ``random_state`` so each split
        differs; any other value reuses ``random_state=1`` for every split.
    param_c : float
        Largest C to try. The candidate list is cut right after the first
        entry equal to ``param_c``; if no entry matches, the whole list is
        used.

    Returns
    -------
    dict
        ``description``: list of summary strings (penalty, best mean test
        accuracy, the C that achieved it, top predictor, max mean fit and
        predict times);
        ``scoret_x`` / ``scoret_y``: grid positions and mean train accuracy;
        ``score_x`` / ``score_y``: grid positions and mean test accuracy;
        ``coefs_x`` / ``coefs_y``: feature names and absolute mean
        coefficients of the first class, sorted ascending.

    """

    X = df.drop('Category', axis=1)
    y = df['Category']

    listC = [1e-8, 1e-4, 1e-3, 1e-2, 0.1,
             0.2, 0.4, 0.75, 1, 1.5, 3, 5, 10, 15, 20, 100, 300, 1000, 5000]

    # Keep candidates up to and including param_c (whole list when absent).
    C = []
    for i in listC:
        C.append(i)
        if i == param_c:
            break

    # Coefficients are reported at C = 0.1. When the truncated grid stops
    # before 0.1, fall back to the largest C tried so the coefficient
    # summary is never built from an empty list.
    coef_c = 0.1 if 0.1 in C else C[-1]

    score_train = []
    score_test = []
    weighted_coefs = []
    time_train = []
    time_predict = []

    for seed in range(itera):
        # 'Yes' -> a different split per iteration; otherwise the same split.
        split_seed = seed if random == 'Yes' else 1
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=size, random_state=split_seed)

        acc_train = []
        acc_test = []
        tm_train = []
        tm_predict = []

        for alpha_run in C:
            lr = LogisticRegression(C=alpha_run, penalty=norm)

            # perf_counter is the clock intended for measuring short intervals.
            time_train_start = time.perf_counter()
            lr.fit(X_train, y_train)
            time_train_end = time.perf_counter()

            if alpha_run == coef_c:
                weighted_coefs.append(lr.coef_)

            # The prediction itself is discarded; the call is only timed.
            time_predict_start = time.perf_counter()
            lr.predict(X_test)
            time_predict_end = time.perf_counter()

            acc_train.append(lr.score(X_train, y_train))
            acc_test.append(lr.score(X_test, y_test))
            tm_train.append(time_train_end - time_train_start)
            tm_predict.append(time_predict_end - time_predict_start)

        score_train.append(acc_train)
        score_test.append(acc_test)
        time_train.append(tm_train)
        time_predict.append(tm_predict)

    # Average over the splits (axis 0 indexes the split).
    mean_coefs = np.mean(weighted_coefs, axis=0)
    scoret = np.mean(score_train, axis=0)
    score = np.mean(score_test, axis=0)
    timet = np.mean(time_train, axis=0)
    timep = np.mean(time_predict, axis=0)

    # coef_ is 2-D (one row per class), so argmax returns an index into the
    # flattened matrix; convert it back to (row, column) and use the column
    # to look up the feature name.
    abs_coefs = np.abs(mean_coefs)
    _, top_col = np.unravel_index(np.argmax(abs_coefs), abs_coefs.shape)
    top_predictor = X.columns[top_col]

    # The coefficient chart uses the first row of the coefficient matrix only.
    abs_mean_coefs = abs_coefs[0, :]

    return {
        "description": [
            'Logistic({0})'.format(norm),
            'Accuracy: {0:.2%}'.format(np.amax(score)),
            'C: {0}'.format(C[np.argmax(score)]),
            'Top predictor:{0}'.format(top_predictor),
            'Training time: {0:.4f} sec'.format(np.amax(timet)),
            'Testing time: {0:.4f} sec'.format(np.amax(timep))],
        "scoret_x": list(range(len(scoret))),
        "scoret_y": list(scoret),
        "score_x": list(range(len(score))),
        "score_y": list(score),
        "coefs_x": list(X.columns[np.argsort(abs_mean_coefs)]),
        "coefs_y": sorted(abs_mean_coefs),
    }

In [7]:
param_c = 20

listC = [1e-8, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.4, 0.75,
         1, 1.5, 3, 5, 10, 15, 20, 100, 300, 1000, 5000]

# Keep every candidate up to and including param_c; when param_c is not a
# candidate, keep a copy of the whole list.
C = listC[:listC.index(param_c) + 1] if param_c in listC else list(listC)

C

[1e-08, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.4, 0.75, 1, 1.5, 3, 5, 10, 15, 20]

In [18]:
# One split, half the rows held out, L1 penalty, C grid up to 100.
build_model(
    itera=1,
    size=0.5,
    norm="l1",
    random="Yes",
    param_c=100,
)

{'description': ['Logistic(l1)',
  'Accuracy: 86.15%',
  'C: 0.4',
  'Top predictor:Iron (% Daily Value)',
  'Training time: 1.2468 sec',
  'Testing time: 0.0005 sec'],
 'scoret_x': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 'scoret_y': [0.046153846153846156,
  0.43846153846153846,
  0.4846153846153846,
  0.49230769230769234,
  0.8153846153846154,
  0.8769230769230769,
  0.9307692307692308,
  0.9384615384615385,
  0.9615384615384616,
  0.9615384615384616,
  0.9692307692307692,
  0.9769230769230769,
  0.9769230769230769,
  0.9846153846153847,
  0.9846153846153847,
  0.9846153846153847],
 'score_x': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 'score_y': [0.06923076923076923,
  0.4461538461538462,
  0.49230769230769234,
  0.5384615384615384,
  0.8,
  0.8538461538461538,
  0.8615384615384616,
  0.8461538461538461,
  0.8384615384615385,
  0.8307692307692308,
  0.823076923076923,
  0.8,
  0.7923076923076923,
  0.7692307692307693,
  0.7538461538461538,
  0.730769230

In [25]:
# Distinct category labels, in order of first appearance.
df.loc[:, 'Category'].unique()

array(['Breakfast', 'Beef & Pork', 'Chicken & Fish', 'Salads',
       'Snacks & Sides', 'Desserts', 'Beverages', 'Coffee & Tea',
       'Smoothies & Shakes'], dtype=object)

In [23]:
# Non-null 'Calories' entries per category; groups are sorted by label.
df.groupby('Category')['Calories'].count().to_numpy()

array([15, 27, 42, 27, 95,  7,  6, 28, 13])