In [9]:
import joblib
import pandas as pd
import re
from pprint import pprint

In [10]:
import sys
# Stop right away on Python 2; the rest of the notebook targets Python 3.
if sys.version_info < (3,):
    raise Exception("Must be using Python 3")

In [11]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression statement of a cell, not only
# the last one (IPython's default is to show just the final expression).
InteractiveShell.ast_node_interactivity = "all"  # applies to all cells executed afterwards

# Ingredient Type/Category prediction dataframes

In [12]:
def get_condensed_ing_type_df(model_name_, test_or_cvtrain_df, best_params_, train_duration_, shape_dict_):
    """Condense one ingredient-type model's results into a single summary row.

    The returned frame has one row labelled ``model_name_[6:]`` (the model names
    used in this notebook start with ``'model_'``) and, side by side, the
    micro/macro averaged precision, recall and F1, the best hyper-parameters,
    the training duration and the two shape entries.

    Parameters
    ----------
    model_name_ : str
        Model identifier; its first six characters are stripped for the row label.
    test_or_cvtrain_df : pandas.DataFrame
        Classification report containing the rows ``'micro avg'`` and
        ``'macro avg'`` and a ``'support'`` column (which is dropped).
    best_params_ : pandas.DataFrame
        Row labelled ``0`` with the best parameters; the columns
        ``'random_state'``, ``'penalty'`` and ``'loss'`` are dropped.
    train_duration_ : pandas.DataFrame
        Frame whose column ``0`` holds the duration; its first row is used.
    shape_dict_ : pandas.DataFrame
        Frame whose column ``0`` holds, per row, a pair that is expanded into
        the columns ``'X_cvtrain Shape'`` and ``'X_test Shape'``.

    Returns
    -------
    pandas.DataFrame
        The condensed summary row.

    Raises
    ------
    IndexError
        If ``train_duration_`` or ``shape_dict_`` has no rows.
    KeyError
        If an expected row or column label is missing.
    """

    def _first_label_as_zero(frame):
        # Copy ``frame`` with its first index label replaced by 0, so that the
        # row can be renamed through the '0' key below. Working on a copy keeps
        # the caller's frame (and its index) unmodified.
        labels = list(frame.index)
        labels[0] = 0
        relabelled = frame.copy()
        relabelled.index = labels
        return relabelled

    shape_dict = _first_label_as_zero(shape_dict_)
    train_duration = _first_label_as_zero(train_duration_)
    # Use the parameter, not a module-level ``model_name``, so the function
    # also works outside the loop that happens to define that global.
    shortened_model_name = model_name_[6:]

    # Expand each pair stored in column 0 into two separate columns.
    shape_dict = shape_dict[0].apply(pd.Series)
    shape_dict = shape_dict.rename(index=str, columns={0: 'X_cvtrain Shape', 1: 'X_test Shape'}).rename(
        {'0': shortened_model_name})

    train_duration = train_duration.rename(index=str, columns={0: '100 train iter. in minutes'}).rename(
        {'0': shortened_model_name})
    best_params = best_params_.drop(
        columns=['random_state', 'penalty', 'loss']).rename({0: shortened_model_name})

    averages = test_or_cvtrain_df.loc[['micro avg', 'macro avg']].drop(columns='support')
    micro_avg_df = averages.iloc[0:1].rename(
        index=str,
        columns={'precision': 'micro-avg-p',
                 'recall': 'micro-avg-r',
                 'f1-score': 'micro-avg-f1'}).rename({'micro avg': shortened_model_name})
    macro_avg_df = averages.iloc[1:2].rename(
        index=str,
        columns={'precision': 'macro-avg-p',
                 'recall': 'macro-avg-r',
                 'f1-score': 'macro-avg-f1'}).rename({'macro avg': shortened_model_name})

    # All pieces now share the same single row label, so they line up column-wise.
    return pd.concat([micro_avg_df, macro_avg_df, best_params, train_duration, shape_dict],
                     axis=1, sort=True)


def get_ingtype_classification_reports(model_name):
    """Load the cv-train and test classification reports of one ingredient-type model.

    Both pickles are looked up below the module-level ``base_path``. The
    ``'weighted avg'`` row is removed from each report and the index axis is
    named ``model_name[6:]``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_cvtrain, df_test)``.
    """
    path_prefix = base_path + 'ing_types_prediction_svc__' + model_name + '__'
    report_paths = [path_prefix + suffix
                    for suffix in ('classification_report_cvtrain_df.pkl',
                                   'classification_report_test_df.pkl')]
    # Read both files first, then tidy them up.
    raw_reports = [pd.read_pickle(path) for path in report_paths]
    df_cvtrain, df_test = [
        report.drop(index=['weighted avg']).rename_axis(model_name[6:])
        for report in raw_reports
    ]
    return df_cvtrain, df_test

In [13]:
cuisine_results_cvtrain = pd.DataFrame()
cuisine_results_test = pd.DataFrame()
# Directory holding the pickled result frames that are read below.
base_path = './resultsLinearSVC/'

models = [
    'model_googlenews',  # 0
    'model_wiki_fasttext',
    'model_im2rec_joint_null',  # 2
    'model_im2rec_joint_avg',
    'model_im2rec_base',  # 4
    'model_im2rec_fasttext'
]

# Build one condensed summary row per model and concatenate them once at the
# end (DataFrame.append no longer exists in pandas 2.x and re-copies the
# growing frame on every iteration).
ing_cat_cvtrain_rows = []
ing_cat_test_rows = []
for i, model_name in enumerate(models):
    file_name_base = base_path + 'ing_types_prediction_svc__' + model_name + '__'

    # Each pickle is read exactly once.
    classification_report_cvtrain_df = pd.read_pickle(file_name_base + 'classification_report_cvtrain_df.pkl')
    classification_report_test_df = pd.read_pickle(file_name_base + 'classification_report_test_df.pkl')
    best_params = pd.read_pickle(file_name_base + 'best_params_df.pkl').T
    # Only row i of the duration frame is kept for the i-th model.
    train_duration = pd.read_pickle(file_name_base + 'train_duration_dict_df.pkl')[i:i + 1]
    shape_dict = pd.read_pickle(file_name_base + 'shape_dict_df.pkl')

    ing_cat_cvtrain_rows.append(get_condensed_ing_type_df(model_name,
                                                          classification_report_cvtrain_df,
                                                          best_params,
                                                          train_duration,
                                                          shape_dict))
    ing_cat_test_rows.append(get_condensed_ing_type_df(model_name,
                                                       classification_report_test_df,
                                                       best_params,
                                                       train_duration,
                                                       shape_dict))

ing_cat_results_cvtrain = pd.concat(ing_cat_cvtrain_rows)
ing_cat_results_test = pd.concat(ing_cat_test_rows)

# Bare expressions: they only display the tables with a named index axis;
# the stored frames themselves keep an unnamed index.
ing_cat_results_cvtrain.rename_axis('cvtrain dataset')
ing_cat_results_test.rename_axis('test dataset')

ingtype_results_cvtrain = ing_cat_results_cvtrain
ingtype_results_test = ing_cat_results_test

Unnamed: 0_level_0,micro-avg-p,micro-avg-r,micro-avg-f1,macro-avg-p,macro-avg-r,macro-avg-f1,C,class_weight,dual,max_iter,multi_class,tol,100 train iter. in minutes,X_cvtrain Shape,X_test Shape
cvtrain dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
googlenews,0.684211,0.684211,0.684211,0.624208,0.66296,0.636028,0.02,balanced,False,3000,ovr,1e-06,7,"(456, 300)","(43, 300)"
wiki_fasttext,0.721821,0.721821,0.721821,0.659976,0.694756,0.673757,0.05,balanced,False,3000,ovr,1e-06,41,"(1384, 300)","(146, 300)"
im2rec_joint_null,0.300885,0.300885,0.300885,0.199645,0.240986,0.214285,1.0,,False,3000,crammer_singer,1e-06,20,"(339, 1024)","(37, 1024)"
im2rec_joint_avg,0.300885,0.300885,0.300885,0.200172,0.235458,0.213121,0.01,,False,3000,crammer_singer,0.0001,20,"(339, 1024)","(37, 1024)"
im2rec_base,0.772861,0.772861,0.772861,0.687432,0.709105,0.695454,0.05,balanced,False,3000,ovr,1e-06,6,"(339, 300)","(37, 300)"
im2rec_fasttext,0.768786,0.768786,0.768786,0.747358,0.700814,0.718177,0.05,,False,3000,ovr,1e-06,27,"(1384, 300)","(146, 300)"


Unnamed: 0_level_0,micro-avg-p,micro-avg-r,micro-avg-f1,macro-avg-p,macro-avg-r,macro-avg-f1,C,class_weight,dual,max_iter,multi_class,tol,100 train iter. in minutes,X_cvtrain Shape,X_test Shape
test dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
googlenews,0.72093,0.72093,0.72093,0.736172,0.715873,0.678733,0.02,balanced,False,3000,ovr,1e-06,7,"(456, 300)","(43, 300)"
wiki_fasttext,0.746575,0.746575,0.746575,0.718039,0.696419,0.691779,0.05,balanced,False,3000,ovr,1e-06,41,"(1384, 300)","(146, 300)"
im2rec_joint_null,0.189189,0.189189,0.189189,0.121795,0.230769,0.153846,1.0,,False,3000,crammer_singer,1e-06,20,"(339, 1024)","(37, 1024)"
im2rec_joint_avg,0.216216,0.216216,0.216216,0.133333,0.269231,0.172161,0.01,,False,3000,crammer_singer,0.0001,20,"(339, 1024)","(37, 1024)"
im2rec_base,0.675676,0.675676,0.675676,0.630952,0.711111,0.636735,0.05,balanced,False,3000,ovr,1e-06,6,"(339, 300)","(37, 300)"
im2rec_fasttext,0.767123,0.767123,0.767123,0.770681,0.678732,0.698164,0.05,,False,3000,ovr,1e-06,27,"(1384, 300)","(146, 300)"


In [14]:
# Per-class reports for models[5] ('model_im2rec_fasttext' in the list above).
(ingtype_best_classification_report_cvtrain,
 ingtype_best_classification_report_test) = get_ingtype_classification_reports(models[5])
# Bare expressions so that both reports are displayed.
ingtype_best_classification_report_cvtrain
ingtype_best_classification_report_test

Unnamed: 0_level_0,precision,recall,f1-score,support
im2rec_fasttext,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
alcoholic_beverage,0.926829,0.844444,0.883721,45.0
animal,0.0,0.0,0.0,17.0
cereal/crop,0.870968,0.75,0.80597,36.0
dairy,0.918919,0.944444,0.931507,36.0
fish/seafood,0.88,0.862745,0.871287,51.0
flower,0.685714,0.4,0.505263,60.0
fruit,0.781065,0.785714,0.783383,168.0
herb,0.648148,0.432099,0.518519,81.0
meat,0.979167,0.903846,0.94,52.0
nut/seed/pulse,0.83871,0.866667,0.852459,30.0


Unnamed: 0_level_0,precision,recall,f1-score,support
im2rec_fasttext,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
alcoholic_beverage,1.0,0.8,0.888889,5.0
animal,0.0,0.0,0.0,1.0
cereal/crop,0.75,1.0,0.857143,3.0
dairy,1.0,1.0,1.0,3.0
fish/seafood,0.75,0.6,0.666667,5.0
flower,0.75,0.5,0.6,6.0
fruit,0.789474,0.833333,0.810811,18.0
herb,0.555556,0.555556,0.555556,9.0
meat,1.0,1.0,1.0,5.0
nut/seed/pulse,1.0,0.666667,0.8,3.0


In [15]:
# Split each summary table into its first six columns (the averaged metrics)
# and the remaining columns, so that each half becomes its own LaTeX table.
ingtype_results_cvtrain1, ingtype_results_cvtrain2 = (
    ingtype_results_cvtrain.iloc[:, :6],
    ingtype_results_cvtrain.iloc[:, 6:],
)
ingtype_results_test1, ingtype_results_test2 = (
    ingtype_results_test.iloc[:, :6],
    ingtype_results_test.iloc[:, 6:],
)

ingtype_results_cvtrain_tex1 = ingtype_results_cvtrain1.to_latex()
ingtype_results_cvtrain_tex2 = ingtype_results_cvtrain2.to_latex()
ingtype_results_test_tex1 = ingtype_results_test1.to_latex()
ingtype_results_test_tex2 = ingtype_results_test2.to_latex()

# Write the four summary tables; each write echoes the number of characters written.
for tex_path, tex_source in (
        ("ingtype_results_cvtrain1.tex", ingtype_results_cvtrain_tex1),
        ("ingtype_results_cvtrain2.tex", ingtype_results_cvtrain_tex2),
        ("ingtype_results_test1.tex", ingtype_results_test_tex1),
        ("ingtype_results_test2.tex", ingtype_results_test_tex2)):
    with open(tex_path, "w") as tex_file:
        tex_file.write(str(tex_source))


# Per-class reports of the selected ingredient-type model.
ingtype_best_cr_cvtrain_tex = ingtype_best_classification_report_cvtrain.to_latex()
ingtype_best_cr_test_tex = ingtype_best_classification_report_test.to_latex()
for tex_path, tex_source in (
        ("ingtype_best_cr_cvtrain.tex", ingtype_best_cr_cvtrain_tex),
        ("ingtype_best_cr_test.tex", ingtype_best_cr_test_tex)):
    with open(tex_path, "w") as tex_file:
        tex_file.write(str(tex_source))

852

1156

852

1156

1314

1314

# Cuisine prediction dataframes

In [16]:
def get_condensed_cuisine_df(model_name, 
                             representation_name, 
                             test_or_cvtrain_df, 
                             best_params_, 
                             train_duration_, 
                             shape_dict_, 
                             tfidf=False):
    """Condense one cuisine-prediction run into a single summary row.

    The row is labelled ``model_name[6:] + representation_name`` and holds, side
    by side, the micro/macro averaged precision, recall and F1, the best
    hyper-parameters, the training duration (column ``'minutes'``) and the two
    shape entries (``'X_cvtrain Shape'``, ``'X_test Shape'``).

    Parameters
    ----------
    model_name : str
        Model identifier; its first six characters are stripped for the row label.
    representation_name : str
        Suffix appended to the shortened model name (e.g. ``'-sum'``).
    test_or_cvtrain_df : pandas.DataFrame
        Classification report with the rows ``'0'``, ``'1'``, ``'2'``, ``'3'``,
        ``'weighted avg'`` (all dropped) plus the two average rows, and a
        ``'support'`` column (dropped).
    best_params_ : pandas.DataFrame
        Row labelled ``0`` with the best parameters; ``'random_state'`` is dropped.
    train_duration_ : pandas.DataFrame
        Frame whose column ``0`` holds the duration; its first row is used.
    shape_dict_ : pandas.DataFrame
        Frame whose column ``0`` holds, per row, a pair that is expanded into
        the two shape columns.
    tfidf : bool, default False
        If True, the macro-average row is expected *before* the micro-average
        row in the report (the two positions are swapped).

    Returns
    -------
    pandas.DataFrame
        The condensed summary row.

    Raises
    ------
    IndexError
        If ``train_duration_`` or ``shape_dict_`` has no rows.
    KeyError
        If an expected row or column label is missing.
    """
    row_label = model_name[6:] + representation_name

    def _first_label_as_zero(frame):
        # Copy ``frame`` with its first index label replaced by 0, so that the
        # row can be renamed through the '0' key below. Working on a copy keeps
        # the caller's frame (and its index) unmodified.
        labels = list(frame.index)
        labels[0] = 0
        relabelled = frame.copy()
        relabelled.index = labels
        return relabelled

    # Expand each pair stored in column 0 into two separate columns.
    shape_dict = _first_label_as_zero(shape_dict_)[0].apply(pd.Series)
    shape_dict = shape_dict.rename(index=str, columns={0: 'X_cvtrain Shape', 1: 'X_test Shape'}).rename(
        {'0': row_label})

    train_duration = _first_label_as_zero(train_duration_).rename(
        index=str, columns={0: 'minutes'}).rename({'0': row_label})

    best_params = best_params_.drop(columns=['random_state']).rename({0: row_label})

    # Keep only the average rows; the per-class rows and the support column are not summarised.
    averages = test_or_cvtrain_df.drop(index=['0', '1', '2', '3', 'weighted avg'], columns='support')

    # The two remaining rows are picked by position.
    if tfidf:
        micro_avg_df, macro_avg_df = averages.iloc[1:2], averages.iloc[0:1]
    else:
        micro_avg_df, macro_avg_df = averages.iloc[0:1], averages.iloc[1:2]

    micro_avg_df = micro_avg_df.rename(
        index=str,
        columns={'precision': 'micro-p',
                 'recall': 'micro-r',
                 'f1-score': 'micro-f1'}
    ).rename({'micro avg': row_label})

    macro_avg_df = macro_avg_df.rename(
        index=str,
        columns={'precision': 'macro-p',
                 'recall': 'macro-r',
                 'f1-score': 'macro-f1'}
    ).rename({'macro avg': row_label})

    # All pieces now share the same single row label, so they line up column-wise.
    return pd.concat([micro_avg_df, macro_avg_df, best_params, train_duration, shape_dict],
                     axis=1, sort=True)


def mk_cuisine_dfs(models_, representations_):
    """Build the cuisine-prediction summary tables for all model/representation pairs.

    For every model in ``models_`` and every representation in
    ``representations_`` the pickled classification reports, best parameters,
    training durations and shapes are read from below the module-level
    ``base_path`` and condensed into one row via ``get_condensed_cuisine_df``.
    The row label suffix is ``'-' + representation_endings[j]``, where
    ``representation_endings`` is a module-level list parallel to
    ``representations_``.

    Parameters
    ----------
    models_ : sequence of str
        Model identifiers used in the pickle file names.
    representations_ : sequence of str
        Representation identifiers used in the pickle file names; the j-th
        representation uses row ``j`` of the per-model duration and shape frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(results_cvtrain_df, results_test_df)``; their index axes are named
        ``'cvtrain dataset'`` and ``'test dataset'``. Both are empty when there
        is nothing to iterate over.
    """
    cvtrain_rows = []
    test_rows = []
    for model_name in models_:
        file_name_base = base_path + 'cuisine_prediction_logreg__' + model_name + '_'
        for j, representation_name in enumerate(representations_):
            results_paths = file_name_base + representation_name + '__'

            # Durations and shapes are stored per model; row j belongs to representation j.
            shape_dict = pd.read_pickle(file_name_base + 'shape_dict_df.pkl')[j:j + 1]
            best_params = pd.read_pickle(results_paths + 'best_params_df.pkl').T
            train_duration = pd.read_pickle(file_name_base + 'train_duration_dict_df.pkl')[j:j + 1]

            classification_report_cvtrain_df = pd.read_pickle(
                results_paths + 'classification_report_cvtrain_df.pkl')
            classification_report_test_df = pd.read_pickle(
                results_paths + 'classification_report_test_df.pkl')
            shortened_rep_name = '-' + representation_endings[j]

            cvtrain_rows.append(get_condensed_cuisine_df(model_name, shortened_rep_name,
                                                         classification_report_cvtrain_df,
                                                         best_params,
                                                         train_duration,
                                                         shape_dict))
            test_rows.append(get_condensed_cuisine_df(model_name, shortened_rep_name,
                                                      classification_report_test_df,
                                                      best_params,
                                                      train_duration,
                                                      shape_dict))

    # Concatenate once instead of growing a frame row by row; pd.concat refuses
    # an empty list, so fall back to an empty frame in that case.
    results_cvtrain_df = pd.concat(cvtrain_rows) if cvtrain_rows else pd.DataFrame()
    results_test_df = pd.concat(test_rows) if test_rows else pd.DataFrame()

    # rename_axis returns a new frame, so its result has to be kept.
    results_cvtrain_df = results_cvtrain_df.rename_axis('cvtrain dataset')
    results_test_df = results_test_df.rename_axis('test dataset')
    return results_cvtrain_df, results_test_df


def append_tfidf(cuisine_results_cvtrain_, cuisine_results_test_):
    """Put the plain tf-idf baseline row on top of the cuisine summary tables.

    The baseline's pickles are read from below the module-level ``base_path``
    using the file-name prefix of ``models[1]`` and the representation
    ``'3ing_list2tfidf_vec'``; row 3 of that model's duration and shape frames
    is used.

    Parameters
    ----------
    cuisine_results_cvtrain_ : pandas.DataFrame
        Summary table for the cv-train data.
    cuisine_results_test_ : pandas.DataFrame
        Summary table for the test data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cvtrain, test)`` tables, each starting with the ``'tfidf'`` row
        followed by the rows of the corresponding input table. The inputs are
        not modified.
    """
    file_name_base = base_path + 'cuisine_prediction_logreg__' + models[1] + '_'
    results_paths = file_name_base + '3ing_list2tfidf_vec' + '__'

    shape_dict = pd.read_pickle(file_name_base + 'shape_dict_df.pkl')[3:4]
    best_params = pd.read_pickle(results_paths + 'best_params_df.pkl').T
    train_duration = pd.read_pickle(file_name_base + 'train_duration_dict_df.pkl')[3:4]

    classification_report_cvtrain_df = pd.read_pickle(results_paths + 'classification_report_cvtrain_df.pkl')
    classification_report_test_df = pd.read_pickle(results_paths + 'classification_report_test_df.pkl')

    # get_condensed_cuisine_df strips the first six characters of the model
    # name, hence the six leading blanks: the resulting row label is 'tfidf'.
    model_name, representation_name = '      tfidf', ''

    tfidf_cuisine_results_cvtrain = get_condensed_cuisine_df(model_name, 
                                                             representation_name,
                                                             classification_report_cvtrain_df,
                                                             best_params,
                                                             train_duration,
                                                             shape_dict,
                                                             tfidf=False)
    tfidf_cuisine_results_test = get_condensed_cuisine_df(model_name, 
                                                          representation_name,
                                                          classification_report_test_df,
                                                          best_params,
                                                          train_duration,
                                                          shape_dict,
                                                          tfidf=False)

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    tfidf_cuisine_results_cvtrain = pd.concat([tfidf_cuisine_results_cvtrain, cuisine_results_cvtrain_])
    tfidf_cuisine_results_test = pd.concat([tfidf_cuisine_results_test, cuisine_results_test_])
    return tfidf_cuisine_results_cvtrain, tfidf_cuisine_results_test


def get_classification_reports(model_name, representation_name, representation_ending):
    """Load the per-class cuisine reports (cv-train and test) of one run.

    The two pickles are read from below the module-level ``base_path``. In each
    report the ``'weighted avg'`` row is removed, the class rows ``'0'``..``'3'``
    are renamed to the cuisine names and the index axis is named
    ``model_name[6:] + '-' + representation_ending``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_cvtrain, df_test)``.
    """
    path_stem = base_path + 'cuisine_prediction_logreg__' + model_name + '_' + representation_name
    cv_train_file_name = path_stem + '__classification_report_cvtrain_df.pkl'
    test_file_name = path_stem + '__classification_report_test_df.pkl'
    df_cvtrain = pd.read_pickle(cv_train_file_name)
    df_test = pd.read_pickle(test_file_name)

    cuisine_names = {'0': 'Western',
                     '1': 'Eastern',
                     '2': 'South-Asian',
                     '3': 'Southern'}

    def _tidy(report):
        # Same clean-up for both reports: drop the weighted average, name the
        # classes, label the index axis with the run.
        without_weighted = report.drop(index=['weighted avg'])
        named = without_weighted.rename(cuisine_names)
        return named.rename_axis(model_name[6:] + '-' + representation_ending)

    df_cvtrain = _tidy(df_cvtrain)
    df_test = _tidy(df_test)
    return df_cvtrain, df_test

In [18]:
# Result directory read by the loader functions.
# Alternative for the logistic-regression results: './resultsLogReg/'
base_path = './resultsLinearSVC/'

models = [
    'model_googlenews',         # 0
    'model_wiki_fasttext',      # 1
    'model_im2rec_joint_null',  # 2
    'model_im2rec_joint_avg',   # 3
    'model_im2rec_base',        # 4
    'model_im2rec_fasttext',    # 5
]

# (representation file-name part, short suffix used in the row labels);
# '3_ing_list2tfidf_vec' is deliberately not part of this list.
representation_specs = [
    ('0_ing_lists2simple_avged_model_vec', 'sum'),
    ('1_ing_list2tfidf_avged_model_vec', 'tfidf'),
    ('2_ing_list2tfidf_avged_model_vec_concat_2tfidfvec', 'concat'),
]
representations = [file_part for file_part, _ in representation_specs]
representation_endings = [ending for _, ending in representation_specs]

(cuisine_results_cvtrain,
 cuisine_results_test) = mk_cuisine_dfs(models, representations)
# Bare expressions so that both tables are displayed.
cuisine_results_cvtrain
cuisine_results_test

Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
googlenews-sum,0.815417,0.815417,0.815417,0.612687,0.738439,0.657196,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.810675,0.810675,0.810675,0.607128,0.733855,0.655117,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.807822,0.807822,0.807822,0.593693,0.740496,0.644301,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.81823,0.81823,0.81823,0.615099,0.730458,0.655571,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.816559,0.816559,0.816559,0.626505,0.724543,0.666916,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.814612,0.814612,0.814612,0.629419,0.726221,0.669161,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.818967,0.818967,0.818967,0.627987,0.727637,0.66761,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.813282,0.813282,0.813282,0.615648,0.727338,0.65947,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.812495,0.812495,0.812495,0.608471,0.737542,0.656126,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"
im2rec_joint_avg-sum,0.823098,0.823098,0.823098,0.654947,0.705273,0.677936,10.0,balanced,True,1000,394,"(50836, 1024)","(5647, 1024)"


Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
googlenews-sum,0.799575,0.799575,0.799575,0.558444,0.662334,0.584284,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.789837,0.789837,0.789837,0.548688,0.644094,0.570077,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.789483,0.789483,0.789483,0.544536,0.664274,0.578127,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.790545,0.790545,0.790545,0.546277,0.654734,0.566756,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.792139,0.792139,0.792139,0.57598,0.642186,0.601319,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.790722,0.790722,0.790722,0.584665,0.639792,0.606772,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.795821,0.795821,0.795821,0.574389,0.611854,0.581965,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.796706,0.796706,0.796706,0.564317,0.630353,0.579569,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.786081,0.786081,0.786081,0.545738,0.646852,0.576712,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"
im2rec_joint_avg-sum,0.803613,0.803613,0.803613,0.594594,0.594923,0.586057,10.0,balanced,True,1000,394,"(50836, 1024)","(5647, 1024)"


In [19]:
# Model and representation configuration followed by a rebuild of the cuisine tables.
models = [
    'model_googlenews',         # 0
    'model_wiki_fasttext',      # 1
    'model_im2rec_joint_null',  # 2
    'model_im2rec_joint_avg',   # 3
    'model_im2rec_base',        # 4
    'model_im2rec_fasttext',    # 5
]

# (representation file-name part, short suffix used in the row labels);
# '3_ing_list2tfidf_vec' is deliberately not part of this list.
representation_specs = [
    ('0_ing_lists2simple_avged_model_vec', 'sum'),
    ('1_ing_list2tfidf_avged_model_vec', 'tfidf'),
    ('2_ing_list2tfidf_avged_model_vec_concat_2tfidfvec', 'concat'),
]
representations = [file_part for file_part, _ in representation_specs]
representation_endings = [ending for _, ending in representation_specs]

(cuisine_results_cvtrain,
 cuisine_results_test) = mk_cuisine_dfs(models, representations)
# Bare expressions so that both tables are displayed.
cuisine_results_cvtrain
cuisine_results_test

Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
googlenews-sum,0.815417,0.815417,0.815417,0.612687,0.738439,0.657196,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.810675,0.810675,0.810675,0.607128,0.733855,0.655117,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.807822,0.807822,0.807822,0.593693,0.740496,0.644301,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.81823,0.81823,0.81823,0.615099,0.730458,0.655571,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.816559,0.816559,0.816559,0.626505,0.724543,0.666916,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.814612,0.814612,0.814612,0.629419,0.726221,0.669161,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.818967,0.818967,0.818967,0.627987,0.727637,0.66761,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.813282,0.813282,0.813282,0.615648,0.727338,0.65947,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.812495,0.812495,0.812495,0.608471,0.737542,0.656126,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"
im2rec_joint_avg-sum,0.823098,0.823098,0.823098,0.654947,0.705273,0.677936,10.0,balanced,True,1000,394,"(50836, 1024)","(5647, 1024)"


Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
googlenews-sum,0.799575,0.799575,0.799575,0.558444,0.662334,0.584284,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.789837,0.789837,0.789837,0.548688,0.644094,0.570077,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.789483,0.789483,0.789483,0.544536,0.664274,0.578127,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.790545,0.790545,0.790545,0.546277,0.654734,0.566756,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.792139,0.792139,0.792139,0.57598,0.642186,0.601319,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.790722,0.790722,0.790722,0.584665,0.639792,0.606772,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.795821,0.795821,0.795821,0.574389,0.611854,0.581965,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.796706,0.796706,0.796706,0.564317,0.630353,0.579569,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.786081,0.786081,0.786081,0.545738,0.646852,0.576712,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"
im2rec_joint_avg-sum,0.803613,0.803613,0.803613,0.594594,0.594923,0.586057,10.0,balanced,True,1000,394,"(50836, 1024)","(5647, 1024)"


In [48]:
# Rebuild the cuisine tables and put the tf-idf baseline row on top of each.
(cuisine_results_cvtrain,
 cuisine_results_test) = mk_cuisine_dfs(models, representations)
(tfidf_cuisine_results_cvtrain,
 tfidf_cuisine_results_test) = append_tfidf(cuisine_results_cvtrain, cuisine_results_test)
# Bare expressions so that both tables are displayed.
tfidf_cuisine_results_cvtrain
tfidf_cuisine_results_test

Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
tfidf,0.815477,0.815477,0.815477,0.618125,0.737758,0.663677,100.0,balanced,True,1000,6,"(50850, 381)","(5648, 381)"
googlenews-sum,0.815417,0.815417,0.815417,0.612687,0.738439,0.657196,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.810675,0.810675,0.810675,0.607128,0.733855,0.655117,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.807822,0.807822,0.807822,0.593693,0.740496,0.644301,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.81823,0.81823,0.81823,0.615099,0.730458,0.655571,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.816559,0.816559,0.816559,0.626505,0.724543,0.666916,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.814612,0.814612,0.814612,0.629419,0.726221,0.669161,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.818967,0.818967,0.818967,0.627987,0.727637,0.66761,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.813282,0.813282,0.813282,0.615648,0.727338,0.65947,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.812495,0.812495,0.812495,0.608471,0.737542,0.656126,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"


Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
tfidf,0.788598,0.788598,0.788598,0.564138,0.643382,0.591236,100.0,balanced,True,1000,6,"(50850, 381)","(5648, 381)"
googlenews-sum,0.799575,0.799575,0.799575,0.558444,0.662334,0.584284,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.789837,0.789837,0.789837,0.548688,0.644094,0.570077,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.789483,0.789483,0.789483,0.544536,0.664274,0.578127,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.790545,0.790545,0.790545,0.546277,0.654734,0.566756,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.792139,0.792139,0.792139,0.57598,0.642186,0.601319,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.790722,0.790722,0.790722,0.584665,0.639792,0.606772,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.795821,0.795821,0.795821,0.574389,0.611854,0.581965,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.796706,0.796706,0.796706,0.564317,0.630353,0.579569,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.786081,0.786081,0.786081,0.545738,0.646852,0.576712,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"


In [49]:
from IPython.display import display, HTML
# Style sheet shared by both tables: the first body row stays white, the
# following rows are shaded in alternating groups of three.
banded_rows_css = '''
        <style>
            .df tbody tr:first-child{
              background: #ffffff;
            }
            .df tbody tr:nth-child(6n+2),
            .df tbody tr:nth-child(6n+3),
            .df tbody tr:nth-child(6n+4) {
              background: #eeeeee;
            }
            .df tbody tr:nth-child(6n+5),
            .df tbody tr:nth-child(6n+6),
            .df tbody tr:nth-child(6n+7) {
              background: #ffffff;
            } 
        </style>
        '''

# Bare expressions so that both styled tables are rendered.
HTML(banded_rows_css + tfidf_cuisine_results_cvtrain.to_html(classes="df"))

HTML(banded_rows_css + tfidf_cuisine_results_test.to_html(classes="df"))

Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
tfidf,0.815477,0.815477,0.815477,0.618125,0.737758,0.663677,100.0,balanced,True,1000,6,"(50850, 381)","(5648, 381)"
googlenews-sum,0.815417,0.815417,0.815417,0.612687,0.738439,0.657196,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.810675,0.810675,0.810675,0.607128,0.733855,0.655117,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.807822,0.807822,0.807822,0.593693,0.740496,0.644301,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.81823,0.81823,0.81823,0.615099,0.730458,0.655571,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.816559,0.816559,0.816559,0.626505,0.724543,0.666916,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.814612,0.814612,0.814612,0.629419,0.726221,0.669161,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.818967,0.818967,0.818967,0.627987,0.727637,0.66761,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.813282,0.813282,0.813282,0.615648,0.727338,0.65947,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.812495,0.812495,0.812495,0.608471,0.737542,0.656126,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"


Unnamed: 0,micro-p,micro-r,micro-f1,macro-p,macro-r,macro-f1,C,class_weight,dual,max_iter,minutes,X_cvtrain Shape,X_test Shape
tfidf,0.788598,0.788598,0.788598,0.564138,0.643382,0.591236,100.0,balanced,True,1000,6,"(50850, 381)","(5648, 381)"
googlenews-sum,0.799575,0.799575,0.799575,0.558444,0.662334,0.584284,0.1,balanced,True,1000,188,"(50828, 300)","(5648, 300)"
googlenews-tfidf,0.789837,0.789837,0.789837,0.548688,0.644094,0.570077,10.0,balanced,True,1000,84,"(50828, 300)","(5648, 300)"
googlenews-concat,0.789483,0.789483,0.789483,0.544536,0.664274,0.578127,5.0,balanced,True,1000,72,"(50828, 614)","(5648, 614)"
wiki_fasttext-sum,0.790545,0.790545,0.790545,0.546277,0.654734,0.566756,0.1,balanced,True,1000,137,"(50850, 300)","(5648, 300)"
wiki_fasttext-tfidf,0.792139,0.792139,0.792139,0.57598,0.642186,0.601319,10.0,balanced,True,1000,78,"(50850, 300)","(5648, 300)"
wiki_fasttext-concat,0.790722,0.790722,0.790722,0.584665,0.639792,0.606772,10.0,balanced,True,1000,78,"(50850, 681)","(5648, 681)"
im2rec_joint_null-sum,0.795821,0.795821,0.795821,0.574389,0.611854,0.581965,5.0,balanced,True,1000,388,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-tfidf,0.796706,0.796706,0.796706,0.564317,0.630353,0.579569,100.0,balanced,True,1000,184,"(50836, 1024)","(5647, 1024)"
im2rec_joint_null-concat,0.786081,0.786081,0.786081,0.545738,0.646852,0.576712,50.0,balanced,True,1000,148,"(50836, 1326)","(5647, 1326)"


In [50]:
# Split each table into its first six columns (the averaged metrics) and the rest.
tfidf_cuisine_results_cvtrain1, tfidf_cuisine_results_cvtrain2 = (
    tfidf_cuisine_results_cvtrain.iloc[:, :6],
    tfidf_cuisine_results_cvtrain.iloc[:, 6:],
)
tfidf_cuisine_results_test1, tfidf_cuisine_results_test2 = (
    tfidf_cuisine_results_test.iloc[:, :6],
    tfidf_cuisine_results_test.iloc[:, 6:],
)

In [11]:
# Per-class reports of the selected run.
# LinearSVC results: models[1] with representations[2] / representation_endings[2] (used here).
# For the logistic-regression results, pass models[2], representations[0] and
# representation_endings[0] instead.
(best_cr_cvtrain,
 best_cr_test) = get_classification_reports(models[1],
                                            representations[2],
                                            representation_endings[2])
# Bare expressions so that both reports are displayed.
best_cr_cvtrain
best_cr_test

Unnamed: 0_level_0,precision,recall,f1-score,support
wiki_fasttext-concat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western,0.726041,0.841751,0.779626,2673.0
Eastern,0.41535,0.658318,0.509343,559.0
South-Asian,0.470645,0.540288,0.503067,7285.0
Southern,0.905641,0.864528,0.884607,40333.0
micro avg,0.814612,0.814612,0.814612,50850.0
macro avg,0.629419,0.726221,0.669161,50850.0


Unnamed: 0_level_0,precision,recall,f1-score,support
wiki_fasttext-concat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western,0.60339,0.601351,0.602369,296.0
Eastern,0.430108,0.645161,0.516129,62.0
South-Asian,0.423529,0.444994,0.433996,809.0
Southern,0.881633,0.867663,0.874592,4481.0
micro avg,0.790722,0.790722,0.790722,5648.0
macro avg,0.584665,0.639792,0.606772,5648.0


In [None]:
# Tag inserted into every exported file name so that results of different
# classifiers do not overwrite each other.
# NOTE(review): presumably this has to match the model whose reports were
# loaded into best_cr_cvtrain / best_cr_test -- confirm before exporting.
filename_addition = 'SVC'
#filename_addition = 'logReg'


def _export_latex(frame, path):
    """Render `frame` as a LaTeX table, write it to `path`, and return the LaTeX source.

    Parameters
    ----------
    frame : object exposing ``to_latex()`` (e.g. a pandas DataFrame)
        Table to export.
    path : str
        Destination file; it is created or overwritten.

    Returns
    -------
    str
        The LaTeX source that was written to `path`.
    """
    # to_latex() without a buffer argument already returns a str,
    # so no extra str() conversion is needed before writing.
    latex_source = frame.to_latex()
    with open(path, "w") as tex_file:
        tex_file.write(latex_source)
    return latex_source


# Overview tables on the cv-train data (part 1 and part 2 of the column split).
cuisine_results_cvtrain_tex1 = _export_latex(
    tfidf_cuisine_results_cvtrain1,
    "cuisine_results_cvtrain" + filename_addition + "1.tex")
cuisine_results_cvtrain_tex2 = _export_latex(
    tfidf_cuisine_results_cvtrain2,
    "cuisine_results_cvtrain" + filename_addition + "2.tex")

# Overview tables on the test data (part 1 and part 2 of the column split).
cuisine_results_test_tex1 = _export_latex(
    tfidf_cuisine_results_test1,
    "cuisine_results_test" + filename_addition + "1.tex")
cuisine_results_test_tex2 = _export_latex(
    tfidf_cuisine_results_test2,
    "cuisine_results_test" + filename_addition + "2.tex")

# Per-class classification reports of the selected configuration. These are
# the frames assigned from get_classification_reports(...) as
# best_cr_cvtrain / best_cr_test; the previously used names
# best_classification_report_cvtrain / best_classification_report_test are
# not what that call assigns, so a fresh run would fail or export stale data.
best_cr_cvtrain_tex = _export_latex(
    best_cr_cvtrain,
    "cuisine_results_best_cr_cvtrain" + filename_addition + ".tex")
best_cr_test_tex = _export_latex(
    best_cr_test,
    "cuisine_results_best_cr_test" + filename_addition + ".tex")