In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Columns that hold free text; they are dropped from the tabular features below.
string_cols = ['name', 'ingredients', 'steps']

# Read the train and test tables. header=0 takes the column names from the
# first row; index_col=False stops pandas from using the first column as index.
x_train_original = pd.read_csv(r"recipe_train.csv", sep=',', header=0, index_col=False)
X_test_original = pd.read_csv(r"recipe_test.csv", sep=',', header=0, index_col=False)

# Split the training table into the label column and the remaining features.
y_train = x_train_original['duration_label']
X_train = x_train_original.drop(columns='duration_label')

# Feature tables without the string columns.
X_train1 = X_train.drop(columns=string_cols)
X_test1 = X_test_original.drop(columns=string_cols)



In [4]:
# Sparse train matrices for name, ingredients, steps
# (.npz files under recipe_text_features_countvec/).
name_sparse_matrix = scipy.sparse.load_npz('recipe_text_features_countvec/train_name_vec.npz')
ingredients_sparse_matrix = scipy.sparse.load_npz('recipe_text_features_countvec/train_ingr_vec.npz')
steps_sparse_matrix = scipy.sparse.load_npz('recipe_text_features_countvec/train_steps_vec.npz')

# Sparse test matrices for name, ingredients, steps.
name_test_matrix = scipy.sparse.load_npz('recipe_text_features_countvec/test_name_vec.npz')
ingredients_test_matrix = scipy.sparse.load_npz('recipe_text_features_countvec/test_ingr_vec.npz')
# The stray trailing backslash (line continuation) that used to end the next
# statement has been removed: it only parsed because a blank line followed it.
steps_test_matrix = scipy.sparse.load_npz('recipe_text_features_countvec/test_steps_vec.npz')


def _read_doc2vec_csv(path):
    """Read one headerless, comma-separated Doc2Vec feature file.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The file contents with default integer column labels (header=None)
        and a default index (index_col=False).
    """
    return pd.read_csv(path, index_col=False, delimiter=',', header=None)


# Train doc2vec100 features for name, ingredients, steps.
d2v_name_df = _read_doc2vec_csv(r"recipe_text_features_doc2vec100/train_name_doc2vec100.csv")
d2v_ingredients_df = _read_doc2vec_csv(r"recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv")
d2v_steps_df = _read_doc2vec_csv(r"recipe_text_features_doc2vec100/train_steps_doc2vec100.csv")

# Test doc2vec100 features for name, ingredients, steps.
test_d2v_name_df = _read_doc2vec_csv(r"recipe_text_features_doc2vec100/test_name_doc2vec100.csv")
test_d2v_ingredients_df = _read_doc2vec_csv(r"recipe_text_features_doc2vec100/test_ingr_doc2vec100.csv")
test_d2v_steps_df = _read_doc2vec_csv(r"recipe_text_features_doc2vec100/test_steps_doc2vec100.csv")

In [5]:
# For each text feature (name, ingredients, steps): fit a SelectKBest selector
# scored with f_classif against y_train, keep the 50 best columns of the sparse
# training matrix, and turn the dense result into a DataFrame.
# The fitted selectors are kept so the same columns can be picked from test data.

# --- name ---
Kbest_name = SelectKBest(score_func=f_classif, k=50)
best_name_cols = Kbest_name.fit_transform(name_sparse_matrix, y_train)
best_name_df = pd.DataFrame(data=best_name_cols.toarray())

# --- ingredients ---
Kbest_ingr = SelectKBest(score_func=f_classif, k=50)
best_ingredients_cols = Kbest_ingr.fit_transform(ingredients_sparse_matrix, y_train)
best_ingredients_df = pd.DataFrame(data=best_ingredients_cols.toarray())

# --- steps ---
Kbest_steps = SelectKBest(score_func=f_classif, k=50)
best_steps_cols = Kbest_steps.fit_transform(steps_sparse_matrix, y_train)
best_steps_df = pd.DataFrame(data=best_steps_cols.toarray())

# Append the selected text columns to the tabular features, side by side.
# ignore_index=True renumbers the resulting column labels 0..n-1.
X_train2 = pd.concat(
    [X_train1, best_name_df, best_ingredients_df, best_steps_df],
    axis='columns',
    ignore_index=True,
)

In [6]:
# Test-set counterpart of the training feature selection: the selectors
# Kbest_name / Kbest_ingr / Kbest_steps are only applied here (transform, no
# refit), then each dense result becomes a DataFrame.

# --- name ---
best_name_cols_test = Kbest_name.transform(name_test_matrix)
best_name_test_df = pd.DataFrame(data=best_name_cols_test.toarray())

# --- ingredients ---
best_ingredients_cols_test = Kbest_ingr.transform(ingredients_test_matrix)
best_ingredients_test_df = pd.DataFrame(data=best_ingredients_cols_test.toarray())

# --- steps ---
best_steps_cols_test = Kbest_steps.transform(steps_test_matrix)
best_steps_test_df = pd.DataFrame(data=best_steps_cols_test.toarray())

# Append the selected text columns to the tabular test features, side by side.
# ignore_index=True renumbers the resulting column labels 0..n-1.
X_test2 = pd.concat(
    [X_test1, best_name_test_df, best_ingredients_test_df, best_steps_test_df],
    axis='columns',
    ignore_index=True,
)


In [7]:
# Merge the Doc2Vec features (name, ingredients, steps) column-wise, then
# reduce them with PCA.
train_doc2vec100 = pd.concat([d2v_name_df, d2v_ingredients_df, d2v_steps_df], axis = 1, ignore_index = True)
test_doc2vec100 = pd.concat([test_d2v_name_df, test_d2v_ingredients_df, test_d2v_steps_df], axis = 1, ignore_index = True)

# Keep 175 principal components. The projection is fitted on the training
# features only and then applied unchanged to the test features.
# random_state is fixed so that the components are reproducible if
# scikit-learn's default svd_solver='auto' selects a randomized solver
# (depends on library version and data shape); deterministic solvers ignore it.
pca = PCA(n_components = 175, random_state = 0)
train_pca_doc2vec = pd.DataFrame(pca.fit_transform(train_doc2vec100))
test_pca_doc2vec = pd.DataFrame(pca.transform(test_doc2vec100))

# Append the PCA components to the previously assembled feature tables.
X_train3 = pd.concat([X_train2, train_pca_doc2vec], axis = 1, ignore_index = True)
X_test3 = pd.concat([X_test2, test_pca_doc2vec], axis = 1, ignore_index = True)


# Uncomment to inspect the cumulative explained variance (in percent) per
# number of components:
#var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
#var

In [8]:
# Fit each classifier on the combined training features (X_train3, y_train)
# and predict labels for the test features (X_test3).
# NOTE(review): the original note here mentioned commented-out code that writes
# prediction files for Kaggle; no such code sits next to these statements --
# confirm where it lives.

# Random forest: random_state is fixed because bootstrap sampling and feature
# sub-sampling are random, so unseeded runs give different predictions.
random_forest = RandomForestClassifier(random_state=0)
random_forest = random_forest.fit(X_train3, y_train)
rf_pred = random_forest.predict(X_test3)

# Decision tree: seeded as well, since features are randomly permuted at each
# split and ties between equally good splits are broken at random.
Dtree_classifier = DecisionTreeClassifier(random_state=0)
Dtree_classifier = Dtree_classifier.fit(X_train3, y_train)
predictions = Dtree_classifier.predict(X_test3)

# Gaussian naive Bayes takes no seed, so it is left as it was.
gaussNB = GaussianNB()
gaussNB = gaussNB.fit(X_train3, y_train)
gauss_pred = gaussNB.predict(X_test3)