### Count Vectoriser - Train Test

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Estimate accuracy by holding out part of the training data.
# test_size=0.33 keeps about two thirds for training and one third for testing.
data = pd.read_csv("recipe_train.csv")
X = data.iloc[:, :-1]  # every column except the last one
y = data.iloc[:, -1]   # the last column is used as the target
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.33)


def _numeric_feature(frame, column):
    """Return `column` of `frame` as a one-column DataFrame with a fresh 0..n-1 index.

    The index is reset so the rows line up positionally with the vectorised
    frames (which carry a default index) when everything is concatenated.
    """
    return frame[[column]].reset_index(drop=True)


def _vocabulary(vectorizer):
    """Return the fitted vectorizer's feature names on old and new scikit-learn.

    `get_feature_names` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out`; prefer the new method and fall back to the old one.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return getter()


def _count_features(vectorizer, train_text, test_text):
    """Fit `vectorizer` on `train_text` and vectorise both text columns.

    The vocabulary is learned from the training text only and then applied to
    the test text. Returns `(train_counts, test_counts, train_frame, test_frame)`:
    the two sparse count matrices and their dense DataFrame equivalents, with
    one column per vocabulary term.
    """
    train_counts = vectorizer.fit_transform(train_text)
    test_counts = vectorizer.transform(test_text)
    columns = _vocabulary(vectorizer)
    train_frame = pd.DataFrame(train_counts.toarray(), columns=columns)
    test_frame = pd.DataFrame(test_counts.toarray(), columns=columns)
    return train_counts, test_counts, train_frame, test_frame


train_n_steps = _numeric_feature(X_train_raw, "n_steps")
test_n_steps = _numeric_feature(X_test_raw, "n_steps")

train_n_ingredients = _numeric_feature(X_train_raw, "n_ingredients")
test_n_ingredients = _numeric_feature(X_test_raw, "n_ingredients")

# One vectorizer instance is re-fitted for each text column in turn, so after
# this cell `vec` holds the vocabulary of 'ingredients' only.
vec = CountVectorizer(stop_words='english')

# count vectorizer on feature 'name'
X_train_name, X_test_name, df_train_name, df_test_name = _count_features(
    vec, X_train_raw["name"], X_test_raw["name"])

# count vectorizer on feature 'steps'
X_train_steps, X_test_steps, df_train_steps, df_test_steps = _count_features(
    vec, X_train_raw["steps"], X_test_raw["steps"])

# count vectorizer on feature 'ingredients'
X_train_ing, X_test_ing, df_train_ing, df_test_ing = _count_features(
    vec, X_train_raw["ingredients"], X_test_raw["ingredients"])

# put all into one dataframe
# The three vocabularies are independent, so the same token may appear as a
# column label more than once in the combined frames.
train = pd.concat(
    [df_train_name, df_train_steps, df_train_ing, train_n_steps, train_n_ingredients],
    axis=1)
test = pd.concat(
    [df_test_name, df_test_steps, df_test_ing, test_n_steps, test_n_ingredients],
    axis=1)
print(train.shape)

#### Stacking

In [None]:
#STACKING FROM W8 Prac
from sklearn.metrics import accuracy_score
import numpy as np
# Seed NumPy's global random number generator so later NumPy-based randomness is repeatable.
# NOTE(review): in file order this runs after the train/test split earlier in the
# notebook, so it presumably does not make that split reproducible - confirm the
# intended cell execution order.
np.random.seed(1)

class StackingClassifier():
    """Two-level stacked ensemble.

    Every base classifier is fitted on the input features; their per-class
    scores are concatenated column-wise and used as the feature matrix of the
    meta-classifier, which produces the final prediction.

    Parameters
    ----------
    classifiers : list
        Base estimators exposing `fit(X, y)` and either `predict_proba(X)`
        or `decision_function(X)`.
    metaclassifier : estimator
        Estimator exposing `fit(X, y)` and `predict(X)`; trained on the
        stacked base-classifier scores.
    """

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        """Fit all base classifiers on (X, y), then the meta-classifier.

        The meta-classifier is trained on the base classifiers' scores for the
        same X they were fitted on. Returns `self` so calls can be chained.
        """
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
        return self

    def _predict_base(self, X):
        """Return the base classifiers' scores for X, stacked column-wise.

        `predict_proba` is used when a classifier provides it; otherwise the
        block falls back to `decision_function` so margin-based models
        without probability estimates can also serve as base learners.
        """
        yhats = []
        for clf in self.classifiers:
            if hasattr(clf, "predict_proba"):
                yhat = clf.predict_proba(X)
            else:
                yhat = np.asarray(clf.decision_function(X))
                # A 1-D result is turned into a single column so it can be
                # concatenated with the 2-D outputs of the other classifiers.
                if yhat.ndim == 1:
                    yhat = yhat.reshape(-1, 1)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        # Internal sanity check: one row of meta-features per input row.
        assert yhats.shape[0] == X.shape[0]
        return yhats

    def predict(self, X):
        """Predict labels for X with the meta-classifier on the stacked base scores."""
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat

    def score(self, X, y):
        """Return the accuracy of `predict(X)` against the true labels y."""
        yhat = self.predict(X)
        return accuracy_score(y, yhat)

#### No Feature Selection

In [None]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

# Base learners, each evaluated on its own against the hold-out split.
# All are constructed with their default hyper-parameters.
models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]

# Display names, in the same order as `models`.
titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

# Fit each model on the training part and report hold-out accuracy together
# with the wall-clock time taken by fit + score.
for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(train,y_train_raw)
    acc = model.score(test,y_test_raw)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title, "Accuracy:",acc, 'Time:', t," s")


# Fit the stacked ensemble and write its predictions for `test` to a CSV file.
# NOTE(review): `stacker_lr1` is not defined in this part of the notebook -
# presumably it is built in another cell; confirm it exists before running.
start = time.time()
print("START meta learner: Logistic Regression ",start)
# `train` was built from X_train_raw, so the matching labels are y_train_raw
# (the name `y_train` is not defined alongside this split).
stacker_lr1.fit(train, y_train_raw)
stacker_lr1_res= stacker_lr1.predict(test)
end = time.time()
print("END meta learner: Logistic Regression ",end)
t = end - start
print('Time:', t," s")
# Store predictions under a 1-based 'id' index with a 'duration_label' column.
df_res = pd.DataFrame(stacker_lr1_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
# NOTE(review): the file name mentions chi2 although this section applies no
# feature selection - confirm the intended output name.
df_res.to_csv('df_CV_chi2_stack1_Log_Reg.csv')

MNB
START  1621409677.353326
END  1621409747.166736
MNB Accuracy: 0.7381060606060607 Time: 69.81340980529785  s
LinearSVC
START  1621409747.180112




END  1621409827.757466
LinearSVC Accuracy: 0.7533333333333333 Time: 80.57735419273376  s
Decision Tree
START  1621409827.76157
