In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Test to see approximate accuracy by splitting the training data to train and test data = 7:3

train_data = pd.read_csv("recipe_train.csv")
X_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]
test_data = pd.read_csv("recipe_test.csv")
X_test = test_data.iloc[:,:]
# X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X,y,test_size=0.33)

train_n_steps = pd.DataFrame(X_train.n_steps)
train_n_steps.reset_index(drop=True, inplace=True)
test_n_steps = pd.DataFrame(X_test.n_steps)
test_n_steps.reset_index(drop=True, inplace=True)

train_n_ingredients = pd.DataFrame(X_train.n_ingredients)
train_n_ingredients.reset_index(drop=True, inplace=True)
test_n_ingredients = pd.DataFrame(X_test.n_ingredients)
test_n_ingredients.reset_index(drop=True, inplace=True)

# tfidf vectorizer on feature 'name'
print("Feature Name")
vec = TfidfVectorizer(stop_words='english')
print("Fit and transform")
X_train_name = vec.fit_transform(X_train.name)
X_test_name = vec.transform(X_test.name)
# put to dataframe
df_train_name = pd.DataFrame(X_train_name.todense(),columns = vec.get_feature_names())
df_test_name = pd.DataFrame(X_test_name.todense(),columns = vec.get_feature_names())

# tfidf vectorizer on feature 'steps'
X_train_steps = vec.fit_transform(X_train.steps)
X_test_steps = vec.transform(X_test.steps)
df_train_steps = pd.DataFrame(X_train_steps.todense(),columns = vec.get_feature_names())
df_test_steps = pd.DataFrame(X_test_steps.todense(),columns = vec.get_feature_names())

# count vectorizer on feature 'ingredients'
X_train_ing = vec.fit_transform(X_train.ingredients)
X_test_ing = vec.transform(X_test.ingredients)
df_train_ing = pd.DataFrame(X_train_ing.todense(),columns = vec.get_feature_names())
df_test_ing = pd.DataFrame(X_test_ing.todense(),columns = vec.get_feature_names())
print(X_train_ing.shape, X_test_ing.shape)

train = pd.concat([df_train_name, df_train_steps,df_train_ing,train_n_steps,train_n_ingredients],axis=1)
test = pd.concat([df_test_name, df_test_steps,df_test_ing,test_n_steps,test_n_ingredients],axis=1)

Feature Name
Fit and transform
(40000, 2906) (10000, 2906)


In [2]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime
models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]
titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(train,y_train)
    result = model.predict(test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_TFIDF_res_'+title+'_full.csv')

MNB
START  1621322258.743366
END  1621322326.376809
MNB Time: 67.63344287872314  s
LinearSVC
START  1621322326.531439




END  1621322452.970061
LinearSVC Time: 126.43862199783325  s
Decision Tree
START  1621322453.0159192
END  1621322588.3886662
Decision Tree Time: 135.3727469444275  s
Logistic Regression
START  1621322588.4469812


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


END  1621324193.710724
Logistic Regression Time: 1605.2637429237366  s


In [2]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

In [3]:
#STACKING FROM W8 Prac
from sklearn.metrics import accuracy_score
import numpy as np
np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    

print("#1 STACKING META LEARNER - TFIDF : LOGISTIC REG , BASE: MNB, DT ")
classifiers1 = [MultinomialNB(),
          DecisionTreeClassifier()]
titles1 = ['MNB',
          'Decision Tree',]

meta_classifier_lr1 = LogisticRegression()
stacker_lr1 = StackingClassifier(classifiers1, meta_classifier_lr1)

print("Base Learners: MNB & DT")
for title, clf in zip(titles1, classifiers1):
    print(title)
    start = time.time()
    print("START ",start)
    clf.fit(train,y_train)
    result = clf.predict(test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_TFIDF_base_1_'+title+'.csv')

start = time.time()
print("START meta learner: Logistic Regression ",start)
stacker_lr1.fit(train, y_train)
stacker_lr1_res= stacker_lr1.predict(test)
end = time.time()
print("END meta learner: Logistic Regression ",end)
t = end - start
print('Time Meta Learner:', t," s")
df_res = pd.DataFrame(stacker_lr1_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_TFIDF_stack1_Log_Reg.csv')



#1 STACKING META LEARNER - TFIDF : LOGISTIC REG , BASE: MNB, DT 
Base Learners: MNB & DT
MNB
START  1621329208.0258298
END  1621329278.503445
MNB Time: 70.47761511802673  s
Decision Tree
START  1621329278.660442
END  1621329409.7353961
Decision Tree Time: 131.07495403289795  s
START meta learner: Logistic Regression  1621329409.802856
END meta learner: Logistic Regression  1621329626.9910522
Time Meta Learner: 217.18819618225098  s


In [None]:
print("#2 STACKING META LEARNER: LOGISTIC REG , BASE: MNB, DT, SVM ")
classifiers2 = [MultinomialNB(),
                DecisionTreeClassifier(),
                svm.LinearSVC()]

titles2 = ['MNB',
          'Decision Tree',
          'LinearSVC',]

meta_classifier_lr2 = LogisticRegression()
stacker_lr2 = StackingClassifier(classifiers, meta_classifier_lr2)

for title, clf in zip(titles1, classifiers2):
    print(title)
    start = time.time()
    print("START ",start)
    clf.fit(train,y_train)
    result = clf.predict(test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_TFIDF_base_2_'+title+'.csv')
    
    
stacker_lr2.fit(train, y_train)
stacker_lr2_res= stacker_lr2.predict(test)
df_res = pd.DataFrame(stacker_lr2_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_TFIDF_stack2_Log_reg.csv')



In [None]:
print("#3 STACKING META LEARNER: DT , BASE: MNB, LOGISTIC REG ")
classifiers3 = [MultinomialNB(),
                LogisticRegression()]
titles3 = ['MNB',
          'Logistic Regression',]

meta_classifier_lr3 = DecisionTreeClassifier()
stacker_lr3 = StackingClassifier(classifiers, meta_classifier_lr3)

for title, clf in zip(titles1, classifiers3):
    print(title)
    start = time.time()
    print("START ",start)
    clf.fit(train,y_train)
    result = clf.predict(test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_TFIDF_base_3_'+title+'.csv')
    
stacker_lr3.fit(train, y_train)
stacker_lr3_res= stacker_lr3.predict(test)
df_res = pd.DataFrame(stacker_lr3_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_TFIDF_stack3_dt.csv')



In [None]:
print("#4 STACKING META LEARNER: DT , BASE: MNB, LOGISTIC REG, SVM ")
classifiers3 = [MultinomialNB(),
                LogisticRegression(),
                svm.LinearSVC()]
titles3 = ['MNB',
          'Logistic Regression',
          'LinearSVM']

meta_classifier_lr4 = DecisionTreeClassifier()
stacker_lr4 = StackingClassifier(classifiers, meta_classifier_lr4)

for title, clf in zip(titles1, classifiers4):
    print(title)
    start = time.time()
    print("START ",start)
    clf.fit(train,y_train)
    result = clf.predict(test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_TFIDF_base_4_'+title+'.csv')
    
    
stacker_lr4.fit(train, y_train)
stacker_lr4_res= stacker_lr4.predict(test)
df_res = pd.DataFrame(stacker_lr4_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_TFIDF_stack4_dt.csv')
