## COUNT VECTORIZER - Real Train & Test

### Raw Train & Test Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Locations of the training CSV and the test CSV.
train_file = "./COMP30027_2021_Project2_datasets/recipe_train.csv"
test_file = "./COMP30027_2021_Project2_datasets/recipe_test.csv"

# Training data: the last column is split off as the target and every
# column before it is kept as a feature.
train_data = pd.read_csv(train_file)
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]

# Test data: all columns are kept as features.
test_data = pd.read_csv(test_file)
X_test = test_data.iloc[:, :]


### Count Vectoriser for text features

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import scipy
import scipy.sparse  # `import scipy` alone does not guarantee the `sparse` submodule is loaded

# Folder holding, for each text field, a pickled fitted CountVectorizer plus the
# train/test Bag-of-Words matrices saved as sparse .npz files.
count_vec_folder = './COMP30027_2021_Project2_datasets/recipe_text_features_countvec/'


def _vocabulary_terms(vectorizer):
    """Return the vectorizer's feature names as a list.

    Uses `get_feature_names_out` when the installed scikit-learn provides it and
    falls back to the older `get_feature_names` otherwise, so the cell runs on
    both old and new scikit-learn releases.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def _load_countvec_feature(field):
    """Load the count-vectoriser artefacts for one text field.

    Parameters
    ----------
    field : str
        Field tag used in the file names ('name', 'steps' or 'ingr').

    Returns
    -------
    tuple
        (vectorizer, train sparse matrix, test sparse matrix,
         train DataFrame, test DataFrame); both DataFrames are dense and use
        the vectorizer's feature names as column labels.
    """
    vectorizer_path = count_vec_folder + 'train_' + field + '_countvectorizer.pkl'
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    # The context manager closes the file handle, which a bare open() would leak.
    with open(vectorizer_path, "rb") as handle:
        vectorizer = pickle.load(handle)

    train_matrix = scipy.sparse.load_npz(count_vec_folder + 'train_' + field + '_vec.npz')
    test_matrix = scipy.sparse.load_npz(count_vec_folder + 'test_' + field + '_vec.npz')

    columns = _vocabulary_terms(vectorizer)
    df_train = pd.DataFrame(train_matrix.toarray(), columns=columns)
    df_test = pd.DataFrame(test_matrix.toarray(), columns=columns)
    return vectorizer, train_matrix, test_matrix, df_train, df_test


# TRAIN & TEST DATASET - NAME
(vocab_name, train_name_matrix, test_name_matrix,
 df_train_name, df_test_name) = _load_countvec_feature('name')

# TRAIN & TEST DATASET - STEPS
(vocab_steps, train_steps_matrix, test_steps_matrix,
 df_train_steps, df_test_steps) = _load_countvec_feature('steps')

# TRAIN & TEST DATASET - INGREDIENTS
(vocab_ingr, train_ingr_matrix, test_ingr_matrix,
 df_train_ingr, df_test_ingr) = _load_countvec_feature('ingr')

# TRAIN & TEST DATASET - N_STEPS
# Single-column frames with a fresh 0..n-1 index so they line up row by row
# with the text-feature frames in the concat below.
train_n_steps = X_train[['n_steps']].reset_index(drop=True)
test_n_steps = X_test[['n_steps']].reset_index(drop=True)

# TRAIN & TEST DATASET - N_INGREDIENTS
train_n_ingredients = X_train[['n_ingredients']].reset_index(drop=True)
test_n_ingredients = X_test[['n_ingredients']].reset_index(drop=True)

# ALL FEATURES AND THEIR MATRICES
train = pd.concat([df_train_name, df_train_steps, df_train_ingr, train_n_steps, train_n_ingredients], axis=1)
test = pd.concat([df_test_name, df_test_steps, df_test_ingr, test_n_steps, test_n_ingredients], axis=1)

# A column-wise concat aligns on the index; a row-count mismatch between the
# saved matrices and the CSVs would otherwise show up only as silent NaN rows.
assert train.shape[0] == X_train.shape[0], "train feature rows do not match recipe_train.csv"
assert test.shape[0] == X_test.shape[0], "test feature rows do not match recipe_test.csv"




### Stacking from W8 Prac

In [3]:
#STACKING FROM W8 Prac
from sklearn.metrics import accuracy_score
import numpy as np
np.random.seed(1)

class StackingClassifier():
    """Two-level stacking ensemble.

    The base classifiers are each trained on the input features; their
    `predict_proba` outputs are concatenated column-wise and used as the
    feature matrix of the meta-classifier.

    Parameters
    ----------
    classifiers : list
        Base estimators exposing `fit(X, y)` and `predict_proba(X)`.
    metaclassifier : estimator
        Final estimator exposing `fit(X, y)` and `predict(X)`.
    """

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        """Fit every base classifier on (X, y), then fit the meta-classifier.

        The meta-classifier is trained on the base classifiers' probability
        outputs for the same rows X that the base classifiers were fitted on.

        Returns
        -------
        self
            The fitted stacker, so that `fit` can be chained.
        """
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
        return self

    def _predict_base(self, X):
        """Return the base classifiers' class probabilities for X, stacked side by side."""
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        # One meta-feature row per input row.
        assert yhats.shape[0] == X.shape[0]
        return yhats

    def predict(self, X):
        """Predict labels for X by passing the base probabilities to the meta-classifier."""
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat

    def score(self, X, y):
        """Return the accuracy of `predict(X)` measured against y."""
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


### Individual Classifiers - All features

In [4]:
# Predict using each individual classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

# Three classifiers with default settings, each trained on the full feature matrix.
models = [MultinomialNB(), DecisionTreeClassifier(), LogisticRegression()]
titles = ['MNB', 'Decision Tree', 'Logistic Regression']

for title, model in zip(titles, models):
    print(title)

    # The timed section covers fitting plus predicting on the test set.
    start = time.time()
    print("START ",start)
    result = model.fit(train, y_train).predict(test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")

    # One predicted label per test row, with ids counted from 1, saved to a per-model CSV.
    df_res_full = pd.DataFrame(
        {'duration_label': result},
        index=pd.RangeIndex(1, len(result) + 1, name='id'),
    )
    df_res_full.to_csv('df_CV_res_'+title+'_full.csv')
    

MNB
START  1621514011.905874
END  1621514102.371785
MNB Time: 90.46591091156006  s
Decision Tree
START  1621514102.547482
END  1621514249.014076
Decision Tree Time: 146.46659398078918  s
Logistic Regression
START  1621514249.061354
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
END  1621516298.5528681
Logistic Regression Time: 2049.4915142059326  s


### STACKING - Full Features

In [5]:
# Predict using Stacking 
# Base Classifiers : Multinomial Naive Bayes + Decision Tree
# Meta Classifier : Logistic Regression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime
# Base learners for the stack and their display names.
classifiers = [MultinomialNB(), DecisionTreeClassifier()]
titles = ['MNB', 'Decision Tree']

# Logistic regression combines the base learners' probability outputs.
meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)

# The timed section covers fitting the whole stack and predicting on the test set.
start = time.time()
print("Meta learner: Logistic Regression - Start",start)
stacker_lr.fit(train, y_train)
stacker_lr_res = stacker_lr.predict(test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
t = end - start
print('Time:', t," s")

# One predicted label per test row, with ids counted from 1.
df_res = pd.DataFrame(
    {'duration_label': stacker_lr_res},
    index=pd.RangeIndex(1, len(stacker_lr_res) + 1, name='id'),
)
df_res.to_csv('df_CV_stack1_Log_Reg.csv')


Meta learner: Logistic Regression - Start 1621516762.229557
Meta learner: Logistic Regression - End  1621517043.412077
Time: 281.1825199127197  s


In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# Imported under an alias: this notebook defines its own `StackingClassifier`
# class, and a later cell still calls `StackingClassifier(classifiers, meta_classifier_lr)`
# with that class's positional signature. A plain import would rebind the name
# to scikit-learn's class, whose constructor takes different arguments.
from sklearn.ensemble import StackingClassifier as SklearnStackingClassifier

# scikit-learn's stacker takes (name, estimator) pairs and builds the
# meta-features with 5-fold cross-validation (cv=5).
classifiers = [('MNB', MultinomialNB()), ('DT', DecisionTreeClassifier())]
stack_clf = SklearnStackingClassifier(
    estimators=classifiers,
    final_estimator=LogisticRegression(),
    cv=5,
)
stack_clf.fit(train, y_train)
result = stack_clf.predict(test)

# One predicted label per test row, with ids counted from 1.
df_res = pd.DataFrame(result, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_CV_stack_sklearn_Log_Reg.csv')

### CHI SQUARE, K=1000

In [5]:
# KBEST -chi2
from sklearn.feature_selection import SelectKBest, chi2
import time
from time import ctime

# Time the chi-square feature selection: fit on the training data, then
# reduce both the train and test matrices to the 1000 selected columns.
start = time.time()
print("start ",start)
kbest_chi2 = SelectKBest(score_func=chi2, k=1000)
kbest_chi2.fit(train, y_train)
X_train_kbest_chi2, X_test_kbest_chi2 = (
    kbest_chi2.transform(features) for features in (train, test)
)
end= time.time()
print("end ",end)
print("time: ",end-start)


start  1621528395.0011818
end  1621528436.596156
time:  41.59497404098511


### Individual Classifiers - CHI SQUARE, K=1000

In [7]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# The same three default-configured classifiers, this time trained on the
# chi-square-selected feature matrices.
models = [MultinomialNB(), DecisionTreeClassifier(), LogisticRegression()]
titles = ['MNB', 'Decision Tree', 'Logistic Regression']

for title, model in zip(titles, models):
    print(title)

    # The timed section covers fitting plus predicting on the reduced test set.
    start = time.time()
    print("START ",start)
    result = model.fit(X_train_kbest_chi2, y_train).predict(X_test_kbest_chi2)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")

    # One predicted label per test row, with ids counted from 1, saved to a per-model CSV.
    df_res_full = pd.DataFrame(
        {'duration_label': result},
        index=pd.RangeIndex(1, len(result) + 1, name='id'),
    )
    df_res_full.to_csv('df_CV_res_chi2_'+title+'_full.csv')



MNB
START  1621517404.8140302
END  1621517405.427587
MNB Time: 0.6135568618774414  s
Decision Tree
START  1621517405.4564962
END  1621517415.3387551
Decision Tree Time: 9.882258892059326  s
Logistic Regression
START  1621517415.3659468
END  1621517425.815221
Logistic Regression Time: 10.44927430152893  s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Stacking - CHI SQUARE, K=1000

In [8]:
# Meta Learner : Logistic Regression
# Base Learners : DT + MNB
# Features : the columns kept by the chi-square selector (X_*_kbest_chi2)

classifiers = [MultinomialNB(),
          DecisionTreeClassifier()]

titles = ['MNB',
          'Decision Tree']

meta_classifier_lr = LogisticRegression()

# Build a fresh stacker for the chi2 experiment instead of refitting
# `stacker_lr`, so the full-feature stacker is not overwritten.
# The class is taken from `stacker_lr` rather than from the bare name
# `StackingClassifier`, because that name may have been rebound by a
# `from sklearn.ensemble import StackingClassifier` elsewhere in this notebook,
# and scikit-learn's class does not accept a plain list of estimators.
stacker_lr1 = type(stacker_lr)(classifiers, meta_classifier_lr)

# The timed section covers fitting the whole stack and predicting on the reduced test set.
start = time.time()
print("START meta learner: Logistic Regression ",start)
stacker_lr1.fit(X_train_kbest_chi2, y_train)
stacker_lr_res = stacker_lr1.predict(X_test_kbest_chi2)
end = time.time()
print("END meta learner: Logistic Regression ",end)
t = end - start
print('Time:', t," s")

# One predicted label per test row, with ids counted from 1.
df_res = pd.DataFrame(stacker_lr_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_CV_chi2_stack1_Log_Reg.csv')



START meta learner: Logistic Regression  1621517450.8781261
END meta learner: Logistic Regression  1621517461.244617
Time: 10.366490840911865  s


In [6]:
# Imported under an alias so the notebook's own `StackingClassifier` class
# keeps its name.
from sklearn.ensemble import StackingClassifier as SklearnStackingClassifier

# scikit-learn's stacker takes (name, estimator) pairs and builds the
# meta-features with 5-fold cross-validation (cv=5).
classifiers = [('MNB', MultinomialNB()), ('DT', DecisionTreeClassifier())]
stack_clf = SklearnStackingClassifier(
    estimators=classifiers,
    final_estimator=LogisticRegression(),
    cv=5,
)

# Train and predict on the chi-square-selected matrices: the output file is the
# chi2 variant, so using the full `train`/`test` frames here would just repeat
# the full-feature experiment under a different file name.
stack_clf.fit(X_train_kbest_chi2, y_train)
result = stack_clf.predict(X_test_kbest_chi2)

# One predicted label per test row, with ids counted from 1.
df_res = pd.DataFrame(result, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_CV_chi2_stack_sklearn_Log_Reg.csv')