## COUNT VECTORIZER - Real Train & Test

### Raw Train & Test DataSet

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Load the provided training set (features plus the label in the last column) and the provided test set (all columns used as features); no train/test split is performed in this notebook.

# Locations of the raw recipe CSVs shipped with the project.
train_file = "./COMP30027_2021_Project2_datasets/recipe_train.csv"
test_file = "./COMP30027_2021_Project2_datasets/recipe_test.csv"

# Training set: every column except the last is a feature, the last is the target.
train_data = pd.read_csv(train_file)
label_position = train_data.shape[1] - 1
X_train = train_data.iloc[:, :label_position]
y_train = train_data.iloc[:, label_position]

# Test set: all columns are kept as features.
test_data = pd.read_csv(test_file)
X_test = test_data.iloc[:, 0:]


### Count Vectoriser for text features

In [2]:
import pickle
import scipy
import scipy.sparse  # `import scipy` alone does not guarantee the `sparse` submodule is loaded

# Directory holding the pickled CountVectorizers and the matching .npz matrices.
# NOTE(review): this cell previously used `count_vec_path` without any visible
# definition (NameError on a fresh kernel). The value below is taken from the
# .pkl paths that were hard-coded in this cell - confirm the .npz files live in
# the same folder.
count_vec_path = "./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/"


def _load_vectorizer(pkl_path):
    """Unpickle a fitted vectorizer and close the file handle afterwards.

    SECURITY: `pickle.load` can execute arbitrary code, so only point this at
    trusted files.
    """
    with open(pkl_path, "rb") as handle:
        return pickle.load(handle)


def _vocab_columns(vectorizer):
    """Return the vectorizer's feature names as a list of column labels.

    Prefers `get_feature_names_out()` when the installed scikit-learn provides
    it and falls back to the older `get_feature_names()` otherwise.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


def _bow_frame(matrix, vectorizer):
    """Densify a sparse bag-of-words matrix into a DataFrame, one column per token."""
    return pd.DataFrame(matrix.toarray(), columns=_vocab_columns(vectorizer))


# Build dense bag-of-words frames for each text feature, for both train and test.

# TRAIN & TEST DATASET - NAME
name_file = count_vec_path + "train_name_countvectorizer.pkl"
vocab_name = _load_vectorizer(name_file)
train_name_matrix = scipy.sparse.load_npz(count_vec_path + 'train_name_vec.npz')
test_name_matrix = scipy.sparse.load_npz(count_vec_path + 'test_name_vec.npz')
df_train_name = _bow_frame(train_name_matrix, vocab_name)
df_test_name = _bow_frame(test_name_matrix, vocab_name)

# TRAIN & TEST DATASET - STEPS
steps_file = count_vec_path + "train_steps_countvectorizer.pkl"
vocab_steps = _load_vectorizer(steps_file)
train_steps_matrix = scipy.sparse.load_npz(count_vec_path + 'train_steps_vec.npz')
test_steps_matrix = scipy.sparse.load_npz(count_vec_path + 'test_steps_vec.npz')
df_train_steps = _bow_frame(train_steps_matrix, vocab_steps)
df_test_steps = _bow_frame(test_steps_matrix, vocab_steps)

# TRAIN & TEST DATASET - INGREDIENTS
ingr_file = count_vec_path + "train_ingr_countvectorizer.pkl"
vocab_ingr = _load_vectorizer(ingr_file)
train_ingr_matrix = scipy.sparse.load_npz(count_vec_path + 'train_ingr_vec.npz')
test_ingr_matrix = scipy.sparse.load_npz(count_vec_path + 'test_ingr_vec.npz')
df_train_ingr = _bow_frame(train_ingr_matrix, vocab_ingr)
df_test_ingr = _bow_frame(test_ingr_matrix, vocab_ingr)

# TRAIN & TEST DATASET - N_STEPS
# Indices are reset so the rows line up positionally with the bag-of-words
# frames in the concat below.
train_n_steps = X_train[["n_steps"]].reset_index(drop=True)
test_n_steps = X_test[["n_steps"]].reset_index(drop=True)

# TRAIN & TEST DATASET - N_INGREDIENTS
train_n_ingredients = X_train[["n_ingredients"]].reset_index(drop=True)
test_n_ingredients = X_test[["n_ingredients"]].reset_index(drop=True)

# STILL USING ALL FEATURES AND THEIR MATRICES
train = pd.concat([df_train_name, df_train_steps, df_train_ingr, train_n_steps, train_n_ingredients], axis=1)
test = pd.concat([df_test_name, df_test_steps, df_test_ingr, test_n_steps, test_n_ingredients], axis=1)

# Sanity checks: a row-count mismatch between the pieces would make concat pad
# with NaNs silently, and train/test must expose the same number of columns.
assert train.shape[0] == len(y_train), "feature rows do not match the number of training labels"
assert test.shape[0] == len(X_test), "feature rows do not match the number of test rows"
assert train.shape[1] == test.shape[1], "train and test feature matrices differ in width"




In [3]:
#STACKING FROM W8 Prac
from sklearn.metrics import accuracy_score
import numpy as np
np.random.seed(1)

class StackingClassifier():
    """Two-level stacking ensemble.

    Every base classifier is fitted on the training data. Their
    `predict_proba` outputs are concatenated column-wise and that matrix is
    used as the input of the meta-classifier.

    Note: the meta-classifier is trained on base-classifier predictions for
    the very same samples the base classifiers were fitted on (no hold-out or
    cross-validation is used).

    Parameters
    ----------
    classifiers : list
        Base estimators; each must provide `fit(X, y)` and `predict_proba(X)`.
    metaclassifier : estimator
        Estimator providing `fit(X, y)` and `predict(X)`.
    """

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        """Fit all base classifiers, then the meta-classifier on their outputs.

        Returns
        -------
        self : StackingClassifier
            Returned so that calls can be chained, as scikit-learn estimators do.
        """
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
        return self

    def _predict_base(self, X):
        """Return the column-wise concatenation of every base `predict_proba(X)`.

        Raises
        ------
        ValueError
            If no base classifiers were supplied.
        """
        if not self.classifiers:
            raise ValueError("StackingClassifier needs at least one base classifier")
        yhats = [clf.predict_proba(X) for clf in self.classifiers]
        yhats = np.concatenate(yhats, axis=1)
        # One row of meta-features per input sample.
        assert yhats.shape[0] == X.shape[0]
        return yhats

    def predict(self, X):
        """Predict labels for X with the meta-classifier on top of the base outputs."""
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat

    def score(self, X, y):
        """Return the accuracy of `predict(X)` against the true labels `y`."""
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


### Individual Classifiers - Full features

In [None]:
# Predict using each individual classifiers
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

# Baseline run: each estimator is trained on the full feature matrix on its own
# and its test-set predictions are written to a separate CSV.
titles = ['GNB', 'MNB', 'LinearSVC', 'Decision Tree', 'Logistic Regression']
models = [
    GaussianNB(),
    MultinomialNB(),
    svm.LinearSVC(),
    DecisionTreeClassifier(),
    LogisticRegression(),
]

for position, title in enumerate(titles):
    model = models[position]
    print(title)

    # Time fitting and prediction together.
    start = time.time()
    print("START ", start)
    model.fit(train, y_train)
    result = model.predict(test)
    end = time.time()
    print("END ", end)
    t = end - start
    print(title, 'Time:', t, " s")

    # Output layout: 1-based `id` index followed by the predicted label.
    df_res_full = pd.DataFrame(result, columns=['duration_label'])
    df_res_full.index = pd.RangeIndex(1, len(df_res_full) + 1, name='id')
    df_res_full.to_csv('df_CV_res_' + title + '_full.csv')
    

### STACKING

In [None]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [None]:
# Stacked ensemble, variant 1 (full feature set)
#   base learners : Multinomial Naive Bayes, Decision Tree
#   meta learner  : Logistic Regression

titles = ['MNB', 'Decision Tree']
classifiers = [MultinomialNB(), DecisionTreeClassifier()]

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers=classifiers, metaclassifier=meta_classifier_lr)

# Time fitting and prediction together.
start = time.time()
print("Meta learner: Logistic Regression - Start", start)
stacker_lr.fit(train, y_train)
stacker_lr_res = stacker_lr.predict(test)
end = time.time()
print("Meta learner: Logistic Regression - End ", end)
t = end - start
print('Time:', t, " s")

# Output layout: 1-based `id` index followed by the predicted label.
df_res = pd.DataFrame(stacker_lr_res, columns=['duration_label'])
df_res.index = pd.RangeIndex(1, len(df_res) + 1, name='id')
df_res.to_csv('df_CV_stack1_Log_Reg.csv')



In [7]:
# Stacked ensemble, variant 2 (full feature set)
#   base learners : Multinomial Naive Bayes, Decision Tree, Logistic Regression
#   meta learner  : Logistic Regression

titles = ['MNB', 'Decision Tree', 'Logistic Regression']
classifiers = [MultinomialNB(), DecisionTreeClassifier(), LogisticRegression()]

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers=classifiers, metaclassifier=meta_classifier_lr)

# Only the start/end timestamps are logged for this run.
start = time.time()
print("Meta learner: Logistic Regression - Start ", start)
stacker_lr.fit(train, y_train)
stacker_lr_res = stacker_lr.predict(test)
end = time.time()
print("Meta learner: Logistic Regression - End ", end)

# Output layout: 1-based `id` index followed by the predicted label.
df_res = pd.DataFrame(stacker_lr_res, columns=['duration_label'])
df_res.index = pd.RangeIndex(1, len(df_res) + 1, name='id')
df_res.to_csv('df_CV_stack2_Log_Reg.csv')


Meta learner: Logistic Regression - Start  1621480133.683062


### CHI SQUARE , K=1000

In [8]:
# KBEST -chi2
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
import time
from time import ctime

start = time.time()
print("start ", start)

# Fit the chi-squared selector once on the training matrix, then reduce both
# the training and the test matrix to the 1000 selected columns.
kbest_chi2 = SelectKBest(score_func=chi2, k=1000)
kbest_chi2.fit(train, y_train)
X_train_kbest_chi2, X_test_kbest_chi2 = (
    kbest_chi2.transform(split) for split in (train, test)
)

end = time.time()
print("end ", end)
print("time: ", end - start)


start  1621483534.493375
end  1621483583.506072
time:  49.012696981430054


### Individual Classifiers

In [11]:

# Same per-model baseline as on the full features, this time on the
# chi-squared-selected feature matrices.
titles = ['MNB', 'LinearSVC', 'Decision Tree', 'Logistic Regression']
models = [
    MultinomialNB(),
    svm.LinearSVC(),
    DecisionTreeClassifier(),
    LogisticRegression(),
]

for position, title in enumerate(titles):
    model = models[position]
    print(title)

    # Time fitting and prediction together.
    start = time.time()
    print("START ", start)
    model.fit(X_train_kbest_chi2, y_train)
    result = model.predict(X_test_kbest_chi2)
    end = time.time()
    print("END ", end)
    t = end - start
    print(title, 'Time:', t, " s")

    # Output layout: 1-based `id` index followed by the predicted label.
    df_res_full = pd.DataFrame(result, columns=['duration_label'])
    df_res_full.index = pd.RangeIndex(1, len(df_res_full) + 1, name='id')
    df_res_full.to_csv('df_CV_res_chi2_' + title + '_full.csv')



MNB
START  1621402253.2543108
END  1621402253.6907241
MNB Time: 0.4364132881164551  s
LinearSVC
START  1621402253.770417




END  1621402280.263106
LinearSVC Time: 26.49268913269043  s
Decision Tree
START  1621402280.296611
END  1621402289.7775822
Decision Tree Time: 9.480971097946167  s
Logistic Regression
START  1621402289.8047051
END  1621402299.612122
Logistic Regression Time: 9.807416915893555  s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Stacking - Chi^2

In [10]:
# Stacked ensemble on the chi-squared-selected features, variant 1
# Meta Learner : Logistic Regression
# Base Learners : DT + MNB

classifiers = [MultinomialNB(),
               DecisionTreeClassifier()]

titles = ['MNB',
          'Decision Tree']

meta_classifier_lr = LogisticRegression()
stacker_lr1 = StackingClassifier(classifiers, meta_classifier_lr)

start = time.time()
print("START meta learner: Logistic Regression ", start)
# Fit and predict with the ensemble built in this cell. Previously the stale
# `stacker_lr` left over from another cell was used here, so the saved
# predictions did not come from the MNB + Decision Tree stack described above.
stacker_lr1.fit(X_train_kbest_chi2, y_train)
stacker_lr_res = stacker_lr1.predict(X_test_kbest_chi2)
end = time.time()
print("END meta learner: Logistic Regression ", end)
t = end - start
print('Time:', t, " s")

# Output layout: 1-based `id` index followed by the predicted label.
df_res = pd.DataFrame(stacker_lr_res, columns=['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name = 'id'
df_res.to_csv('df_CV_chi2_stack1_Log_Reg.csv')



START meta learner: Logistic Regression  1621483605.00847
END meta learner: Logistic Regression  1621483616.383765
Time: 11.375294923782349  s


In [12]:
# Stacked ensemble on the chi-squared-selected features, variant 2
#   base learners : Multinomial Naive Bayes, Decision Tree, Logistic Regression
#   meta learner  : Logistic Regression

titles = ['MNB', 'Decision Tree', 'Logistic Regression']
classifiers = [MultinomialNB(), DecisionTreeClassifier(), LogisticRegression()]

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers=classifiers, metaclassifier=meta_classifier_lr)

# Only the start/end timestamps are logged for this run.
start = time.time()
print("Meta learner: Logistic Regression - Start ", start)
stacker_lr.fit(X_train_kbest_chi2, y_train)
stacker_lr_res = stacker_lr.predict(X_test_kbest_chi2)
end = time.time()
print("Meta learner: Logistic Regression - End ", end)

# Output layout: 1-based `id` index followed by the predicted label.
df_res = pd.DataFrame(stacker_lr_res, columns=['duration_label'])
df_res.index = pd.RangeIndex(1, len(df_res) + 1, name='id')
df_res.to_csv('df_CV_chi2_stack2_Log_Reg.csv')


Meta learner: Logistic Regression - Start  1621484149.660085
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Meta learner: Logistic Regression - End  1621484174.799222
