## COUNT VECTORIZER - Real Train & Test

### Raw Train & Test DataSet

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Test to see approximate accuracy by splitting the training data to train and test data = 7:3

base_path = "./COMP30027_2021_Project2_datasets/"

train_data = pd.read_csv(base_path+"recipe_train.csv")
X_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]

test_data = pd.read_csv(base_path+"recipe_test.csv")
X_test = test_data.iloc[:,:]


### Count Vectoriser for text features

In [4]:
import pickle
import scipy 

count_vec_path = base_path+"recipe_text_features_countvec/"

# Get the sparse matrix of the Bag-of-Word representation of text features for training data

# TRAIN & TEST DATASET - NAME
name_file = count_vec_path + "train_name_countvectorizer.pkl"
vocab_name = pickle.load(open(name_file, "rb"))
train_name_matrix = scipy.sparse.load_npz(count_vec_path +'train_name_vec.npz')
test_name_matrix = scipy.sparse.load_npz(count_vec_path +'test_name_vec.npz')
df_train_name = pd.DataFrame(train_name_matrix.todense(),columns = vocab_name.get_feature_names())
df_test_name = pd.DataFrame(test_name_matrix.todense(),columns = vocab_name.get_feature_names())

# TRAIN & TEST DATASET - STEPS
steps_file = count_vec_path +"train_steps_countvectorizer.pkl"
vocab_steps = pickle.load(open(steps_file, "rb"))
train_steps_matrix = scipy.sparse.load_npz(count_vec_path +'train_steps_vec.npz')
test_steps_matrix = scipy.sparse.load_npz(count_vec_path +'test_steps_vec.npz')
df_train_steps = pd.DataFrame(train_steps_matrix.todense(),columns = vocab_steps.get_feature_names())
df_test_steps = pd.DataFrame(test_steps_matrix.todense(),columns = vocab_steps.get_feature_names())

# TRAIN & TEST DATASET- INGREDIENTS
ingr_file = count_vec_path + "train_ingr_countvectorizer.pkl"
vocab_ingr = pickle.load(open(ingr_file, "rb"))
train_ingr_matrix = scipy.sparse.load_npz(count_vec_path +'train_ingr_vec.npz')
test_ingr_matrix = scipy.sparse.load_npz(count_vec_path +'test_ingr_vec.npz')
df_train_ingr = pd.DataFrame(train_ingr_matrix.todense(),columns = vocab_ingr.get_feature_names())
df_test_ingr = pd.DataFrame(test_ingr_matrix.todense(),columns = vocab_ingr.get_feature_names())

# TRAIN & TEST DATASET- N_STEPS
train_n_steps = pd.DataFrame(X_train.n_steps)
train_n_steps.reset_index(drop=True, inplace=True)
test_n_steps = pd.DataFrame(X_test.n_steps)
test_n_steps.reset_index(drop=True, inplace=True)

# TRAIN & TEST DATASET- N_INGREDIENTS
train_n_ingredients = pd.DataFrame(X_train.n_ingredients)
train_n_ingredients.reset_index(drop=True, inplace=True)
test_n_ingredients = pd.DataFrame(X_test.n_ingredients)
test_n_ingredients.reset_index(drop=True, inplace=True)

# STILL USING ALL FEATURES AND THEIR MATRICES
train = pd.concat([df_train_name,df_train_steps,df_train_ingr,train_n_steps,train_n_ingredients],axis=1)
test = pd.concat([df_test_name,df_test_steps,df_test_ingr,test_n_steps,test_n_ingredients],axis=1)




In [None]:
#STACKING FROM W8 Prac
from sklearn.metrics import accuracy_score
import numpy as np
np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


### Individual Classifiers

In [None]:
# Predict using each individual classifiers
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

models = [GaussianNB(),
          MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]
titles = ['GNB',
          'MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(X_train,y_train)
    result = model.predict(X_test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_CV_res_'+title+'_full.csv')
    

In [None]:
# Predict using Stacking 
# Base Classifiers : Multinomial Naive Bayes + Decision Tree
# Meta Classifier : Logistic Regression
classifiers1 = [MultinomialNB(),
                DecisionTreeClassifier()]

titles1 = ['MNB',
           'Decision Tree']

meta_classifier_lr1 = LogisticRegression()
stacker_lr1 = StackingClassifier(classifiers1, meta_classifier_lr1)

start = time.time()
print("Meta learner: Logistic Regression - Start",start)
stacker_lr1.fit(X_train, y_train)
stacker_lr1_res = stacker_lr1.predict(X_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
t = end - start
print('Time:', t," s")
df_res = pd.DataFrame(stacker_lr1_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_CV_stack1_Log_Reg.csv')



In [10]:
# KBEST -chi2

from sklearn.feature_selection import SelectKBest, chi2
import time
from time import ctime
start= time.time()
print("start ",start)
kbest_chi2 = SelectKBest(chi2, k=1000).fit(train, y_train)
X_train_kbest_chi2 = kbest_chi2.transform(train)
X_test_kbest_chi2 = kbest_chi2.transform(test)
# print(X_train_kbest_chi2.shape)
# print(X_test_kbest_chi2.shape)
end= time.time()
print("end ",end)
print("time: ",end-start)


start  1621402145.123966
end  1621402217.3950982
time:  72.27113223075867


In [11]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime
models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]
titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(X_train_kbest_chi2,y_train)
    result = model.predict(X_test_kbest_chi2)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_CV_res_chi2_'+title+'_full.csv')



MNB
START  1621402253.2543108
END  1621402253.6907241
MNB Time: 0.4364132881164551  s
LinearSVC
START  1621402253.770417




END  1621402280.263106
LinearSVC Time: 26.49268913269043  s
Decision Tree
START  1621402280.296611
END  1621402289.7775822
Decision Tree Time: 9.480971097946167  s
Logistic Regression
START  1621402289.8047051
END  1621402299.612122
Logistic Regression Time: 9.807416915893555  s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

In [13]:
print("#1 STACKING META LEARNER - COUNT VEC: LOGISTIC REG , BASE: MNB, DT ")
classifiers1 = [MultinomialNB(),
          DecisionTreeClassifier()]
titles1 = ['MNB',
          'Decision Tree']

meta_classifier_lr1 = LogisticRegression()
stacker_lr1 = StackingClassifier(classifiers1, meta_classifier_lr1)

print("Base Learners: MNB & DT")
for title, clf in zip(titles1, classifiers1):
    print(title)
    start = time.time()
    print("START ",start)
    clf.fit(X_train_kbest_chi2,y_train)
    result = clf.predict(X_test_kbest_chi2)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_CV_chi2_base_1_'+title+'.csv')

start = time.time()
print("START meta learner: Logistic Regression ",start)
stacker_lr1.fit(train, y_train)
stacker_lr1_res= stacker_lr1.predict(test)
end = time.time()
print("END meta learner: Logistic Regression ",end)
t = end - start
print('Time:', t," s")
df_res = pd.DataFrame(stacker_lr1_res, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_CV_chi2_stack1_Log_Reg.csv')



#1 STACKING META LEARNER - COUNT VEC: LOGISTIC REG , BASE: MNB, DT 
Base Learners: MNB & DT
MNB
START  1621402361.190933
END  1621402361.4258718
MNB Time: 0.2349388599395752  s
Decision Tree
START  1621402361.454274
END  1621402371.034938
Decision Tree Time: 9.580664157867432  s
START meta learner: Logistic Regression  1621402371.061424
END meta learner: Logistic Regression  1621402588.660233
Time: 217.59880900382996  s


# KBEST - F_classifier

In [4]:
#MUTUAL INFORMATION
# from sklearn.feature_selection import SelectKBest, mutual_info_classif
# from matplotlib import pyplot

# mi = SelectKBest(score_func=mutual_info_classif, k=1000)
# print("FITTING")
# X_train_mi = mi.fit_transform(df_train_name,y_train)
    
# for feat_num in mi.get_support(indices=True):
#     print(vocab_name.get_feature_names()[feat_num])

from sklearn.feature_selection import SelectKBest, chi2,f_classif
import time
from time import ctime
start= time.time()
print("start ",start)
X_new = SelectKBest(f_classif, k=1000).fit_transform(train, y_train)
X_new.shape
end= time.time()
print("end ",end)
print("time: ",end-start)


start  1621398619.578366
end  1621398789.440448
time:  169.86208200454712


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Test to see approximate accuracy by splitting the training data to train and test data = 7:3

data = pd.read_csv("recipe_train.csv")
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X,y,test_size=0.33)

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Test to see approximate accuracy by splitting the training data to train and test data = 7:3

data = pd.read_csv("recipe_train.csv")
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X,y,test_size=0.33)

train_n_steps = pd.DataFrame(X_train_raw.n_steps)
train_n_steps.reset_index(drop=True, inplace=True)
test_n_steps = pd.DataFrame(X_test_raw.n_steps)
test_n_steps.reset_index(drop=True, inplace=True)

train_n_ingredients = pd.DataFrame(X_train_raw.n_ingredients)
train_n_ingredients.reset_index(drop=True, inplace=True)
test_n_ingredients = pd.DataFrame(X_test_raw.n_ingredients)
test_n_ingredients.reset_index(drop=True, inplace=True)

# count vectorizer on feature 'name'
vec = CountVectorizer(stop_words='english')
X_train_name = vec.fit_transform(X_train_raw.name)
X_test_name = vec.transform(X_test_raw.name)
df_train_name = pd.DataFrame(X_train_name.todense(),columns = vec.get_feature_names())
df_test_name = pd.DataFrame(X_test_name.todense(),columns = vec.get_feature_names())

# count vectorizer on feature 'steps'
X_train_steps = vec.fit_transform(X_train_raw.steps)
X_test_steps = vec.transform(X_test_raw.steps)
df_train_steps = pd.DataFrame(X_train_steps.todense(),columns = vec.get_feature_names())
df_test_steps = pd.DataFrame(X_test_steps.todense(),columns = vec.get_feature_names())


# count vectorizer on feature 'ingredients'
X_train_ing = vec.fit_transform(X_train_raw.ingredients)
X_test_ing = vec.transform(X_test_raw.ingredients)
df_train_ing = pd.DataFrame(X_train_ing.todense(),columns = vec.get_feature_names())
df_test_ing = pd.DataFrame(X_test_ing.todense(),columns = vec.get_feature_names())
print(X_train_ing.shape, X_test_ing.shape)

# put all into one dataframe
train = pd.concat([df_train_name, df_train_steps,df_train_ing,train_n_steps,train_n_ingredients],axis=1)
test = pd.concat([df_test_name, df_test_steps,df_test_ing,test_n_steps,test_n_ingredients],axis=1)
print(train.shape)

(26800, 2686) (13200, 2686)
(26800, 26361)


In [17]:
#KBEST - f_classif

from sklearn.feature_selection import SelectKBest, chi2,f_classif
import time
from time import ctime
start= time.time()
print("start ",start)
kbest_f_classif = SelectKBest(f_classif, k=5000).fit(train, y_train_raw)
X_train_kbest_fclassif = kbest_f_classif.transform(train)
X_test_kbest_fclassif = kbest_f_classif.transform(test)
print(X_train_kbest_fclassif.shape)
print(X_test_kbest_fclassif.shape)
end= time.time()
print("end ",end)
print("time: ",end-start)


start  1621403008.1928792
(26800, 5000)
(13200, 5000)
end  1621403090.18732
time:  81.99444079399109


In [18]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

models = [GaussianNB(),
          MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]
titles = ['GNB',
          'MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(X_train_kbest_fclassif,y_train_raw)
    acc = model.score(X_test_kbest_fclassif,y_test_raw)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title, "Accuracy:",acc, 'Time:', t," s")

GNB
START  1621403093.73764
END  1621403100.624009
GNB Accuracy: 0.5983333333333334 Time: 6.886368989944458  s
MNB
START  1621403100.624608
END  1621403102.08365
MNB Accuracy: 0.7119696969696969 Time: 1.4590420722961426  s
LinearSVC
START  1621403102.083935




END  1621403119.116918
LinearSVC Accuracy: 0.7775757575757576 Time: 17.03298306465149  s
Decision Tree
START  1621403119.117365
END  1621403136.802617
Decision Tree Accuracy: 0.7318939393939394 Time: 17.68525218963623  s
Logistic Regression
START  1621403136.8031108


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


END  1621403170.9354
Logistic Regression Accuracy: 0.7925757575757576 Time: 34.13228917121887  s


In [19]:
# KBEST -chi2

from sklearn.feature_selection import SelectKBest, chi2
import time
from time import ctime
start= time.time()
print("start ",start)
kbest_chi2 = SelectKBest(chi2, k=5000).fit(train, y_train_raw)
X_train_kbest_chi2 = kbest_chi2.transform(train)
X_test_kbest_chi2 = kbest_chi2.transform(test)
print(X_train_kbest_chi2.shape)
print(X_test_kbest_chi2.shape)
end= time.time()
print("end ",end)
print("time: ",end-start)


start  1621403257.132013
(26800, 5000)
(13200, 5000)
end  1621403275.383455
time:  18.251441955566406


In [20]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]
titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(X_train_kbest_chi2,y_train_raw)
    acc = model.score(X_test_kbest_chi2,y_test_raw)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title, "Accuracy:",acc, 'Time:', t," s")

MNB
START  1621403285.7115688
END  1621403288.357465
MNB Accuracy: 0.7119696969696969 Time: 2.6458961963653564  s
LinearSVC
START  1621403288.363976




END  1621403304.349289
LinearSVC Accuracy: 0.7742424242424243 Time: 15.985312938690186  s
Decision Tree
START  1621403304.3556879
END  1621403321.744792
Decision Tree Accuracy: 0.7340909090909091 Time: 17.38910412788391  s
Logistic Regression
START  1621403321.744971


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


END  1621403354.9733171
Logistic Regression Accuracy: 0.7936363636363636 Time: 33.22834610939026  s
