### Count Vectoriser - Train Test

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Test to see approximate accuracy by splitting the training data to train and test data = 7:3
train_file = "./COMP30027_2021_Project2_datasets/recipe_train.csv"
train_data = pd.read_csv(train_file)
X_train_raw = train_data.iloc[:,:-1]
y_train_raw = train_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X_train_raw,y_train_raw,test_size=0.33)

train_n_steps = pd.DataFrame(X_train.n_steps)
train_n_steps.reset_index(drop=True, inplace=True)
test_n_steps = pd.DataFrame(X_test.n_steps)
test_n_steps.reset_index(drop=True, inplace=True)

train_n_ingredients = pd.DataFrame(X_train.n_ingredients)
train_n_ingredients.reset_index(drop=True, inplace=True)
test_n_ingredients = pd.DataFrame(X_test.n_ingredients)
test_n_ingredients.reset_index(drop=True, inplace=True)

# count vectorizer on feature 'name'
vec = CountVectorizer(stop_words='english')
X_train_name = vec.fit_transform(X_train.name)
X_test_name = vec.transform(X_test.name)
df_train_name = pd.DataFrame(X_train_name.todense(),columns = vec.get_feature_names())
df_test_name = pd.DataFrame(X_test_name.todense(),columns = vec.get_feature_names())

# count vectorizer on feature 'steps'
X_train_steps = vec.fit_transform(X_train.steps)
X_test_steps = vec.transform(X_test.steps)
df_train_steps = pd.DataFrame(X_train_steps.todense(),columns = vec.get_feature_names())
df_test_steps = pd.DataFrame(X_test_steps.todense(),columns = vec.get_feature_names())


# count vectorizer on feature 'ingredients'
X_train_ing = vec.fit_transform(X_train.ingredients)
X_test_ing = vec.transform(X_test.ingredients)
df_train_ing = pd.DataFrame(X_train_ing.todense(),columns = vec.get_feature_names())
df_test_ing = pd.DataFrame(X_test_ing.todense(),columns = vec.get_feature_names())

# put all into one dataframe
train = pd.concat([df_train_name, df_train_steps,df_train_ing,train_n_steps,train_n_ingredients],axis=1)
test = pd.concat([df_test_name, df_test_steps,df_test_ing,test_n_steps,test_n_ingredients],axis=1)


(26800, 26272)


In [21]:
#STACKING FROM W8 Prac
from sklearn.metrics import accuracy_score
import numpy as np
np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)

#### Individual Classifiers -  Full Features

In [3]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]

titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(train,y_train)
    acc = model.score(test,y_test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title, "Accuracy:",acc, 'Time:', t," s")


MNB
START  1621410722.59531
END  1621410786.0641549
MNB Accuracy: 0.7396212121212121 Time: 63.46884489059448  s
LinearSVC
START  1621410786.168375




END  1621410862.7662442
LinearSVC Accuracy: 0.7507575757575757 Time: 76.59786915779114  s
Decision Tree
START  1621410862.772287
END  1621410934.6870122
Decision Tree Accuracy: 0.7366666666666667 Time: 71.9147253036499  s
Logistic Regression
START  1621410934.6885839


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


END  1621411650.031492
Logistic Regression Accuracy: 0.7907575757575758 Time: 715.3429081439972  s


#### Stacking - Full Features

In [5]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

classifiers = [MultinomialNB(),
          DecisionTreeClassifier()]

titles = ['MNB',
          'Decision Tree']

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr.fit(train, y_train)
acc = stacker_lr.score(test,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")

Meta learner: Logistic Regression - Start  1621411833.8975801
Meta learner: Logistic Regression - End  1621411962.687659
Accuracy  0.7346969696969697
Time: 128.7900788784027  s


In [6]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB + LogReg

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

classifiers = [MultinomialNB(),
                DecisionTreeClassifier(),
                LogisticRegression()]

titles = ['MNB',
          'Decision Tree',
          'Logistic Regression']

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr.fit(train, y_train)
acc = stacker_lr.score(test,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")

Meta learner: Logistic Regression - Start  1621412250.669444


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Meta learner: Logistic Regression - End  1621413118.7024372
Accuracy  0.738030303030303
Time: 868.0329930782318  s


### CHI SQUARE - K Best , k=1000, chi^2

In [9]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
import time
from time import ctime

kbest_chi2 = SelectKBest(chi2, k=1000).fit(train, y_train)
X_train_kbest_chi2 = kbest_chi2.transform(train)
X_test_kbest_chi2 = kbest_chi2.transform(test)


### Individual Classifiers - CHI SQUARE

In [11]:

models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]

titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(X_train_kbest_chi2,y_train)
    acc = model.score(X_test_kbest_chi2,y_test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title, "Accuracy:",acc, 'Time:', t," s")


MNB
START  1621415143.89592
END  1621415144.485644
MNB Accuracy: 0.706060606060606 Time: 0.589724063873291  s
LinearSVC
START  1621415144.4858701




END  1621415160.1332102
LinearSVC Accuracy: 0.8024242424242424 Time: 15.647340059280396  s
Decision Tree
START  1621415160.1334689
END  1621415165.569706
Decision Tree Accuracy: 0.7315909090909091 Time: 5.436237096786499  s
Logistic Regression
START  1621415165.569865
END  1621415173.124801
Logistic Regression Accuracy: 0.8043181818181818 Time: 7.554935932159424  s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Stacking - CHI SQUARE

In [13]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB

classifiers = [MultinomialNB(),
          DecisionTreeClassifier()]

titles = ['MNB',
          'Decision Tree']

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr.fit(X_train_kbest_chi2, y_train)
acc = stacker_lr.score(X_test_kbest_chi2,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")

Meta learner: Logistic Regression - Start  1621415265.917923
Meta learner: Logistic Regression - End  1621415271.678397
Accuracy  0.7287121212121213
Time: 5.760473966598511  s


In [15]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB + LogReg

classifiers1 = [MultinomialNB(),
                DecisionTreeClassifier(),
                LogisticRegression()]

titles1 = ['MNB',
          'Decision Tree',
          'Logistic Regression']

meta_classifier_lr1 = LogisticRegression()
stacker_lr1 = StackingClassifier(classifiers1, meta_classifier_lr1)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr1.fit(X_train_kbest_chi2, y_train)
acc = stacker_lr1.score(X_test_kbest_chi2,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")

Meta learner: Logistic Regression - Start  1621415443.963565


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Meta learner: Logistic Regression - End  1621415456.7150111
Accuracy  0.7253030303030303
Time: 12.751446008682251  s
