In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Test to see approximate accuracy by splitting the training data to train and test data = 7:3
train_file = "./COMP30027_2021_Project2_datasets/recipe_train.csv"
train_data = pd.read_csv(train_file)
X_train_raw = train_data.iloc[:,:-1]
y_train_raw = train_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X_train_raw,y_train_raw,test_size=0.33)

train_n_steps = pd.DataFrame(X_train.n_steps)
train_n_steps.reset_index(drop=True, inplace=True)
test_n_steps = pd.DataFrame(X_test.n_steps)
test_n_steps.reset_index(drop=True, inplace=True)

train_n_ingredients = pd.DataFrame(X_train.n_ingredients)
train_n_ingredients.reset_index(drop=True, inplace=True)
test_n_ingredients = pd.DataFrame(X_test.n_ingredients)
test_n_ingredients.reset_index(drop=True, inplace=True)

# tfidf vectorizer on feature 'name'
vec = TfidfVectorizer(stop_words='english')
X_train_name = vec.fit_transform(X_train.name)
X_test_name = vec.transform(X_test.name)
df_train_name = pd.DataFrame(X_train_name.todense(),columns = vec.get_feature_names())
df_test_name = pd.DataFrame(X_test_name.todense(),columns = vec.get_feature_names())

# tfidf vectorizer on feature 'steps'
X_train_steps = vec.fit_transform(X_train.steps)
X_test_steps = vec.transform(X_test.steps)
df_train_steps = pd.DataFrame(X_train_steps.todense(),columns = vec.get_feature_names())
df_test_steps = pd.DataFrame(X_test_steps.todense(),columns = vec.get_feature_names())

# tfidf vectorizer on feature 'ingredients'
X_train_ing = vec.fit_transform(X_train.ingredients)
X_test_ing = vec.transform(X_test.ingredients)
df_train_ing = pd.DataFrame(X_train_ing.todense(),columns = vec.get_feature_names())
df_test_ing = pd.DataFrame(X_test_ing.todense(),columns = vec.get_feature_names())

train = pd.concat([df_train_name, df_train_steps,df_train_ing,train_n_steps,train_n_ingredients],axis=1)
test = pd.concat([df_test_name, df_test_steps,df_test_ing,test_n_steps,test_n_ingredients],axis=1)


(26800, 8673)
  (0, 1197)	0.47032132889335654
  (0, 7475)	0.8824952394137813
  (1, 8241)	0.595015923164173
  (1, 1169)	0.4825289171029887
  (1, 1527)	0.49405807350593456
  (1, 3374)	0.41113077644966994
  (2, 5666)	0.4616437996180066
  (2, 6570)	0.734230661685901
  (2, 7223)	0.49778543340934917
  (3, 5717)	0.5020235947000298
  (3, 3053)	0.3339094003460844
  (3, 4008)	0.5575995033484329
  (3, 72)	0.5705783176659973
  (4, 1537)	0.5103312951184149
  (4, 6208)	0.6817027998159024
  (4, 4794)	0.5242549589140011
  (5, 6271)	0.4778762230335413
  (5, 1340)	0.5243123721404698
  (5, 1906)	0.7047913534370512
  (6, 1021)	0.4899819855758095
  (6, 1942)	0.871732558650408
  (7, 7353)	0.43345233355962043
  (7, 8017)	0.7355919530208637
  (7, 7422)	0.5205992251076361
  (8, 670)	0.392186645379059
  :	:
  (26794, 7571)	0.5677399820296545
  (26794, 661)	0.43868471740891885
  (26794, 4130)	0.5789916031942052
  (26794, 3658)	0.3872928542426632
  (26795, 8628)	0.5039621600993294
  (26795, 8198)	0.51676254958584

In [1]:
#STACKING FROM W8 Prac
from sklearn.metrics import accuracy_score
import numpy as np
np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)

### Individual Classifiers - FULL FEATURES

In [11]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression(max_iter=1000)]

titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(train,y_train)
    acc = model.score(test,y_test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title, "Accuracy:",acc, 'Time:', t," s")


MNB
START  1621478755.5711548
END  1621478756.01682
MNB Accuracy: 0.718939393939394 Time: 0.4456651210784912  s
LinearSVC
START  1621478756.01766
END  1621478771.5146492
LinearSVC Accuracy: 0.7908333333333334 Time: 15.496989250183105  s
Decision Tree
START  1621478771.5150971
END  1621478777.147224
Decision Tree Accuracy: 0.7165151515151515 Time: 5.632126808166504  s
Logistic Regression
START  1621478777.147409
END  1621478882.139201
Logistic Regression Accuracy: 0.7925757575757576 Time: 104.99179196357727  s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Stacking - FULL FEATURES

In [15]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

classifiers = [MultinomialNB(),
          DecisionTreeClassifier()]

titles = ['MNB',
          'Decision Tree']

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr.fit(train,y_train)
acc = stacker_lr.score(test,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")

Meta learner: Logistic Regression - Start  1621479573.625083
Meta learner: Logistic Regression - End  1621479579.623137
Accuracy  0.7187878787878788
Time: 5.998054027557373  s


In [16]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB + LogReg
classifiers1 = [MultinomialNB(),
                DecisionTreeClassifier(),
                LogisticRegression()]

titles1 = ['MNB',
          'Decision Tree',
          'Logistic Regression']

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr.fit(train, y_train)
acc = stacker_lr.score(test,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")

Meta learner: Logistic Regression - Start  1621479604.098793
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Meta learner: Logistic Regression - End  1621479620.193346
Accuracy  0.7221969696969697
Time: 16.094552993774414  s


### CHI SQUARE

In [9]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
import time
from time import ctime

kbest_chi2 = SelectKBest(chi2, k=1000).fit(train, y_train)
X_train_kbest_chi2 = kbest_chi2.transform(train)
X_test_kbest_chi2 = kbest_chi2.transform(test)

### Individual Classifiers - CHI SQUARE

In [9]:

models = [MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          LogisticRegression()]
          
titles = ['MNB',
          'LinearSVC',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(X_train_kbest_chi2,y_train)
    acc = model.score(X_test_kbest_chi2,y_test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title, "Accuracy:",acc, 'Time:', t," s")

MNB
START  1621475063.5190709
END  1621475092.2789571
MNB Time: 28.759886264801025  s
LinearSVC
START  1621475093.6496482


### STACKING - CHI SQUARE

In [None]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB

classifiers1 = [MultinomialNB(),
          DecisionTreeClassifier()]

titles1 = ['MNB',
          'Decision Tree']

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr.fit(X_train_kbest_chi2,y_train)
acc = stacker_lr.score(X_test_kbest_chi2,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")

In [None]:
# Meta Learner : Logistic Regression 
# Base Learners : DT + MNB + LogReg

classifiers = [MultinomialNB(),
           DecisionTreeClassifier(),
           LogisticRegression()]

titles = ['MNB',
          'Decision Tree',
          'Logistic Regression']

meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)
start = time.time()
print("Meta learner: Logistic Regression - Start ",start)
stacker_lr.fit(X_train_kbest_chi2,y_train)
acc = stacker_lr.score(X_test_kbest_chi2,y_test)
end = time.time()
print("Meta learner: Logistic Regression - End ",end)
print("Accuracy ",acc)
t = end - start
print('Time:', t," s")