## TF IDF - Real Train & Test

### TF-IDF on Text Features

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

train_file = "./COMP30027_2021_Project2_datasets/recipe_train.csv"
test_file = "./COMP30027_2021_Project2_datasets/recipe_test.csv"

train_data = pd.read_csv(train_file)
X_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]
test_data = pd.read_csv(test_file)
X_test = test_data.iloc[:,:]

train_n_steps = pd.DataFrame(X_train.n_steps)
train_n_steps.reset_index(drop=True, inplace=True)
test_n_steps = pd.DataFrame(X_test.n_steps)
test_n_steps.reset_index(drop=True, inplace=True)

train_n_ingredients = pd.DataFrame(X_train.n_ingredients)
train_n_ingredients.reset_index(drop=True, inplace=True)
test_n_ingredients = pd.DataFrame(X_test.n_ingredients)
test_n_ingredients.reset_index(drop=True, inplace=True)

# tfidf vectorizer on feature 'name'
vec = TfidfVectorizer(stop_words='english')
X_train_name = vec.fit_transform(X_train.name)
X_test_name = vec.transform(X_test.name)
df_train_name = pd.DataFrame(X_train_name.todense(),columns = vec.get_feature_names())
df_test_name = pd.DataFrame(X_test_name.todense(),columns = vec.get_feature_names())

# tfidf vectorizer on feature 'steps'
X_train_steps = vec.fit_transform(X_train.steps)
X_test_steps = vec.transform(X_test.steps)
df_train_steps = pd.DataFrame(X_train_steps.todense(),columns = vec.get_feature_names())
df_test_steps = pd.DataFrame(X_test_steps.todense(),columns = vec.get_feature_names())

# tfidf vectorizer on feature 'ingredients'
X_train_ing = vec.fit_transform(X_train.ingredients)
X_test_ing = vec.transform(X_test.ingredients)
df_train_ing = pd.DataFrame(X_train_ing.todense(),columns = vec.get_feature_names())
df_test_ing = pd.DataFrame(X_test_ing.todense(),columns = vec.get_feature_names())

train = pd.concat([df_train_name, df_train_steps,df_train_ing,train_n_steps,train_n_ingredients],axis=1)
test = pd.concat([df_test_name, df_test_steps,df_test_ing,test_n_steps,test_n_ingredients],axis=1)


### Individual Classifiers - All Features

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from time import ctime

models = [MultinomialNB(),
          DecisionTreeClassifier(),
          LogisticRegression()]
          
titles = ['MNB',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(train,y_train)
    result = model.predict(test)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_TFIDF_res_'+title+'_full.csv')

MNB
START  1621517737.2941608
END  1621517810.416828
MNB Time: 73.12266707420349  s
Decision Tree
START  1621517810.560971
END  1621517952.0632908
Decision Tree Time: 141.50231981277466  s
Logistic Regression
START  1621517952.111003
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
END  1621519735.858182
Logistic Regression Time: 1783.747179031372  s


### Stacking - All Features

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
classifiers = [('MNB',MultinomialNB()),('DT',DecisionTreeClassifier())]
stack_clf = StackingClassifier(estimators=classifiers, final_estimator=LogisticRegression(),cv=5).fit(train, y_train)
result = stack_clf.predict(test)
df_res = pd.DataFrame(result, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_TFIDF_stack_sklearn_Log_Reg.csv')

### CHI SQUARE , K=1000

In [4]:
from sklearn.feature_selection import SelectKBest, chi2
import time
from time import ctime

kbest_chi2 = SelectKBest(chi2, k=1000).fit(train, y_train)
X_train_kbest_chi2 = kbest_chi2.transform(train)
X_test_kbest_chi2 = kbest_chi2.transform(test)


### Individual Classifiers - CHI SQUARE , K=1000

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Predict using each individual classifiers
models = [MultinomialNB(),
          DecisionTreeClassifier(),
          LogisticRegression()]
          
titles = ['MNB',
          'Decision Tree',
          'Logistic Regression']

for title, model in zip(titles, models):
    print(title)
    start = time.time()
    print("START ",start)
    model.fit(X_train_kbest_chi2,y_train)
    result = model.predict(X_test_kbest_chi2)
    end = time.time()
    print("END ",end)
    t = end - start
    print(title,'Time:', t," s")
    df_res_full = pd.DataFrame(result, columns = ['duration_label'])
    df_res_full.index = df_res_full.index + 1
    df_res_full.index.name='id'
    df_res_full.to_csv('df_TFIDF_res_chi2_'+title+'.csv')
    

MNB
START  1621520571.495339
END  1621520571.795874
MNB Time: 0.3005352020263672  s
Decision Tree
START  1621520571.833409
END  1621520582.188133
Decision Tree Time: 10.354723930358887  s
Logistic Regression
START  1621520582.217689
END  1621520593.2858791
Logistic Regression Time: 11.068190097808838  s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Stacking - CHI SQUARE , K =1000

In [5]:
from sklearn.ensemble import StackingClassifier
classifiers = [('MNB',MultinomialNB()),('DT',DecisionTreeClassifier())]
stack_clf = StackingClassifier(estimators=classifiers, final_estimator=LogisticRegression(),cv=5).fit(X_train_kbest_chi2, y_train)
result = stack_clf.predict(X_test_kbest_chi2)
df_res = pd.DataFrame(result, columns = ['duration_label'])
df_res.index = df_res.index + 1
df_res.index.name='id'
df_res.to_csv('df_TFIDF_chi2_stack_sklearn_Log_Reg.csv')