#### Copyright (C) 2022 Sobhan Moradian Daghigh
#### Date: 2/2/2022

### Import Libraries

In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import BaggingClassifier

In [2]:
# Labels used for the printed accuracy rows, looked up by dataset position.
# The shorter names carry trailing spaces so every label is six characters
# wide and the printed scores line up.
datasets_name = [
    'Amazon',
    'IMDB  ',
    'Yelp  ',
]

### Logistic Regression

In [3]:
def logestic_regression(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets):
    '''
     Fit a default LogisticRegression on each dataset and print its test score.

     The four arguments are parallel iterables with one entry per dataset.
     They are walked together with zip, so iteration stops at the shortest.
     Each printed row is labelled with the module-level datasets_name entry
     at the same position.
    '''
    row_template = ' |_  {}: {:.2f}'

    print("Accuracy:")
    splits = zip(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)
    for position, (train_x, test_x, train_y, test_y) in enumerate(splits):
        model = LogisticRegression()
        model.fit(train_x, train_y)
        accuracy = model.score(test_x, test_y)
        print(row_template.format(datasets_name[position], accuracy))

### SVM-Linear 

In [4]:
def linear_svm(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets):
    '''
     Fit a linear-kernel SVC (C=1.7) on each dataset and print its test score.

     C=1.7 is the value an earlier GridSearchCV run reported as best:
     # tuned_parameters = {"C": np.arange(1, 5, 0.1)}
     # linear_svm = GridSearchCV(svm.SVC(kernel='linear'), tuned_parameters, verbose=1)

     The four arguments are parallel iterables with one entry per dataset;
     rows are labelled from the module-level datasets_name list by position.
    '''
    row_template = ' |_  {}: {:.2f}'

    print("Accuracy:")
    splits = zip(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)
    for position, (train_x, test_x, train_y, test_y) in enumerate(splits):
        # Use the object returned by fit(), exactly as the scoring step expects.
        fitted = svm.SVC(kernel='linear', C=1.7).fit(train_x, train_y)
        accuracy = fitted.score(test_x, test_y)
        print(row_template.format(datasets_name[position], accuracy))

### SVM-RBF

In [5]:
def rbf_svm(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets):
    '''
     Fit an RBF-kernel SVC (C=1.5, gamma=1) on each dataset and print its test score.

     The hyperparameters are the ones an earlier GridSearchCV run reported as best:
     # tuned_parameters = {"C": np.arange(1, 3, 0.1), "gamma": np.arange(1, 20)}
     # rbf_svm = GridSearchCV(svm.SVC(kernel='rbf'), tuned_parameters, cv=2)

     The four arguments are parallel iterables with one entry per dataset;
     rows are labelled from the module-level datasets_name list by position.
    '''
    row_template = ' |_  {}: {:.2f}'

    print("Accuracy:")
    splits = zip(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)
    for position, (train_x, test_x, train_y, test_y) in enumerate(splits):
        fitted = svm.SVC(kernel='rbf', C=1.5, gamma=1).fit(train_x, train_y)
        accuracy = fitted.score(test_x, test_y)
        print(row_template.format(datasets_name[position], accuracy))

### Decision Tree (DT)

In [6]:
def decision_tree(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets):
    '''
     Fit an entropy-criterion decision tree on each dataset and print its test score.

     max_depth=20 and min_samples_split=15 are the values an earlier
     GridSearchCV run reported as best:
     # tuned_parameters = {"max_depth": np.arange(1, 30), "min_samples_split": np.arange(2, 30)}
     # dtree = GridSearchCV(DecisionTreeClassifier(criterion='entropy'), tuned_parameters, cv=3)

     random_state is pinned to 0 for every tree. The four arguments are
     parallel iterables with one entry per dataset; rows are labelled from
     the module-level datasets_name list by position.
    '''
    row_template = ' |_  {}: {:.2f}'

    print("Accuracy:")
    splits = zip(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)
    for position, (train_x, test_x, train_y, test_y) in enumerate(splits):
        fitted = DecisionTreeClassifier(
            criterion='entropy',
            max_depth=20,
            min_samples_split=15,
            random_state=0,
        ).fit(train_x, train_y)
        accuracy = fitted.score(test_x, test_y)
        print(row_template.format(datasets_name[position], accuracy))

### Multinomial Naive Bayes

In [7]:
def multinomial_naive_bayes(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets):
    '''
     Fit a MultinomialNB (alpha=0.5) on each dataset and print its test score.

     alpha=0.5 is the value an earlier GridSearchCV run reported as best:
     # tuned_parameters = {"alpha": np.arange(1, 0, -0.1)}
     # mnb = GridSearchCV(MultinomialNB(), tuned_parameters, cv=2)

     The four arguments are parallel iterables with one entry per dataset;
     rows are labelled from the module-level datasets_name list by position.
    '''
    row_template = ' |_  {}: {:.2f}'

    print("Accuracy:")
    splits = zip(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)
    for position, (train_x, test_x, train_y, test_y) in enumerate(splits):
        fitted = MultinomialNB(alpha=0.5).fit(train_x, train_y)
        accuracy = fitted.score(test_x, test_y)
        print(row_template.format(datasets_name[position], accuracy))

### Bernoulli Naive Bayes

In [8]:
def bernoulli_naive_bayes(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets):
    '''
     Fit a default BernoulliNB on each dataset and print its test score.

     The four arguments are parallel iterables with one entry per dataset.
     They are walked together with zip, so iteration stops at the shortest.
     Rows are labelled from the module-level datasets_name list by position.
    '''
    row_template = ' |_  {}: {:.2f}'

    print("Accuracy:")
    splits = zip(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)
    for position, (train_x, test_x, train_y, test_y) in enumerate(splits):
        fitted = BernoulliNB().fit(train_x, train_y)
        accuracy = fitted.score(test_x, test_y)
        print(row_template.format(datasets_name[position], accuracy))

In [9]:
def get_estimator(name):
    '''
     Build a fresh, unfitted classifier for the given short name.

     Supported names (case-sensitive): 'LR', 'SVM-Linear', 'SVM-RBF', 'DT',
     'MNB', 'BNB'. A new estimator instance is created on every call.

     Raises ValueError for any other name. An unrecognised name used to fall
     through and return None; bagging() hands the result to BaggingClassifier,
     which (per the scikit-learn docs) substitutes its default base estimator
     for None, so a typo silently produced results for the wrong model.
    '''
    # Factories rather than instances, so only the requested estimator is built.
    factories = {
        # NOTE(review): 'LR' disables the intercept, while logestic_regression
        # uses the default LogisticRegression() - confirm this is intended.
        'LR': lambda: LogisticRegression(fit_intercept=False),
        'SVM-Linear': lambda: svm.SVC(kernel='linear', C=1.7),
        'SVM-RBF': lambda: svm.SVC(kernel='rbf', C=1.5, gamma=1),
        'DT': lambda: DecisionTreeClassifier(
            criterion='entropy', max_depth=20, min_samples_split=15, random_state=0
        ),
        'MNB': lambda: MultinomialNB(alpha=0.5),
        'BNB': lambda: BernoulliNB(),
    }

    try:
        factory = factories[name]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments such as a list.
        raise ValueError(
            'Unknown estimator name {!r}; expected one of: {}'.format(
                name, ', '.join(factories)
            )
        ) from None
    return factory()

### Bagging Ensemble

In [10]:
def bagging(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets, base_estimator):
    '''
     Fit a bagging ensemble on each dataset and print its test score.

     base_estimator is a short name understood by get_estimator (for example
     'DT' or 'SVM-RBF'); a fresh base estimator is requested for every dataset.
     Each ensemble uses n_estimators=8 and random_state=111.

     The first four arguments are parallel iterables with one entry per
     dataset; rows are labelled from the module-level datasets_name list by
     position.
    '''
    print("Accuracy:")
    for i, (x_tr, x_ts, y_tr, y_ts) in enumerate(zip(x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)):
        # The estimator is passed positionally on purpose. BaggingClassifier's
        # first parameter was the keyword `base_estimator` before scikit-learn
        # 1.2 and is `estimator` since; the old keyword was removed in 1.4 and
        # raises TypeError there. Positional passing works on both.
        bag = BaggingClassifier(get_estimator(base_estimator), n_estimators=8, random_state=111)
        bag.fit(x_tr, y_tr)
        score = bag.score(x_ts, y_ts)
        print(' |_  {}: {:.2f}'.format(datasets_name[i], score))

In [11]:
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, x_tr, y_tr, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')