In [None]:
import numpy as np
import itertools
from scipy import sparse
from scipy.sparse import hstack, vstack
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
np.random.seed(1234)


# Cleaning Data: 20 News Group

In [None]:
# Load the 20 Newsgroups corpus and turn it into bag-of-words count matrices.

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

# Training split; the `remove` argument asks scikit-learn to drop headers,
# footers and quoted replies from each post.
twenty_train = fetch_20newsgroups(subset='train',remove =['headers','footers','quotes'])

# The vocabulary is learned from the training documents only.
count_vect_train = CountVectorizer()
X_train_counts = count_vect_train.fit_transform(twenty_train.data)
Y_train_counts = twenty_train.target

twenty_test = fetch_20newsgroups(subset='test',remove =['headers','footers','quotes'])
# The test documents are encoded with the vectorizer fitted on the training
# data (no second CountVectorizer), so both matrices share the same columns.
X_test_counts = count_vect_train.transform(twenty_test.data)
Y_test_counts = twenty_test.target




Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Splitting the training dataset for the 20 news group.
#
# Each call below is an independent random draw from the full training set, so
# the subsets are not nested (the 20% subset is not necessarily contained in
# the 40% one). No random_state is passed: reproducibility relies on the
# global NumPy seed set in the import cell and on running these calls in order.
# The second and fourth outputs (X_20, Y_20, ...) are the unused remainders.

# 20% of the training data
X_train20,X_20,Y_train20,Y_20 = train_test_split(X_train_counts, Y_train_counts, train_size = 0.2)

# 40% of the training data
X_train40,X_40,Y_train40,Y_40 = train_test_split(X_train_counts, Y_train_counts, train_size = 0.4)

# 60% of the training data
X_train60,X_60,Y_train60,Y_60 = train_test_split(X_train_counts, Y_train_counts, train_size = 0.6)

# 80% of the training data
X_train80,X_80,Y_train80,Y_80 = train_test_split(X_train_counts, Y_train_counts, train_size = 0.8)


# Cleaning Data: IMDb Reviews

In [None]:
# IMDB Reviews
# Used: https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184

from google.colab import drive

drive.mount('/content/drive/',force_remount=True)
!ls "/content/drive/My Drive/COMP551/"


reviews_train = []
for line in open('/content/drive/My Drive/COMP551/full_train.txt', 'r'):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('/content/drive/My Drive/COMP551/full_test.txt', 'r'):
    reviews_test.append(line.strip())


Mounted at /content/drive/
breast_cancer_wisconsin.csv  full_test.txt  full_train.txt  hepatitis.csv


In [None]:
# Preprocess IMDB data

import re
# Punctuation that is deleted outright. Raw strings are used because `\[`,
# `\s`, `\-` and `\/` are regex escapes, not Python string escapes; in a
# normal string literal they raise a SyntaxWarning on Python 3.12+.
REPLACE_NO_SPACE = re.compile(r"[.;:!\'?,\"()\[\]]")
# Doubled HTML line breaks, hyphens and slashes each become a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    """Lower-case and strip punctuation from every review.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order. Characters matched by
        REPLACE_NO_SPACE are removed first, then every match of
        REPLACE_WITH_SPACE is replaced by a space.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", line.lower()))
        for line in reviews
    ]

# Apply the same cleaning to both splits so train and test text follow one convention.
reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)




In [None]:
# Vectorizing IMDB data
# The vocabulary is fitted on the training reviews only and then reused to
# encode both splits, so imdb_train and imdb_test share the same columns.

cv = CountVectorizer(binary=True) # binary=True records word presence (0/1) instead of occurrence counts
cv.fit(reviews_train_clean)
imdb_train = cv.transform(reviews_train_clean)
imdb_test = cv.transform(reviews_test_clean)

In [None]:
# IMDB Reviews splitting the data
#
# Labels are positional: rows 0..12499 are labelled 1 and rows 12500..24999
# are labelled 0.
# NOTE(review): this assumes full_train.txt / full_test.txt each hold 25,000
# reviews with the positive half first -- confirm against the data files.

target = [1 if i < 12500 else 0 for i in range(25000)]

# 100% of training data (the same label list is reused for the test split)
x_train, y_train = imdb_train, target
x_test, y_test = imdb_test, target

# Every subset below stacks the first k rows of each label block, so the two
# classes stay balanced; the matching label list is rebuilt with the same
# "first half 1, second half 0" layout.

# 20 % of training data
x_train20 = vstack((x_train[:2500],x_train[12500:15000]), format='csr')
target20 = [1 if i < 2500 else 0 for i in range(5000)]
y_train20 = target20

# 40 % of training data
x_train40 = vstack((x_train[:5000],x_train[12500:17500]), format='csr')
target40 = [1 if i < 5000 else 0 for i in range(10000)]
y_train40 = target40

# 60 % of training data
x_train60 = vstack((x_train[:7500],x_train[12500:20000]), format='csr')
target60 = [1 if i < 7500 else 0 for i in range(15000)]
y_train60 = target60

# 80 % of training data
x_train80 = vstack((x_train[:10000],x_train[12500:22500]), format='csr')
target80 = [1 if i < 10000 else 0 for i in range(20000)]
y_train80 = target80


# Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes
# Used: https://stackoverflow.com/questions/60969884/multinomial-naive-bayes-for-python-from-scratch
from sklearn.base import BaseEstimator, ClassifierMixin

def logsumexp(Z):
  """Return log(sum(exp(Z))) along axis 0.

  The maximum along axis 0 is subtracted before exponentiating and added back
  afterwards, which keeps the exponentials from overflowing.
  """
  peak = np.max(Z, axis=0)
  shifted_exp = np.exp(Z - peak)
  return peak + np.log(np.sum(shifted_exp, axis=0))

class MultinomialNaiveBayes(BaseEstimator, ClassifierMixin):
  """Multinomial Naive Bayes classifier with additive smoothing.

  Parameters
  ----------
  alpha : float, default=0.001
      Smoothing constant added to every per-class feature count.

  Attributes set by ``fit``
  -------------------------
  classes : ndarray, sorted unique labels.
  priors : ndarray (n_classes,), fraction of training rows per class.
  likelihoods : ndarray (n_classes, n_features), smoothed and normalised
      per-class feature frequencies.
  """

  def __init__(self, alpha=0.001):
    self.alpha = alpha

  def fit(self, X_train, y_train):
    """Estimate class priors and smoothed feature likelihoods.

    ``X_train`` is a 2-D count matrix supporting boolean row indexing (a CSR
    matrix in this notebook); ``y_train`` is any array-like of labels.
    Returns ``self`` so calls can be chained, as scikit-learn estimators do.
    """
    y_train = np.asarray(y_train)  # plain lists are accepted as labels
    H, L = X_train.shape
    self.classes = np.unique(y_train)
    numberOfClasses = len(self.classes)

    self.priors = np.zeros(numberOfClasses)
    self.likelihoods = np.zeros((numberOfClasses, L))

    for i, c in enumerate(self.classes):
      X_train_c = X_train[y_train == c]
      self.priors[i] = X_train_c.shape[0] / H
      # Column totals are computed once; ravel() flattens the 1 x L matrix
      # that sparse .sum(axis=0) returns.
      smoothed = np.asarray(X_train_c.sum(axis=0)).ravel() + self.alpha
      self.likelihoods[i, :] = smoothed / smoothed.sum()
    return self

  def _joint_log_likelihood(self, X):
    """Return the (n_samples, n_classes) array of log prior + log likelihood."""
    # One sparse-by-dense product replaces the former per-row, per-class loop.
    return np.asarray(X @ np.log(self.likelihoods).T) + np.log(self.priors)

  def predict(self, X_test):
    """Return a list with the most probable class label for each row of ``X_test``."""
    best = np.argmax(self._joint_log_likelihood(X_test), axis=1)
    return list(self.classes[best])

  def helperPredict(self, x_test):
    """Predict the label of a single sample (kept for existing callers).

    ``predict`` no longer goes through this method; it computes the same
    scores for all rows at once.
    """
    posteriors = []
    for i, c in enumerate(self.classes):
      logprior = np.log(self.priors[i])
      loglikelihood = self.likelihoodCalculator(self.likelihoods[i,:], x_test)
      posteriors.append(np.sum(loglikelihood) + logprior)
    return self.classes[np.argmax(posteriors)]

  def likelihoodCalculator(self, loglike, x_test):
    """Return the element-wise product of ``log(loglike)`` and ``x_test``.

    Despite its name, ``loglike`` holds plain likelihoods; the logarithm is
    taken here.
    """
    A = np.log(loglike)
    sA = sparse.csr_matrix(A)
    return sA.multiply(x_test)

  def score(self, X_test, y_test):
    """Return the fraction of rows in ``X_test`` whose prediction equals ``y_test``."""
    y_pred = np.asarray(self.predict(X_test))
    count = int(np.sum(y_pred == np.asarray(y_test)))
    accuracy = count/len(y_test)
    return accuracy

# Multinomial Naive Bayes Experiments

In [None]:
# Experiment for IMDB Reviews with optimal alpha
# The same estimator is refitted on each growing subset and always scored on
# the full IMDB test set.

model = MultinomialNaiveBayes(alpha=1.0)
imdb_nb_subsets = [
    (20, x_train20, y_train20),
    (40, x_train40, y_train40),
    (60, x_train60, y_train60),
    (80, x_train80, y_train80),
    (100, x_train, y_train),
]
for percent, subset_x, subset_y in imdb_nb_subsets:
  model.fit(subset_x, subset_y)
  accuracy = model.score(x_test,y_test)
  print(f'Test accuracy for {percent}% training data: {accuracy}')





In [None]:
# Experiment for twenty news group data with optimal alpha
# The same estimator is refitted on each training subset and always scored on
# the full 20 Newsgroups test split.

model = MultinomialNaiveBayes(alpha = 0.011)
news_nb_subsets = [
    (20, X_train20, Y_train20),
    (40, X_train40, Y_train40),
    (60, X_train60, Y_train60),
    (80, X_train80, Y_train80),
    (100, X_train_counts, Y_train_counts),
]
for percent, subset_x, subset_y in news_nb_subsets:
  model.fit(subset_x, subset_y)
  accuracy = model.score(X_test_counts,Y_test_counts)
  print(f'Test accuracy for {percent}% training data: {accuracy}')


# Logistic Regression Experiments

In [None]:
# Logistic regression IMDB Reviews
# Every training subset is evaluated on the full IMDB test set for each
# value of C (scikit-learn's inverse regularisation strength).

imdb_lr_subsets = [
    (100, x_train, y_train),
    (80, x_train80, y_train80),
    (60, x_train60, y_train60),
    (40, x_train40, y_train40),
    (20, x_train20, y_train20),
]
for percent, subset_x, subset_y in imdb_lr_subsets:
    # Header line so the accuracy lines can be attributed to a training size.
    print(f'Logistic regression with {percent}% of training data')
    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        logistic_reg = LogisticRegression(C=c, max_iter=1000)
        logistic_reg.fit(subset_x, subset_y)
        print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_test, logistic_reg.predict(x_test))))


In [None]:
# Logistic regression twenty news group data


# Logistic regression with 100% of training data, scored on the test split.
# C is scikit-learn's inverse regularisation strength; the candidate values
# are tried in the order listed (they are not sorted).
for c in [0.18,0.20,0.15,0.01, 0.05, 0.25, 0.5, 1]:
    logistic_reg = LogisticRegression(C=c, max_iter=1000)
    logistic_reg.fit(X_train_counts, Y_train_counts)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(Y_test_counts, logistic_reg.predict(X_test_counts))))


# Logistic Regression from Scratch

In [None]:
# Logistic Regression from scratch

import numpy as np
#%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
import warnings
warnings.filterwarnings('ignore')

def logistic(z):
    """Logistic (sigmoid) function 1 / (1 + exp(-z)), applied element-wise."""
    return 1./ (1 + np.exp(-z))


def gradient(self, x, y):
    """Gradient of the mean cross-entropy cost with respect to ``self.w``.

    ``x`` is an (N, D) design matrix (dense array or SciPy sparse matrix) and
    ``y`` the length-N vector of 0/1 targets. Attached to
    ScratchLogisticRegression as a method below.
    """
    N,D = x.shape
    # `@` is used instead of np.dot: np.dot does not perform a matrix product
    # on SciPy sparse matrices, whereas `@` works for dense and sparse alike.
    yh = logistic(x @ self.w)
    grad = x.T @ (yh - y) / N
    return grad


def cost_fn(x, y, w):
    """Mean cross-entropy cost of weights ``w`` on data ``(x, y)``, with y in {0, 1}."""
    z = x @ w
    # logaddexp(0, t) == log(1 + exp(t)) but does not overflow for large |t|,
    # which previously turned 0 * inf into nan.
    J = np.mean(y * np.logaddexp(0, -z) + (1-y) * np.logaddexp(0, z))
    return J


class ScratchLogisticRegression:
  """Binary logistic regression trained with full-batch gradient descent.

  Parameters
  ----------
  add_bias : bool
      Append a constant-1 column to the inputs in both ``fit`` and ``predict``.
  learning_rate : float
      Step size of each gradient-descent update.
  epsilon : float
      Training stops once the gradient norm is no larger than this value.
  max_iters : number
      Upper bound on the number of updates.
  verbose : bool
      Print a short summary when training stops.
  """

  def __init__(self, add_bias=True, learning_rate=0.1, epsilon=1e-4, max_iters=1e5, verbose=False):
    self.add_bias = add_bias
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_iters = max_iters
    self.verbose = verbose

  def _with_bias(self, x):
    """Return ``x`` with an extra trailing column of ones (dense or sparse)."""
    N = x.shape[0]
    if sparse.issparse(x):
      return sparse.hstack([x, sparse.csr_matrix(np.ones((N, 1)))], format='csr')
    return np.column_stack([x, np.ones(N)])

  def fit(self, x, y):
    """Learn ``self.w`` from features ``x`` and 0/1 targets ``y``; returns ``self``."""
    if x.ndim ==1:
      x = x[:, None]
    if self.add_bias:
      # The bias is a column, matching predict(); the former np.vstack call
      # tried to append a row and failed on shape mismatch.
      x = self._with_bias(x)
    y = np.asarray(y)
    N,D = x.shape
    self.w = np.zeros(D)
    g = np.inf
    t = 0
    # gradient descent
    while np.linalg.norm(g) > self.epsilon and t < self.max_iters:
      g = self.gradient(x, y)
      self.w = self.w - self.learning_rate * g
      t += 1

    if self.verbose:
      print(f'Terminated after {t} iterations, norm of gradient: {np.linalg.norm(g)}')
      print(f'weight found: {self.w}')

    return self

  def predict(self, x):
    """Return the predicted probability of class 1 for each row of ``x``."""
    if x.ndim==1:
      x = x[:, None]
    if self.add_bias:
      x = self._with_bias(x)
    # Previously referenced the undefined name `xs`, raising NameError.
    yh = logistic(x @ self.w)
    return yh

# gradient() is defined at module level and attached here as a method.
ScratchLogisticRegression.gradient = gradient

  


In [None]:
# comparing different learning rates for Logistic Regression
# This is our implementation, but we were unable to run it because the kernel ran out of RAM.

def L2_loss(y_pred, y_actual):
  """Return the element-wise half squared error 0.5 * (y_actual - y_pred)**2.

  Both arguments may be scalars or array-likes of matching shape; the result
  has one entry per prediction (it is not averaged).
  """
  # The original `(1/2)(...)` tried to call the float 0.5 and raised
  # TypeError; the multiplication is now explicit.
  return 0.5 * (np.asarray(y_actual) - np.asarray(y_pred)) ** 2

def fit_and_run(model, x_train, y_train, x_test, y_test):
  """Fit ``model`` on the training pair and return ``L2_loss`` of its test predictions."""
  model.fit(x_train, y_train)
  return L2_loss(model.predict(x_test), y_test)

# Train one model per learning rate and report its mean test loss.
# NOTE(review): Y_train_counts appears to hold multi-class newsgroup ids,
# while ScratchLogisticRegression models a single 0/1 target -- confirm that
# this pairing is intended.
for lr in (0.001, 0.01, 0.1, 1.0):
  model = ScratchLogisticRegression(add_bias=False, learning_rate=lr)
  loss = fit_and_run(model, X_train_counts, Y_train_counts, X_test_counts, Y_test_counts)
  # The loss was computed but never shown before; print its mean.
  print(f'Loss for learning rate {lr}: {np.mean(loss)}')



# Linear Regression Experiments

In [None]:
# Linear Regression for imdb data
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def runLinear(x, y):
  """Fit ordinary least squares on ``(x, y)`` and print its test-set metrics.

  Evaluation uses the notebook-level ``x_test`` / ``y_test`` globals. Prints
  the estimator's ``score`` and the mean squared error; returns nothing.
  """
  regressor = LinearRegression()
  regressor.fit(x, y)
  y_pred = regressor.predict(x_test)
  score = regressor.score(x_test, y_test)
  print(f'score: {score}')
  print(f'mean squared error: {mean_squared_error(y_test, y_pred)}')

# Run the linear-regression baseline on each IMDB training subset; x and y
# end up bound to the full training set, as before.
imdb_linear_subsets = [
    (20, x_train20, y_train20),
    (40, x_train40, y_train40),
    (60, x_train60, y_train60),
    (80, x_train80, y_train80),
    (100, x_train, y_train),
]
for percent, x, y in imdb_linear_subsets:
  print(f'Linear regression for {percent}% imdb training data')
  runLinear(x, y)




# Cross Validation

In [None]:
import numpy as np
from scipy.sparse import csr_matrix, vstack
# split the data for cross validation
def cross_validation_split(x_data: csr_matrix, y_data, num_folds):
  """Shuffle the rows of ``x_data`` and cut them into equally sized folds.

  Parameters
  ----------
  x_data : scipy sparse matrix, shape (num_instances, num_features)
  y_data : array-like of length num_instances
  num_folds : int

  Returns
  -------
  folds_x : list of ``num_folds`` CSR matrices, each with
      ``num_instances // num_folds`` rows.
  folds_y : ndarray of shape (num_folds, num_instances // num_folds) holding
      the matching labels.

  The ``num_instances % num_folds`` left-over rows of the permutation are not
  placed in any fold. Shuffling uses the global NumPy random state.

  Raises
  ------
  ValueError
      If there are fewer rows than folds.
  """
  num_instances = x_data.shape[0]
  n_val = num_instances // num_folds # num of data samples in each split
  if n_val == 0:
    raise ValueError(
        f'cannot split {num_instances} rows into {num_folds} non-empty folds')
  x_rows = x_data.tocsr()        # row indexing below needs CSR
  y_data = np.asarray(y_data)    # lets plain lists be index-selected
  inds = np.random.permutation(num_instances) # shuffle
  folds_x = []
  folds_y = []
  for f in range(num_folds):
    print(f'Creating fold {f}')
    fold_inds = inds[f*n_val : (f+1)*n_val]
    # One fancy-indexing call replaces the former row-by-row vstack, which
    # was quadratic and left every fold in COO format (`.tocsr` was never
    # actually called).
    folds_x.append(x_rows[fold_inds])
    folds_y.append(y_data[fold_inds])
  return folds_x, np.array(folds_y)
    

In [None]:
# MSE loss used by the cross-validation cells below.
def loss(y, yh):
  """Return the mean squared difference between targets ``y`` and predictions ``yh``."""
  return np.mean((y-yh)**2)

In [None]:
# Cross Validation kfoldCV
import numpy as np
from scipy.sparse import csr_matrix, vstack



def kfoldCV(model, folds_x, folds_y):
  """Run k-fold cross-validation and return the per-fold validation loss.

  Parameters
  ----------
  model : object with ``fit(X, y)`` and ``predict(X)``
  folds_x : sequence of sparse matrices, one per fold
  folds_y : sequence of label arrays aligned with ``folds_x``

  Returns
  -------
  list
      ``loss(validation_labels, predictions)`` for each fold, in fold order
      (``loss`` is the notebook-level loss function).

  Raises
  ------
  ValueError
      If fewer than two folds are supplied (no training data would remain).
  """
  num_folds = len(folds_x)
  print(f'Num folds: {num_folds}')
  if num_folds < 2:
    raise ValueError('kfoldCV needs at least two folds')
  # go through each fold and run
  err = []
  for f in range(num_folds):
    # get validation set
    validation_set_x = folds_x[f]
    validation_set_y = folds_y[f]
    # Training data = every fold except f. The earlier slicing
    # (range(1, f-1), folds_y[:f-1]) dropped fold f-1, so the feature and
    # label counts disagreed.
    others = [i for i in range(num_folds) if i != f]
    # format='csr' because the stacked matrix must support row indexing.
    training_set_x = vstack([folds_x[i] for i in others], format='csr')
    training_set_y = np.concatenate([np.ravel(folds_y[i]) for i in others])

    print(f'Running on fold {f}')
    #then run model
    model.fit(training_set_x, training_set_y)

    pred = model.predict(validation_set_x)
    f_loss = loss(validation_set_y, pred)
    err.append(f_loss)
    print(f'Fold {f} loss: {f_loss}')

  return err

In [None]:
#actually do the cross validation
from sklearn.utils.fixes import loguniform
import matplotlib.pyplot as plt

def cross_validate(x_train, y_train, x_test, y_test):
  """Compare smoothing values for MultinomialNaiveBayes with 5-fold CV.

  For each alpha, the per-fold validation loss (via ``kfoldCV``) and the test
  loss of a model refitted on all of ``x_train`` are collected, then both
  curves are plotted against alpha. Losses come from the notebook-level
  ``loss`` function. Returns nothing.
  """
  num_folds = 5
  (num_instances, num_features) = x_train.shape
  print(f'num instances: {num_instances}, num_features: {num_features}')

  # get folds for 5-fold cross validation
  folds_x, folds_y = cross_validation_split(x_train, y_train, num_folds)

  # this is where we try different hyperparams
  alpha_list =[0.0001, 0.001, 0.01, 0.1, 1.0]
  err_test = np.zeros(len(alpha_list))
  err_valid = np.zeros((len(alpha_list), num_folds))
  for i, a in enumerate(alpha_list):
    print(f'alpha: {a}')
    model = MultinomialNaiveBayes(alpha = a)
    err_valid[i] = kfoldCV(model, folds_x, folds_y)
    #test error
    model.fit(x_train, y_train)
    err_test[i] = loss(y_test, model.predict(x_test))

  plt.plot(alpha_list, err_test, label='test')
  # plt.plot has no `axis` keyword; average over the folds explicitly to get
  # one validation value per alpha.
  plt.plot(alpha_list, np.mean(err_valid, axis=1), label='validation')
  plt.xscale('log')  # the alphas span four orders of magnitude

  plt.legend()
  plt.xlabel('alpha')
  plt.ylabel('mean squared error')
  plt.show()
    


In [None]:
# Cross Validation from scratch

import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# twenty news group data
# Run the from-scratch cross-validation on the count matrices built earlier.
train_x = X_train_counts
train_y = Y_train_counts

test_x = X_test_counts
test_y = Y_test_counts

cross_validate(train_x, train_y, test_x, test_y)

# imdb data
# The IMDB labels are Python lists, so they are converted to NumPy arrays
# before being passed to cross_validate.
train_x = x_train
train_y = np.array(y_train)

test_x = x_test
test_y = np.array(y_test)

cross_validate(train_x, train_y, test_x, test_y)



In [None]:
#cross validation using sklearn

import numpy as np

from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier

# 20 data
X, y = X_train_counts, Y_train_counts

# build a classifier (the notebook's own Naive Bayes with its default alpha;
# the search below overrides alpha per candidate)
clf = MultinomialNaiveBayes()


# Utility function to report best scores
def report(results, n_top=3):
    """Print the candidates ranked 1..``n_top`` from a ``cv_results_``-style mapping.

    For each rank, every candidate whose ``rank_test_score`` equals that rank
    is printed with its mean and standard deviation of the test score and its
    parameter setting, followed by an empty line.
    """
    for rank in range(1, n_top + 1):
        tied_candidates = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied_candidates:
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][idx],
                          results['std_test_score'][idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")


# specify parameters and distributions to sample from
# (four explicit alpha values; n_iter_search below is also 4)
param_dist = {'alpha': [0.001, 0.01, 0.1, 1.0]}

# run randomized search (5-fold CV, all available cores via n_jobs=-1)
n_iter_search = 4
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, n_jobs = -1, cv=5)
print(f'Running CV on 20 data set')
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# imdb data (X and y are rebound; the same clf and param_dist are reused)
X, y = x_train, y_train

# run randomized search
# NOTE(review): unlike the search above, n_jobs is not passed here -- confirm
# whether that is deliberate.
n_iter_search = 4
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5)
print(f'Running CV on imdb data set')
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)


In [None]:
# logistic regression cross validation
# Relies on names created by the previous cell: RandomizedSearchCV, time and
# report. param_dist, clf, n_iter_search and random_search are rebound here.

# specify parameters and distributions to sample from
# (six explicit C values; n_iter_search below is also 6)
param_dist = {'C': [0.001, 0.01, 0.1, 0.25, 0.5, 1.0]}

# run randomized search
clf = LogisticRegression(max_iter = 1000)
n_iter_search = 6
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, n_jobs = -1, cv=5)

# The same search object is fitted twice; the second fit overwrites
# cv_results_, so each report() call must follow its own fit.
X, y = X_train_counts, Y_train_counts
print(f'Running CV on 20 data set for logistic regression')
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

X, y = x_train, y_train
print(f'Running CV on imdb data set for logistic regression')
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

