Name, Student ID: Miow Fong Sim, 881623

# COMP30027 Machine Learning Assignment 2

Besides using pre-computed features provided for Assignment 2, Practical Workshop Week 9 was also used as a reference.

# Import Libraries

In [1]:
import numpy as np
import pandas as pd

# sklearn
import sklearn
from sklearn.model_selection import cross_val_score, train_test_split 
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import accuracy_score

# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# Logistic Regression
from sklearn.linear_model import LogisticRegression

import pickle
import scipy

# Part 1: Meta Files

## Meta Files : 'review_meta_train.csv' and 'review_meta_test.csv'

In [2]:
# Load the training metadata; the CSV is read as ISO-8859-1.
meta_train_dataframe = pd.read_csv('review_meta_train.csv', encoding='ISO-8859-1')

# TRAINING
# The first four columns (date, review_id, reviewer_id, business_id) are skipped.
# The last column is used as the label; everything in between is the feature set.
meta_Y_train = meta_train_dataframe.iloc[:, -1]
meta_X_train = meta_train_dataframe.iloc[:, 4:-1]

# TESTING
# The test file is sliced from column 4 onwards (no trailing label column is dropped).
meta_test_dataframe = pd.read_csv('review_meta_test.csv', encoding='ISO-8859-1')
meta_X_test = meta_test_dataframe.iloc[:, 4:]

## 1) Multinomial Naive Bayes

### 10-fold Cross Validation

In [3]:
# Multinomial NB on the meta features, scored by mean 10-fold CV accuracy.
mnb = MultinomialNB()
scores = cross_val_score(
    estimator=mnb,
    X=meta_X_train,
    y=meta_Y_train,
    scoring='accuracy',
    cv=10,
)
print(scores.mean())

0.6886133485144689


### Meta Test Prediction

In [4]:
# Train on all meta training rows, then label the meta test rows.
mnb_classifier = mnb.fit(meta_X_train, meta_Y_train)
mnb_prediction = mnb_classifier.predict(meta_X_test)

# Store predictions in .csv: 1-based instance ids alongside the predicted rating.
instance_ids = np.arange(1, len(mnb_prediction) + 1)
result = pd.DataFrame({'Instance_id': instance_ids, 'rating': mnb_prediction})
result.to_csv("MultinomialNB_meta_prediction.csv", index=False)

## 2) Multi-Class Logistic Regression

### 10-fold Cross Validation

In [5]:
# Multinomial logistic regression on the meta features, scored by mean 10-fold CV accuracy.
mlr = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
)
scores = cross_val_score(
    estimator=mlr,
    X=meta_X_train,
    y=meta_Y_train,
    scoring='accuracy',
    cv=10,
)
print(scores.mean())

0.6915703308676685


### Meta Test Prediction


In [6]:
# Train on all meta training rows, then label the meta test rows.
mlr_classifier = mlr.fit(meta_X_train, meta_Y_train)
mlr_prediction = mlr_classifier.predict(meta_X_test)


# Store predictions in .csv
# Instance ids are 1-based and sized from the predictions actually being written
# (mlr_prediction), not from another model's output.
result = pd.DataFrame(np.arange(1, len(mlr_prediction) + 1), columns = ['Instance_id'])
result['rating'] = mlr_prediction

result.to_csv("MulticlassLR_meta_prediction.csv", index = False)

# Part 2: Text Files 

## Count Vectoriser Files



- train_countvectorizer.pkl
- Contains the CountVectorizer extracted using the review text of the
  training data.


In [7]:
# Load the pickled CountVectorizer and keep its term -> column-index mapping.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open("train_countvectorizer.pkl", "rb") as vectorizer_file:
    vocab = pickle.load(vectorizer_file)
vocab_dict = vocab.vocabulary_



In [8]:
# Number of distinct vocabulary entries, i.e. the number of possible feature values.
vocab_size = len(vocab_dict)
print(vocab_size)

41648


- review_text_train_vec.npz
- Contains a sparse matrix of the Bag-of-Word representation of the review text for TRAINING data.
- [28068 * size of vocabulary]

In [9]:
# Bag-of-words matrix for the training reviews, loaded as a scipy sparse matrix.
X_train = scipy.sparse.load_npz(file='review_text_train_vec.npz')

- review_text_test_vec.npz
- contains a sparse matrix of the Bag-of-Word representation of the review text for TEST data
- [7018 * size of vocabulary]

In [10]:
# Bag-of-words matrix for the test reviews, loaded as a scipy sparse matrix.
X_test = scipy.sparse.load_npz(file='review_text_test_vec.npz')

### Multinomial Naive Bayes

#### 10-fold Cross Validation Using 41648 features

In [11]:
# Using 41648 features
# Fit on the sparse matrix directly: MultinomialNB accepts scipy sparse input, so
# densifying the whole bag-of-words matrix with .todense() only costs memory (and
# yields an np.matrix, which recent scikit-learn versions reject).
# A clone of `mnb` is fitted so the model kept for test prediction is its own object
# and is not overwritten when `mnb` is refitted in later cells.
text_mnb_classifier = sklearn.base.clone(mnb).fit(X_train, meta_Y_train)

# cross_val_score clones the estimator for every fold, so the unfitted template is enough.
text_accuracy_scores_mnb = cross_val_score(mnb, X_train, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multinomial NB average accuracy using 41648 features is ", text_accuracy_scores_mnb.mean())

Multinomial NB average accuracy using 41648 features is  0.8390337159849587


#### Prediction of X_test

In [12]:
# Predict straight from the sparse test matrix; converting it with .todense() is
# unnecessary for MultinomialNB and would allocate the full dense array.
text_mnb_prediction = text_mnb_classifier.predict(X_test)

In [14]:
# Display the predicted ratings for a quick visual check.
text_mnb_prediction

array([5, 5, 1, ..., 5, 5, 5], dtype=int64)

In [15]:
# Number of predictions produced for the test matrix.
len(text_mnb_prediction)

7018

In [16]:
# Store predictions in .csv
# Instance ids are 1-based and sized from the predictions actually being written
# (text_mnb_prediction), not from the meta-feature model's output.
result = pd.DataFrame(np.arange(1, len(text_mnb_prediction) + 1), columns = ['Instance_id'])
result['rating'] = text_mnb_prediction

result.to_csv("MultinomialNB_text_prediction.csv", index = False)

### Multi-class Logistic Regression

In [17]:
# 10-fold Cross Validation is not carried out for 41648 features

#### 90-10 holdout

In [18]:
# 90/10 holdout split of the text features; the fixed seed keeps the split repeatable.
holdout_split = train_test_split(
    X_train,
    meta_Y_train,
    test_size=0.1,
    random_state=20,
)
mlr_X_train, mlr_X_test, mlr_Y_train, mlr_Y_test = holdout_split

In [19]:
# Fit logistic regression on the 90% training portion of the holdout split.
mlr_holdout = mlr.fit(X=mlr_X_train, y=mlr_Y_train)

In [20]:
# Score the holdout model on the 10% it did not see during fitting.
mlr_Y_predict = mlr_holdout.predict(mlr_X_test)
holdout_accuracy = accuracy_score(mlr_Y_test, mlr_Y_predict)
print(f"Accuracy Score of Multi-class Logistic Regression using 41648 features is ", holdout_accuracy)

Accuracy Score of Multi-class Logistic Regression using 41648 features is  0.8457427858924118


#### Prediction of X_test

In [21]:
# Refit logistic regression on the full training matrix before predicting the test set.
text_mlr_classifier = mlr.fit(X=X_train, y=meta_Y_train)

In [22]:
# Predicted ratings for the test reviews from the full-data logistic regression model.
text_mlr_prediction = text_mlr_classifier.predict(X=X_test)

In [23]:
# Display the predicted ratings for a quick visual check.
text_mlr_prediction

array([5, 5, 1, ..., 3, 5, 5], dtype=int64)

In [24]:
# Number of predictions produced for the test matrix.
len(text_mlr_prediction)

7018

In [25]:
# Store predictions in .csv
# Instance ids are 1-based and sized from the predictions actually being written
# (text_mlr_prediction), not from the meta-feature model's output.
result = pd.DataFrame(np.arange(1, len(text_mlr_prediction) + 1), columns = ['Instance_id'])
result['rating'] = text_mlr_prediction

result.to_csv("MulticlassLR_text_prediction.csv", index = False)

# Feature Selection

## 50, 100, 200 features using Chi-Squared

In [26]:
# Each selector is fitted on the training matrix and labels only; the same fitted
# selector then reduces both the training and the test matrix.

# 50 features
x2_50features = SelectKBest(score_func=chi2, k=50).fit(X_train, meta_Y_train)
X_train_50_x2, X_test_50_x2 = map(x2_50features.transform, (X_train, X_test))

# 100 features
x2_100features = SelectKBest(score_func=chi2, k=100).fit(X_train, meta_Y_train)
X_train_100_x2, X_test_100_x2 = map(x2_100features.transform, (X_train, X_test))

# 200 features
x2_200features = SelectKBest(score_func=chi2, k=200).fit(X_train, meta_Y_train)
X_train_200_x2, X_test_200_x2 = map(x2_200features.transform, (X_train, X_test))

## 50, 100, 200 features using Mutual Information

In [27]:
# 50 features
# Rank features by mutual information with the label, fitted on training data only.
mi_50features = SelectKBest(mutual_info_classif, k=50).fit(X_train, meta_Y_train)
X_train_50_mi, X_test_50_mi = map(mi_50features.transform, (X_train, X_test))

In [28]:
# 100 features
# Rank features by mutual information with the label, fitted on training data only.
mi_100features = SelectKBest(mutual_info_classif, k=100).fit(X_train, meta_Y_train)
X_train_100_mi, X_test_100_mi = map(mi_100features.transform, (X_train, X_test))

In [29]:
# 200 features
# Rank features by mutual information with the label, fitted on training data only.
mi_200features = SelectKBest(mutual_info_classif, k=200).fit(X_train, meta_Y_train)
X_train_200_mi, X_test_200_mi = map(mi_200features.transform, (X_train, X_test))

## Multinomial Naive Bayes

#### ii) 50, 100, 200 features according to Chi-Squared

In [30]:
# Multinomial NB on the chi-squared-selected features.
# Every configuration fits its own clone of `mnb`, so the three classifier names refer
# to three distinct fitted models instead of all aliasing one repeatedly refitted object.
# The sparse matrices are passed as-is; MultinomialNB does not need .todense().
# NOTE(review): the selectors were fitted on all of X_train before this CV, so the fold
# accuracies may be optimistic; a Pipeline would re-select features inside each fold.

# Using 50 features
text_mnb_classifier_50_x2 = sklearn.base.clone(mnb).fit(X_train_50_x2, meta_Y_train)
text_mnb_accuracy_scores_50_x2 = cross_val_score(mnb, X_train_50_x2, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multinomial NB average accuracy using 50 features according to chi-squared is ",text_mnb_accuracy_scores_50_x2.mean())

# Using 100 features
text_mnb_classifier_100_x2 = sklearn.base.clone(mnb).fit(X_train_100_x2, meta_Y_train)
text_mnb_accuracy_scores_100_x2 = cross_val_score(mnb, X_train_100_x2, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multinomial NB average accuracy using 100 features according to chi-squared is ",text_mnb_accuracy_scores_100_x2.mean())

# Using 200 features
text_mnb_classifier_200_x2 = sklearn.base.clone(mnb).fit(X_train_200_x2, meta_Y_train)
text_mnb_accuracy_scores_200_x2 = cross_val_score(mnb, X_train_200_x2, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multinomial NB average accuracy using 200 features according to chi-squared is ",text_mnb_accuracy_scores_200_x2.mean())


Multinomial NB average accuracy using 50 features according to chi-squared is  0.7803910953702191
Multinomial NB average accuracy using 100 features according to chi-squared is  0.7961027707688318
Multinomial NB average accuracy using 200 features according to chi-squared is  0.812705165606501


#### iii) 50,100,200 features according to Mutual Information

In [31]:
# Multinomial NB on the mutual-information-selected features.
# Every configuration fits its own clone of `mnb`, so the three classifier names refer
# to three distinct fitted models instead of all aliasing one repeatedly refitted object.
# The sparse matrices are passed as-is; MultinomialNB does not need .todense().
# NOTE(review): the selectors were fitted on all of X_train before this CV, so the fold
# accuracies may be optimistic; a Pipeline would re-select features inside each fold.

# Using 50 features
text_mnb_classifier_50_mi = sklearn.base.clone(mnb).fit(X_train_50_mi, meta_Y_train)
text_mnb_accuracy_scores_50_mi = cross_val_score(mnb, X_train_50_mi, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multinomial NB average accuracy using 50 features according to mutual information is ",text_mnb_accuracy_scores_50_mi.mean())

# Using 100 features
text_mnb_classifier_100_mi = sklearn.base.clone(mnb).fit(X_train_100_mi, meta_Y_train)
text_mnb_accuracy_scores_100_mi = cross_val_score(mnb, X_train_100_mi, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multinomial NB average accuracy using 100 features according to mutual information is ",text_mnb_accuracy_scores_100_mi.mean())

# Using 200 features
text_mnb_classifier_200_mi = sklearn.base.clone(mnb).fit(X_train_200_mi, meta_Y_train)
text_mnb_accuracy_scores_200_mi = cross_val_score(mnb, X_train_200_mi, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multinomial NB average accuracy using 200 features according to mutual information is ",text_mnb_accuracy_scores_200_mi.mean())


Multinomial NB average accuracy using 50 features according to mutual information is  0.7888344889735747
Multinomial NB average accuracy using 100 features according to mutual information is  0.8012331837141694
Multinomial NB average accuracy using 200 features according to mutual information is  0.8159474798392472


## Multi-class Logistic Regression

#### i) 50, 100, 200 features according to Chi-Squared

In [32]:
# Multi-class logistic regression on the chi-squared-selected features.
# Every configuration fits its own clone of `mlr`, so the three classifier names refer
# to three distinct fitted models, and the full-vocabulary model fitted earlier through
# `mlr` is not overwritten. The sparse matrices are passed as-is instead of .todense().
# NOTE(review): the selectors were fitted on all of X_train before this CV, so the fold
# accuracies may be optimistic; a Pipeline would re-select features inside each fold.

# Using 50 features
text_mlr_classifier_50_x2 = sklearn.base.clone(mlr).fit(X_train_50_x2, meta_Y_train)
text_mlr_accuracy_scores_50_x2 = cross_val_score(mlr, X_train_50_x2, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multi-class Logistic Regression average accuracy using 50 features according to chi-squared is ",text_mlr_accuracy_scores_50_x2.mean())

# Using 100 features
text_mlr_classifier_100_x2 = sklearn.base.clone(mlr).fit(X_train_100_x2, meta_Y_train)
text_mlr_accuracy_scores_100_x2 = cross_val_score(mlr, X_train_100_x2, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multi-class Logistic Regression average accuracy using 100 features according to chi-squared is ",text_mlr_accuracy_scores_100_x2.mean())

# Using 200 features
text_mlr_classifier_200_x2 = sklearn.base.clone(mlr).fit(X_train_200_x2, meta_Y_train)
text_mlr_accuracy_scores_200_x2 = cross_val_score(mlr, X_train_200_x2, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multi-class Logistic Regression average accuracy using 200 features according to chi-squared is ",text_mlr_accuracy_scores_200_x2.mean())


Multi-class Logistic Regression average accuracy using 50 features according to chi-squared is  0.7878014083008547
Multi-class Logistic Regression average accuracy using 100 features according to chi-squared is  0.8034776743103039
Multi-class Logistic Regression average accuracy using 200 features according to chi-squared is  0.8238567363284082


#### ii) 50,100,200 features according to Mutual Information

In [33]:
# Multi-class logistic regression on the mutual-information-selected features.
# Every configuration fits its own clone of `mlr`, so the three classifier names refer
# to three distinct fitted models, and the full-vocabulary model fitted earlier through
# `mlr` is not overwritten. The sparse matrices are passed as-is instead of .todense().
# NOTE(review): the selectors were fitted on all of X_train before this CV, so the fold
# accuracies may be optimistic; a Pipeline would re-select features inside each fold.

# Using 50 features
text_mlr_classifier_50_mi = sklearn.base.clone(mlr).fit(X_train_50_mi, meta_Y_train)
text_mlr_accuracy_scores_50_mi = cross_val_score(mlr, X_train_50_mi, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multi-class Logistic Regression average accuracy using 50 features according to mutual information is ",text_mlr_accuracy_scores_50_mi.mean())

# Using 100 features
text_mlr_classifier_100_mi = sklearn.base.clone(mlr).fit(X_train_100_mi, meta_Y_train)
text_mlr_accuracy_scores_100_mi = cross_val_score(mlr, X_train_100_mi, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multi-class Logistic Regression average accuracy using 100 features according to mutual information is ",text_mlr_accuracy_scores_100_mi.mean())

# Using 200 features
text_mlr_classifier_200_mi = sklearn.base.clone(mlr).fit(X_train_200_mi, meta_Y_train)
text_mlr_accuracy_scores_200_mi = cross_val_score(mlr, X_train_200_mi, meta_Y_train, cv = 10, scoring ='accuracy')
print("Multi-class Logistic Regression average accuracy using 200 features according to mutual information is ",text_mlr_accuracy_scores_200_mi.mean())


Multi-class Logistic Regression average accuracy using 50 features according to mutual information is  0.7920767270297934
Multi-class Logistic Regression average accuracy using 100 features according to mutual information is  0.8109240060423222
Multi-class Logistic Regression average accuracy using 200 features according to mutual information is  0.8292366400971403
