In [1]:
# Base libraries
import numpy as np
import os
import pandas as pd
import scipy

# Processing & feature generation
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import normalize

# Classifiers
from sklearn.naive_bayes import BernoulliNB, GaussianNB # NOTE: You will require Python3 64-bit kernel to run GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

## Dataset Summary:
### Yelp:
        Training set: 7000
        Valid set: 1000
        Test set: 2000

        Type: 5 class classification problem (1:worst - 5:best)

### IMDB:
        Training set: 15000
        Valid set: 10000
        Test set: 25000

        Type: 2 class problem (1: positive, 0: negative)

In [2]:
# Constants
PATH = r'./Datasets'
M_FEATURES = 10000

## Generalized methods for all datasets

In [3]:
def read_dataset(filename, column_names=['Review', 'Label']):
    '''
    Parses the file into a pandas dataframe object.
    :param filename: The file to be parsed
    :param column_names: The desired column names of the dataframe
    :return: The file as a dataframe object
    '''
    return pd.read_table(os.path.join(PATH, filename), sep='\t', lineterminator='\n', header=None, names=column_names)

In [4]:
def preprocess(dataframe, column):
    '''
    Preprocesses the reviews by stripping away all non-word, non-space characters. Additionally removes <br /> tags for IMDB set
    :param dataframe: The dataframe object
    :param column: The column to be preprocessed. This will be 'Review' for this assignment.
    :return: None
    '''
    dataframe[column] = dataframe[column].str.replace('<br /><br />', ' ').str.replace('[^\w\s]', '').str.lower()

In [5]:
def get_vocabulary(training_dict, column, features=M_FEATURES, save_to_file=False):
    '''
    For each training set in training_dict, return the corresponding vocabulary to be used as the feature set.
    The method first counts the frequencies of all words across each training set, and then chooses the top most
    frequent words as the feature set.
    :param training_dict: The dictionary of training sets. We have yelp and IMDB training sets.
    :param column: The column to get the words from. This is 'Review'
    :param features: The number of top most frequent features to be used
    :param save_to_file: If True, saves the feature set, along with the frequencies and IDs to their corresponding file
                         as required by the assignment
    :return: Dictionary of vocabularies for both yelp and IMDB
    '''
    most_common = {}
    for dataset in training_dict:
        all_words_list = [word for sentence in training_dict[dataset][column].str.split().tolist() for word in sentence]
        top_k = Counter(all_words_list).most_common(features)
        most_common[dataset] = {word[0]: i for i, word in enumerate(top_k)}

        if save_to_file:
            # Write to file for submission
            vocab = pd.DataFrame(top_k)
            vocab[2] = np.arange(0, features)  # These are the word IDs
            vocab.to_csv('./Submission/' + dataset + '-vocab.txt', sep='\t', header=False, index=False, columns=[0, 2, 1])

    return most_common

In [6]:
def bag_of_words(datasets, vocabulary, xname='Review', yname='Label'):
    '''
    Converts each dataset in datasets to both binary and frequency bag of words representations.
    :param datasets: Dictionary of datasets to be converted
    :param vocabulary: The vocabulary extracted by :func:`get_vocabulary`
    :param xname: The name of the feature column ('Review')
    :param yname: The name of the label column ('Label')
    :return: Binary and frequency BoW dictionaries. Each dictionary has keys corresponding to the keys of datasets.
    '''
    binary_bow = {}
    freq_bow = {}
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    for name in datasets:
        vec = vectorizer.fit_transform(datasets[name][xname])
        freq_bow[name] = [normalize(vec), datasets[name][yname]]
        vec[vec > 1] = 1
        binary_bow[name] = [vec, datasets[name][yname]]

    return binary_bow, freq_bow

In [7]:
def write_converted_dataset(datasets, vocab_dict, dataset_name):
    '''
    Replaces the words of the datasets by their unique IDs from the vocabulary, as required by the assignment. It then
    writes the converted datasets to file.
    :param datasets: Dictionary containing the datasets to be converted
    :param vocab_dict: The vocabulary dictionary obtained from :func:`get_vocabulary`
    :param dataset_name: 'yelp' or 'imdb'
    :return: None
    '''
    for dataset in datasets[dataset_name]:
        with open('./Submission/' + dataset_name + '-' + dataset + '.txt', 'w') as file:
            for i in range(len(datasets[dataset_name][dataset])):
                file.write(' '.join([str(vocab_dict[dataset_name][word]) for word in datasets[dataset_name][dataset].iloc[i, 0].split()
                                     if word in vocab_dict[dataset_name]]) + '\t' + str(datasets[dataset_name][dataset].iloc[i, 1]) + '\n')

In [8]:
def random_classifier(train_on, predict_on):
    '''
    Classifies predict_on into a random class.
    :param train_on: The training set
    :param predict_on: The test set
    :return: Predicted labels of the test set
    '''
    return np.random.choice(np.unique(train_on[1]), len(predict_on[1]))

In [9]:
def majority_class_classifier(train_on, predict_on):
    '''
    Classifies predict_on into the majority class (mode) of the training set.
    :param train_on: The training set
    :param predict_on: The test set
    :return: Predicted labels of the test set
    '''
    return np.full(len(predict_on[1]), scipy.stats.mode(train_on[1])[0][0])

In [10]:
def print_format(s, *args):
    '''
    Helper to print formatted strings
    :param s: The unformatted string with placeholders
    :param args: The args that go into the placeholders of s
    :return: None
    '''
    print(s.format(*args))

In [11]:
def do_clf_test(clf, dataset_dict, tune_params=None, tune=True, average='micro'):
    '''
    Fits the classifier clf to the training set, and predicts on the test set. Prints the F1 scores for each set.
    :param clf: An sklearn classifier or :func:`random_classifier` or :func:`majority_class_classifier`
    :param dataset_dict: Dictionary of datasets to predict on (must contain keys 'train', 'valid', 'test')
    :param tune_params: If not None, will tune the parameters on the validation set before predicting on test. Tuning
                        is done using sklearn's GridSearchCV
    :param tune: Must be true in addition to tune_params not being None to perform tuning
    :param average: The average paramters of sklearn's f1_score. Defaults to 'micro'
    :return: None
    '''
    name = clf.__name__.upper() if callable(clf) else clf.__class__.__name__.upper()
    print_format('\tScores using {}:', name)

    if not callable(clf):
        train_x = dataset_dict['train'][0]
        train_y = dataset_dict['train'][1]
        if tune and tune_params is not None:
            # Set up GridSearch with validation set
            valid_x, valid_y = dataset_dict['valid'][0], dataset_dict['valid'][1]
            ps = PredefinedSplit(test_fold=[-1 if i < len(train_y) else 0 for i in range(len(train_y) + len(valid_y))])
            clf = GridSearchCV(clf, tune_params, cv=ps, n_jobs=2)
            train_x = scipy.sparse.vstack([train_x, valid_x])
            train_y = np.concatenate([train_y, valid_y])

        clf.fit(train_x, train_y)
        if tune and tune_params is not None: print('\t\tBest params:', clf.best_params_)

    for dname, dset in dataset_dict.items():
        score = f1_score(dset[1], clf(dataset_dict['train'], dset) if callable(clf) else clf.predict(dset[0]), average=average)
        print_format('\t\t{}: {}', dname.upper(), score)

In [12]:
# Useful flags
WRITE = True # If true, write to file
PERFORM_TUNING = True

## Read in the datasets using Pandas

In [13]:
# Yelp datsets
yelp_train = read_dataset('yelp-train.txt')
yelp_valid = read_dataset('yelp-valid.txt')
yelp_test = read_dataset('yelp-test.txt')

# IMDB datasets
imdb_train = read_dataset('IMDB-train.txt')
imdb_valid = read_dataset('IMDB-valid.txt')
imdb_test = read_dataset('IMDB-test.txt')

# All sets
datasets = {
    'yelp': {'train': yelp_train, 'valid': yelp_valid, 'test': yelp_test},
    'imdb': {'train': imdb_train, 'valid': imdb_valid, 'test': imdb_test}
}

# Group sets
training = {'yelp': yelp_train, 'imdb': imdb_train}
valid = {'yelp': yelp_valid, 'imdb': imdb_valid}
test = {'yelp': yelp_test, 'imdb': imdb_test}

In [14]:
# Sanity Check
print('CHECK DATA:')
for name, training_set in training.items():
    print(name.upper(), 'size:', str(len(training_set)))
    print(training_set.head(), '\n', '-' * 80)

CHECK DATA:
YELP size: 7000
                                              Review  Label
0  I can't believe I haven't yelped about the pla...      5
1  Best nights to go to Postino's are Mondays and...      5
2  Went here tonight with the padres and husband....      5
3  I must be spoiled and realize that this is not...      3
4  Normally, love this store & have been a member...      2 
 --------------------------------------------------------------------------------
IMDB size: 15000
                                              Review  Label
0  For a movie that gets no respect there sure ar...      1
1  Bizarre horror movie filled with famous faces ...      1
2  A solid, if unremarkable film. Matthau, as Ein...      1
3  It's a strange feeling to sit alone in a theat...      1
4  You probably all already know this by now, but...      1 
 --------------------------------------------------------------------------------


## Question 1: Convert both datasets in binary and frequency BoW

### Preprocess the data

In [15]:
for s in datasets.values():
    for df in s.values():
        preprocess(df, 'Review')

# Verify preprocessing
print('\nAFTER PREPROCESSING:')
for name, training_set in training.items():
    print(training_set.head(), '\n', '-' * 80)


AFTER PREPROCESSING:
                                              Review  Label
0  i cant believe i havent yelped about the place...      5
1  best nights to go to postinos are mondays and ...      5
2  went here tonight with the padres and husband ...      5
3  i must be spoiled and realize that this is not...      3
4  normally love this store  have been a member f...      2 
 --------------------------------------------------------------------------------
                                              Review  Label
0  for a movie that gets no respect there sure ar...      1
1  bizarre horror movie filled with famous faces ...      1
2  a solid if unremarkable film matthau as einste...      1
3  its a strange feeling to sit alone in a theate...      1
4  you probably all already know this by now but ...      1 
 --------------------------------------------------------------------------------


### Bag of words

In [16]:
# Generate vocabulary for yelp and imdb datasets from training data, and write to file
vocabulary = get_vocabulary(training, 'Review', M_FEATURES, WRITE)

# Write converted datasets to file
if WRITE:
    write_converted_dataset(datasets, vocabulary, 'yelp')
    write_converted_dataset(datasets, vocabulary, 'imdb')
    
# Shuffle data
for s in datasets.values():
    for key in s:
        s[key] = s[key].sample(frac=1).reset_index(drop=True)

# Bag of words. _binary is for binary BoW. _freq is for frequency BoW
yelp_binary, yelp_freq = bag_of_words(datasets['yelp'], vocabulary['yelp'])
imdb_binary, imdb_freq = bag_of_words(datasets['imdb'], vocabulary['imdb'])

## Question 2: Yelp Binary BoW f1 scores on random uniform classifier, majority-class classifier, Naive Bayes, Decision Trees, LinearSVM w/ hyperparameter tuning using GridSearchCV

In [17]:
print('Yelp Binary Bag of Words Performances')
do_clf_test(random_classifier, yelp_binary) # Random uniform classifier
do_clf_test(majority_class_classifier, yelp_binary) # Majority class classifier

Yelp Binary Bag of Words Performances
	Scores using RANDOM_CLASSIFIER:
		TRAIN: 0.20385714285714285
		VALID: 0.20999999999999996
		TEST: 0.197
	Scores using MAJORITY_CLASS_CLASSIFIER:
		TRAIN: 0.3525714285714286
		VALID: 0.356
		TEST: 0.351


In [18]:
# BernoulliNB
params = {'alpha': np.arange(0.01, 1.01, 0.01)}
do_clf_test(BernoulliNB(), yelp_binary, params, tune=PERFORM_TUNING)

	Scores using BERNOULLINB:
		Best params: {'alpha': 0.02}
		TRAIN: 0.7281428571428571
		VALID: 0.672
		TEST: 0.4355


In [19]:
# Decision Tree
params = {'max_depth': np.arange(13, 17),
          'max_features': np.arange(0.1, 0.5, 0.1),
          'min_samples_leaf': np.arange(3, 6)}
do_clf_test(DecisionTreeClassifier(), yelp_binary, params, tune=PERFORM_TUNING)

	Scores using DECISIONTREECLASSIFIER:
		Best params: {'max_depth': 16, 'max_features': 0.30000000000000004, 'min_samples_leaf': 5}
		TRAIN: 0.5712857142857143
		VALID: 0.594
		TEST: 0.376


In [20]:
# Linear SVM
params = {'C': np.logspace(-2, 2, num=8),
          'max_iter': np.arange(10, 100, 10)}
do_clf_test(LinearSVC(), yelp_binary, params, tune=PERFORM_TUNING)

	Scores using LINEARSVC:
		Best params: {'C': 0.01, 'max_iter': 10}
		TRAIN: 0.8327142857142857
		VALID: 0.827
		TEST: 0.5085


## Question 3: Yelp Frequency BoW

In [21]:
print('Yelp Frequency Bag of Words Performances')
do_clf_test(random_classifier, yelp_freq)
do_clf_test(majority_class_classifier, yelp_freq)

Yelp Frequency Bag of Words Performances
	Scores using RANDOM_CLASSIFIER:
		TRAIN: 0.20257142857142857
		VALID: 0.193
		TEST: 0.191
	Scores using MAJORITY_CLASS_CLASSIFIER:
		TRAIN: 0.3525714285714286
		VALID: 0.356
		TEST: 0.351


In [22]:
# GaussianNB: Requires dense arrays
do_clf_test(GaussianNB(), {key: [value[0].toarray(), value[1]] for key, value in yelp_freq.items()})

	Scores using GAUSSIANNB:
		TRAIN: 0.747
		VALID: 0.278
		TEST: 0.284


In [23]:
# Decision Tree
params = {'max_depth': np.arange(13, 17),
          'max_features': np.arange(0.1, 0.5, 0.1),
          'min_samples_leaf': np.arange(3, 6)}
do_clf_test(DecisionTreeClassifier(), yelp_freq, params, tune=PERFORM_TUNING)

	Scores using DECISIONTREECLASSIFIER:
		Best params: {'max_depth': 13, 'max_features': 0.1, 'min_samples_leaf': 4}
		TRAIN: 0.549
		VALID: 0.519
		TEST: 0.38649999999999995


In [24]:
# Linear SVM
params = {'C': np.logspace(-2, 2, num=8),
          'max_iter': np.arange(10, 100, 10)}
do_clf_test(LinearSVC(), yelp_freq, params, tune=PERFORM_TUNING)

	Scores using LINEARSVC:
		Best params: {'C': 0.517947467923121, 'max_iter': 20}
		TRAIN: 0.7477142857142857
		VALID: 0.752
		TEST: 0.5345


## Question 4: Repeat Q2 and Q3 with IMDB

In [25]:
print('IMDB Binary Bag of Words Performances')
do_clf_test(random_classifier, imdb_binary)
# Majority class classifier doesn't make sense for IMDB since it is a balanced dataset

IMDB Binary Bag of Words Performances
	Scores using RANDOM_CLASSIFIER:
		TRAIN: 0.497
		VALID: 0.4997
		TEST: 0.49744


In [26]:
# BernoulliNB
params = {'alpha': np.arange(0.01, 1.01, 0.01)}
do_clf_test(BernoulliNB(), imdb_binary, params, tune=PERFORM_TUNING)

	Scores using BERNOULLINB:
		Best params: {'alpha': 0.11}
		TRAIN: 0.8692
		VALID: 0.8668
		TEST: 0.84036


In [27]:
# Decision Tree
params = {'max_depth': np.arange(13, 17),
          'max_features': np.arange(0.1, 0.5, 0.1),
          'min_samples_leaf': np.arange(3, 6)}
do_clf_test(DecisionTreeClassifier(), imdb_binary, params, tune=PERFORM_TUNING)

	Scores using DECISIONTREECLASSIFIER:
		Best params: {'max_depth': 16, 'max_features': 0.4, 'min_samples_leaf': 3}
		TRAIN: 0.8032
		VALID: 0.8105
		TEST: 0.7314


In [28]:
# Linear SVM
params = {'C': np.logspace(-2, 2, num=8),
          'max_iter': np.arange(10, 100, 10)}
do_clf_test(LinearSVC(), imdb_binary, params, tune=PERFORM_TUNING)

	Scores using LINEARSVC:
		Best params: {'C': 0.01, 'max_iter': 10}
		TRAIN: 0.9538666666666666
		VALID: 0.9523
		TEST: 0.87952


In [29]:
print('IMDB Frequency Bag of Words Performances')
do_clf_test(random_classifier, imdb_freq)

IMDB Frequency Bag of Words Performances
	Scores using RANDOM_CLASSIFIER:
		TRAIN: 0.5029333333333333
		VALID: 0.4964
		TEST: 0.50316


In [30]:
# GaussianNB: Requires dense arrays
# No parameters to tune
do_clf_test(GaussianNB(), {key: [value[0].toarray(), value[1]] for key, value in imdb_freq.items()})

	Scores using GAUSSIANNB:
		TRAIN: 0.8693333333333333
		VALID: 0.7673
		TEST: 0.70544


In [31]:
# Decision Tree
params = {'max_depth': np.arange(13, 17),
          'max_features': np.arange(0.1, 0.5, 0.1),
          'min_samples_leaf': np.arange(3, 6)}
do_clf_test(DecisionTreeClassifier(), imdb_freq, params, tune=PERFORM_TUNING)

	Scores using DECISIONTREECLASSIFIER:
		Best params: {'max_depth': 14, 'max_features': 0.4, 'min_samples_leaf': 5}
		TRAIN: 0.7824666666666666
		VALID: 0.7844
		TEST: 0.71776


In [32]:
# Linear SVM
params = {'C': np.logspace(-2, 2, num=8),
          'max_iter': np.arange(10, 100, 10)}
do_clf_test(LinearSVC(), imdb_freq, params, tune=PERFORM_TUNING)

	Scores using LINEARSVC:
		Best params: {'C': 1.9306977288832496, 'max_iter': 40}
		TRAIN: 0.9468666666666666
		VALID: 0.9421
		TEST: 0.88276
