In [92]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize

# Classifiers
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
'''
Read in the data
'''


def read_dataset(path):
    return pd.read_table(path, sep='\t', lineterminator='\n', header=None, names=['Review', 'Label'])


# Yelp datsets
yelp_train = read_dataset(r'./Datasets/yelp-train.txt')
yelp_valid = read_dataset(r'./Datasets/yelp-valid.txt')
yelp_test = read_dataset(r'./Datasets/yelp-test.txt')

# IMDB datasets
imdb_train = read_dataset(r'./Datasets/IMDB-train.txt')
imdb_valid = read_dataset(r'./Datasets/IMDB-valid.txt')
imdb_test = read_dataset(r'./Datasets/imdb-test.txt')

# Aggregate the datasets for easier processing
datasets = {
    'yelp': {'train': yelp_train, 'valid': yelp_valid, 'test': yelp_test},
    'imdb': {'train': imdb_train, 'valid': imdb_valid, 'test': imdb_test}
}

# Group sets
training = {'yelp': yelp_train, 'imdb': imdb_train}
valid = {'yelp': yelp_valid, 'imdb': imdb_valid}
test = {'yelp': yelp_test, 'imdb': imdb_test}

# Sanity Check
for name, training_set in training.items():
    print(name.upper(), 'size:', str(len(training_set)))
    print(training_set.head(), '\n', '-' * 50)

YELP size: 7000
                                              Review  Label
0  I can't believe I haven't yelped about the pla...      5
1  Best nights to go to Postino's are Mondays and...      5
2  Went here tonight with the padres and husband....      5
3  I must be spoiled and realize that this is not...      3
4  Normally, love this store & have been a member...      2 
 --------------------------------------------------
IMDB size: 15000
                                              Review  Label
0  For a movie that gets no respect there sure ar...      1
1  Bizarre horror movie filled with famous faces ...      1
2  A solid, if unremarkable film. Matthau, as Ein...      1
3  It's a strange feeling to sit alone in a theat...      1
4  You probably all already know this by now, but...      1 
 --------------------------------------------------


In [93]:
'''
Question 1: Prepare the data
'''
M_FEATURES = 10000

# Preprocess the sets by removing punctuation and converting to lowercase:   
for dataset in datasets.values():
    for dataset_type in dataset.values():
        dataset_type['Review'] = dataset_type['Review'].str.replace('<br /><br />', ' ').str.replace('[^\w\s]', '').str.lower()


# Verify preprocessing
for name, training_set in training.items():
    print(name.upper(), 'size:', str(len(training_set)))
    print(training_set.head(), '\n', '-' * 50)

YELP size: 7000
                                              Review  Label
0  i cant believe i havent yelped about the place...      5
1  best nights to go to postinos are mondays and ...      5
2  went here tonight with the padres and husband ...      5
3  i must be spoiled and realize that this is not...      3
4  normally love this store  have been a member f...      2 
 --------------------------------------------------
IMDB size: 15000
                                              Review  Label
0  for a movie that gets no respect there sure ar...      1
1  bizarre horror movie filled with famous faces ...      1
2  a solid if unremarkable film matthau as einste...      1
3  its a strange feeling to sit alone in a theate...      1
4  you probably all already know this by now but ...      1 
 --------------------------------------------------


In [94]:
# Get the frequencies in descending order (TRAINING SETS ONLY)
most_common = {}
for dataset in training:
    all_words_list = [word for sentence in training[dataset]['Review'].str.split().tolist() for word in sentence]
    top_k = Counter(all_words_list).most_common(M_FEATURES)
    most_common[dataset] = {word[0]: i for i, word in enumerate(top_k)}
    # Write to file for submission
    vocab = pd.DataFrame(top_k)
    vocab[2] = np.arange(0, M_FEATURES)  # These are the word IDs
    vocab.to_csv('./Submission/' + dataset + '-vocab.txt', sep='\t', header=False, index=False, columns=[0, 2, 1])

In [91]:
# Binary and frequency bag-of-words representation
yelp_binary = {}
yelp_frequency = {}
yelp_vectorizer_bin = CountVectorizer(vocabulary=most_common['yelp'], binary=True)
yelp_vectorizer_freq = CountVectorizer(vocabulary=most_common['yelp'])
# Loop over training, test, and valid sets for yelp and convert to bag-of-words
for dataset in datasets['yelp']:
    yelp_binary[dataset] = yelp_vectorizer_bin.fit_transform(datasets['yelp'][dataset]['Review'].values)
    yelp_frequency[dataset] = yelp_vectorizer_freq.fit_transform(datasets['yelp'][dataset]['Review'].values)
# Do the same for IMDB
imdb_binary = {}
imdb_frequency = {}
imdb_vectorizer_bin = CountVectorizer(vocabulary=most_common['imdb'], binary=True)
imdb_vectorizer_freq = CountVectorizer(vocabulary=most_common['imdb'])
for dataset in datasets['imdb']:
    imdb_binary[dataset] = imdb_vectorizer_bin.fit_transform(datasets['imdb'][dataset]['Review'].values)
    imdb_frequency[dataset] = imdb_vectorizer_freq.fit_transform(datasets['imdb'][dataset]['Review'].values)

MemoryError: 

In [86]:
# Write to file for submission
def write_converted_dataset(name):
    for dataset in datasets[name]:
        with open('./Submission/' + name + '-' + dataset + '.txt', 'w') as file:
            for i in range(len(datasets[name][dataset])):
                file.write(' '.join([str(most_common[name][word]) for word in datasets[name][dataset].iloc[i, 0].split() 
                                     if word in most_common[name]]) + '\t' + str(datasets[name][dataset].iloc[i, 1]) + '\n')
                
                
write_converted_dataset('yelp')
write_converted_dataset('imdb')

In [87]:
'''
Question 2: Yelp Dataset, binary bag-of-words
'''


def random_classifier(labels, test_length):
    return np.random.choice(labels, test_length)


def majority_class_classifier(y_train_labels, test_length):
    most_freq = y_train_labels.value_counts().idxmax()
    return np.full(test_length, most_freq)


# Baseline F1 Scores for random and majority class classifiers 
pred_rand = random_classifier([1, 2, 3, 4, 5], len(yelp_test))
score_rand = f1_score(yelp_test['Label'], pred_rand, average='micro')

pred_majority = majority_class_classifier(yelp_train['Label'], len(yelp_test))
score_majority = f1_score(yelp_test['Label'], pred_majority, average='micro')

print('BASELINE SCORES')
print('Random Classifier:', score_rand)
print('Majority Classifier:', score_majority)

BASELINE SCORES
Random Classifier: 0.18249999999999997
Majority Classifier: 0.351


In [89]:
def do_clf_test(clf, X_train, Y_train, X_test, Y_test):
    clf.fit(X_train, Y_train)
    pred = clf.predict(X_test)
    return f1_score(Y_test, pred, average='macro')


# print('Yelp_test Score:', do_clf_test(GaussianNB(), yelp_binary['train'], yelp_train['Label'], yelp_binary['test'], yelp_test['Label']))
print('IMDB_test Score:', do_clf_test(GaussianNB(), imdb_frequency['train'], imdb_train['Label'], imdb_frequency['test'], imdb_test['Label']))

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.