In [1]:
from urllib import request
import tarfile

import pathlib as pl

from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.utils import simple_preprocess
from sklearn.utils import shuffle

In [2]:
# Location of the movie-review polarity archive and where it is cached locally.
ARCHIVE_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz'
ARCHIVE_PATH = pl.Path('review_polarity.tar.gz')

# Only hit the network when the archive is not already on disk, so that
# "Restart & Run All" does not re-download it every time.
if not ARCHIVE_PATH.exists():
    # Download under a temporary name and rename afterwards: an interrupted
    # transfer then never leaves a truncated file at the final path.
    partial_path = ARCHIVE_PATH.with_name(ARCHIVE_PATH.name + '.part')
    request.urlretrieve(ARCHIVE_URL, filename=str(partial_path))
    partial_path.replace(ARCHIVE_PATH)

with tarfile.open(str(ARCHIVE_PATH), 'r') as archive:
    # The archive was fetched over plain HTTP, so treat it as untrusted:
    # where this Python supports extraction filters, use the 'data' filter,
    # which refuses members that would land outside the target directory.
    if hasattr(tarfile, 'data_filter'):
        archive.extractall(filter='data')
    else:
        archive.extractall()

In [3]:
def _tag_review_files(directory, tag):
    """Tokenise every review file in ``directory`` and tag it with ``tag``.

    Args:
        directory: Directory whose regular, non-hidden files are each read
            as one review.
        tag: Value stored unchanged in the ``tags`` field of every
            ``TaggedDocument`` produced.

    Returns:
        A list of ``TaggedDocument`` objects, one per file, ordered by path.
    """
    documents = []
    # iterdir() yields entries in arbitrary order; sorting keeps the corpus
    # order stable between runs and machines.
    for review_path in sorted(pl.Path(directory).iterdir()):
        # Skip sub-directories and hidden files (e.g. editor/OS metadata)
        # so they are not mistaken for reviews.
        if review_path.name.startswith('.') or not review_path.is_file():
            continue
        # Explicit encoding: do not depend on the platform's default locale.
        text = review_path.read_text(encoding='utf-8')
        documents.append(TaggedDocument(words=simple_preprocess(text), tags=tag))
    return documents


def tag_reviews(data_dir='./txt_sentoken'):
    """Load the positive and negative reviews as tagged documents.

    Reads ``<data_dir>/pos`` and ``<data_dir>/neg``; every file becomes one
    ``TaggedDocument`` whose words come from ``simple_preprocess``.

    The tags are the bare strings ``'+'`` and ``'-'`` because
    ``model_reviews`` compares the ``tags`` attribute against exactly those
    strings.
    NOTE(review): gensim documents ``tags`` as a list; the one-character
    string form is kept on purpose for compatibility with that comparison.

    Args:
        data_dir: Root directory holding the ``pos`` and ``neg``
            sub-directories. Defaults to the location used by the notebook.

    Returns:
        A ``(pos, neg)`` tuple of lists of ``TaggedDocument``.
    """
    corpus_root = pl.Path(data_dir)
    pos = _tag_review_files(corpus_root / 'pos', '+')
    neg = _tag_review_files(corpus_root / 'neg', '-')
    return pos, neg

In [4]:
def _review_label(document):
    """Return the class label stored in a tagged review's ``tags`` field.

    Accepts both a bare string (``'+'``) and a sequence whose first element
    is the label (``['+']``).
    """
    tags = document.tags
    return tags if isinstance(tags, str) else tags[0]


def model_reviews(model_type, input_reviews, iter_rounds=10):
    """Train Doc2Vec repeatedly and print per-class classification ratios.

    Each round reshuffles the reviews, trains a fresh model on 800 positive
    and 80 negative reviews, and classifies the held-out reviews (the
    remaining positives plus up to 20 negatives) by inferring a vector for
    each one and taking the most similar document tag. Per-round ratios and
    their averages over all rounds are printed; nothing is returned.

    Args:
        model_type: ``'base'`` or ``'improved'``; selects the Doc2Vec
            hyper-parameters used in every round.
        input_reviews: Pair ``(positive, negative)`` of lists of tagged
            documents exposing ``.words`` and ``.tags``.
        iter_rounds: Number of shuffle/train/evaluate rounds to average over.

    Raises:
        ValueError: If ``model_type`` is unknown, ``iter_rounds`` is less
            than 1, there are too few reviews to leave a non-empty test
            split, or the test split lacks reviews tagged ``'+'`` or ``'-'``.
    """
    # Split sizes: the training set is class-imbalanced (800 vs 80).
    n_train_pos = 800
    n_neg_sample = 100
    n_train_neg = 80

    # Validate everything up front so a bad call fails immediately instead
    # of after shuffling/training (or with an opaque ZeroDivisionError).
    if model_type not in ('base', 'improved'):
        raise ValueError(
            "model_type must be 'base' or 'improved', got {0!r}".format(model_type))
    if iter_rounds < 1:
        raise ValueError(
            'iter_rounds must be at least 1, got {0!r}'.format(iter_rounds))

    pos_reviews, neg_reviews = input_reviews[0], input_reviews[1]
    if len(pos_reviews) <= n_train_pos or len(neg_reviews) <= n_train_neg:
        raise ValueError(
            'need more than {0} positive and {1} negative reviews, got {2} and {3}'.format(
                n_train_pos, n_train_neg, len(pos_reviews), len(neg_reviews)))

    cum_pos_accuracy = 0.0
    cum_neg_accuracy = 0.0

    for n in range(iter_rounds):

        pos_shuffled = shuffle(pos_reviews)
        neg_sample = shuffle(neg_reviews)[:n_neg_sample]

        train_tag_data = shuffle(pos_shuffled[:n_train_pos] + neg_sample[:n_train_neg])
        test_tag_data = shuffle(pos_shuffled[n_train_pos:] + neg_sample[n_train_neg:])

        # dm=0 selects gensim's PV-DBOW training in both configurations.
        if model_type == 'base':
            model = Doc2Vec(dbow_words=1, dm=0, epochs=20, min_count=1,
                            negative=10, sample=0.0005, vector_size=300)
        else:
            model = Doc2Vec(dbow_words=0, dm=0, hs=1, epochs=20, min_count=1,
                            sample=0.0005, vector_size=300)

        model.build_vocab(train_tag_data)
        model.train(train_tag_data, total_examples=model.corpus_count, epochs=model.epochs)

        # gensim 4 renamed `docvecs` to `dv`; fall back for older releases.
        doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

        totals = {'+': 0, '-': 0}
        correct = {'+': 0, '-': 0}

        for review in test_tag_data:
            label = _review_label(review)
            if label not in totals:
                continue
            inferred_vector = model.infer_vector(review.words)
            # Only the best match decides the prediction, so ask for one hit.
            predicted = doc_vectors.most_similar([inferred_vector], topn=1)[0][0]
            totals[label] += 1
            if predicted == label:
                correct[label] += 1

        if not totals['+'] or not totals['-']:
            raise ValueError("test split contains no reviews tagged '+' or '-'")

        pos_accuracy = correct['+'] / totals['+']
        neg_accuracy = correct['-'] / totals['-']

        print('Ratio of positive reviews classified correctly on round {0}: {1}'.format(n, pos_accuracy))
        print('Ratio of negative reviews classified correctly on round {0}: {1}\n\n'.format(n, neg_accuracy))

        cum_pos_accuracy += pos_accuracy
        cum_neg_accuracy += neg_accuracy

    avg_pos_accuracy = cum_pos_accuracy / iter_rounds
    avg_neg_accuracy = cum_neg_accuracy / iter_rounds

    print('Average ratio of positive reviews classified correctly:', avg_pos_accuracy)
    print('Average ratio of negative reviews classified correctly:', avg_neg_accuracy)

In [5]:
# Build the tagged corpus once; both experiment cells below pass it to model_reviews.
tagged_reviews = tag_reviews()

In [6]:
# 'base' run: Doc2Vec with dm=0, dbow_words=1 and negative=10 (see model_reviews).
model_reviews('base', tagged_reviews)

Ratio of positive reviews classified correctly on round 0: 0.945
Ratio of negative reviews classified correctly on round 0: 0.3


Ratio of positive reviews classified correctly on round 1: 0.945
Ratio of negative reviews classified correctly on round 1: 0.45


Ratio of positive reviews classified correctly on round 2: 0.97
Ratio of negative reviews classified correctly on round 2: 0.15


Ratio of positive reviews classified correctly on round 3: 0.865
Ratio of negative reviews classified correctly on round 3: 0.25


Ratio of positive reviews classified correctly on round 4: 0.95
Ratio of negative reviews classified correctly on round 4: 0.35


Ratio of positive reviews classified correctly on round 5: 0.925
Ratio of negative reviews classified correctly on round 5: 0.45


Ratio of positive reviews classified correctly on round 6: 0.935
Ratio of negative reviews classified correctly on round 6: 0.25


Ratio of positive reviews classified correctly on round 7: 0.955
Ratio of negative rev

In [7]:
# 'improved' run: Doc2Vec with dm=0, dbow_words=0 and hs=1 (see model_reviews).
model_reviews('improved', tagged_reviews)

Ratio of positive reviews classified correctly on round 0: 0.84
Ratio of negative reviews classified correctly on round 0: 0.65


Ratio of positive reviews classified correctly on round 1: 0.86
Ratio of negative reviews classified correctly on round 1: 0.6


Ratio of positive reviews classified correctly on round 2: 0.91
Ratio of negative reviews classified correctly on round 2: 0.55


Ratio of positive reviews classified correctly on round 3: 0.815
Ratio of negative reviews classified correctly on round 3: 0.55


Ratio of positive reviews classified correctly on round 4: 0.785
Ratio of negative reviews classified correctly on round 4: 0.7


Ratio of positive reviews classified correctly on round 5: 0.86
Ratio of negative reviews classified correctly on round 5: 0.7


Ratio of positive reviews classified correctly on round 6: 0.805
Ratio of negative reviews classified correctly on round 6: 0.7


Ratio of positive reviews classified correctly on round 7: 0.825
Ratio of negative reviews 