# Sentiment analysis in textual movie reviews

### Load data

In [5]:
import os.path as op
import re
import numpy as np

In [4]:
print("Loading dataset")

from glob import glob

# Collect the review files of each polarity class. Sorting makes the document
# order deterministic, since glob() itself returns names in arbitrary order.
filenames_neg, filenames_pos = (
    sorted(glob(op.join('data', 'imdb1', polarity, '*.txt')))
    for polarity in ('neg', 'pos')
)

def open_perso(f):
    """Read a text file and return its whole content as one string.

    Parameters
    ----------
    f : str
        Path of the file to read.

    Returns
    -------
    str
        The complete content of the file, decoded with the platform's
        default text encoding (no explicit encoding is passed to ``open``).
    """
    with open(f, "r") as handle:
        content = handle.read()
    return content

# Read every review into memory. Negative reviews are placed first so that
# their labels can be set below with a single slice assignment.
texts_neg = [open_perso(f) for f in filenames_neg]
texts_pos = [open_perso(f) for f in filenames_pos]
texts = texts_neg + texts_pos

# Labels: 1 for positive reviews, 0 for negative ones.
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `int` is the exact equivalent dtype and works on every version.
y = np.ones(len(texts), dtype=int)
y[:len(texts_neg)] = 0

print("%d documents" % len(texts))

Loading dataset
2000 documents


## Implementation of the classifier

### 1. Complete the count_words function so that it counts the number of occurrences of each distinct word in a list of strings and returns the vocabulary (a Python dictionary) and the counts

In [77]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Punctuation and line breaks are replaced by spaces, then each text is
    split on single spaces; empty tokens are discarded. Words are
    case-sensitive and are indexed in order of first appearance.

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text (float64, as created by
        ``np.zeros``).
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    # Raw string: the previous non-raw literal contained invalid escape
    # sequences (``\-``, ``\_``, ``\@``, ``\$``, ``\&``), which emit a
    # SyntaxWarning on recent Python versions. The matched character set is
    # unchanged.
    punctuation = re.compile(r"[\n\r\-_@$&,:;.!?]")

    # Tokenize each document exactly once and reuse the tokens for both the
    # vocabulary pass and the counting pass.
    tokenized = [
        [word for word in punctuation.sub(" ", text).split(" ") if word]
        for text in texts
    ]

    # Assign each new word the next free column index.
    vocabulary = {}
    for tokens in tokenized:
        for word in tokens:
            vocabulary.setdefault(word, len(vocabulary))

    # Count words by document and store the result in a dense array.
    counts = np.zeros((len(tokenized), len(vocabulary)))
    for row, tokens in enumerate(tokenized):
        for word in tokens:
            counts[row, vocabulary[word]] += 1

    return vocabulary, counts


In [82]:
# Build the vocabulary and the document-term count matrix for the whole corpus.
vocab, counts = count_words(texts)
print("Vocab size:", len(vocab))
# Show the first 1000 (word, index) pairs ordered by column index, i.e. in
# order of first appearance in the corpus.
sorted(vocab.items(), key=lambda item: item[1])[:1000]

Vocab size: 44062


[('plot', 0),
 ('two', 1),
 ('teen', 2),
 ('couples', 3),
 ('go', 4),
 ('to', 5),
 ('a', 6),
 ('church', 7),
 ('party', 8),
 ('drink', 9),
 ('and', 10),
 ('then', 11),
 ('drive', 12),
 ('they', 13),
 ('get', 14),
 ('into', 15),
 ('an', 16),
 ('accident', 17),
 ('one', 18),
 ('of', 19),
 ('the', 20),
 ('guys', 21),
 ('dies', 22),
 ('but', 23),
 ('his', 24),
 ('girlfriend', 25),
 ('continues', 26),
 ('see', 27),
 ('him', 28),
 ('in', 29),
 ('her', 30),
 ('life', 31),
 ('has', 32),
 ('nightmares', 33),
 ("what's", 34),
 ('deal', 35),
 ('watch', 36),
 ('movie', 37),
 ('"', 38),
 ('sorta', 39),
 ('find', 40),
 ('out', 41),
 ('critique', 42),
 ('mind', 43),
 ('fuck', 44),
 ('for', 45),
 ('generation', 46),
 ('that', 47),
 ('touches', 48),
 ('on', 49),
 ('very', 50),
 ('cool', 51),
 ('idea', 52),
 ('presents', 53),
 ('it', 54),
 ('bad', 55),
 ('package', 56),
 ('which', 57),
 ('is', 58),
 ('what', 59),
 ('makes', 60),
 ('this', 61),
 ('review', 62),
 ('even', 63),
 ('harder', 64),
 ('write', 

### 2. Explain how positive and negative classes have been assigned to movie reviews (see poldata.README.2.0 file)

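The classes were assigned automatically from the explicit rating the reviewer gave in the original review (for example a star count or a letter grade): fixed thresholds on that rating mark a review as positive or negative, and the exact rules are listed in the README.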