In [44]:
import os
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from bs4 import BeautifulSoup # used for HTML parsing
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/niccolosottile/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [45]:
def read_reviews(folder_path):
    """Reads the reviews in the folder path, storing label (derived from star rating) and content in a dictionary."""
    reviews = {}

    # Retrieve all review files in directory
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Verify file path is valid before reading
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

                # Extract star rating from filename (id_star.txt format)
                star_rating = int(filename.split('_')[-1].split('.')[0])
                # Assign labels given extracted star rating
                label = 1 if star_rating >= 7 else 0

                # Assign each review a dictionary of label and content
                reviews[filename] = {'content': content, 'label': label}

    return reviews

# Read positive and negative reviews
pos_reviews = read_reviews('../data/pos') 
neg_reviews = read_reviews('../data/neg') 

# Merge them in single dictionary
all_reviews = {}
all_reviews.update(pos_reviews)
all_reviews.update(neg_reviews)

In [46]:
def get_wordnet_pos(treebank_tag):
    """Converts the POS naming scheme from the Penn Treebank tag to a WordNet tag."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

stoplist = set(stopwords.words('english'))

def verify_stopwords_punctuation(token, a_stopwords, a_punctuation):
    """Verifies whether a token is a stopword or part of punctuation given filtering criteria."""
    return (not a_stopwords or not token in stoplist) and (not a_punctuation or not token in string.punctuation)

def preprocess_reviews(contents, a_stopwords = False, a_punctuation = False, a_stemming = False, a_lemmatization = False):
    """Applies tokenization, n-gram generation, and further preprocessing based on supplied criteria."""
    # Initialisation
    stemmer = LancasterStemmer()
    lemmatizer = WordNetLemmatizer()
    n_gram_size = 2 # not in parameters as not part of feature selection
    
    preprocessed_contents = []

    for content in contents:
        # Remove HTML tags (e.g. <br></br>)
        soup = BeautifulSoup(content, "html.parser")
        content = soup.get_text()

        # Apply tokenization
        tokens = nltk.word_tokenize(content)

        # Apply preprocessing based on criteria supplied
        if a_stemming:
            preprocessed_tokens = [stemmer.stem(token.lower()) for token in tokens if verify_stopwords_punctuation(token, a_stopwords, a_punctuation)]
        elif a_lemmatization:
            pos_tags = nltk.pos_tag(tokens)
            preprocessed_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)).lower() for token, pos in pos_tags if verify_stopwords_punctuation(token, a_stopwords, a_punctuation)]
        else:
            preprocessed_tokens = [token.lower() for token in tokens if verify_stopwords_punctuation(token, a_stopwords, a_punctuation)]

        # Generate n-grams (treating them as units due to _)
        n_grams = list(ngrams(preprocessed_tokens, n_gram_size))
        preprocessed_n_grams = ['_'.join(n_gram) for n_gram in n_grams]

        preprocessed_contents.append(' '.join(preprocessed_n_grams))
    
    return preprocessed_contents 

def generate_features(contents, params):
    """Generates a set of features using preprocessing and TF-IDF (or BoW if implemented)."""
    a_tfidf = params[-1]

    # Space to implement BoW
    if not a_tfidf:
        print("Error: BoW hasn't been implemented yet.")

    preprocessed_contents = preprocess_reviews(contents, *params[:-1])
    sparse_vector = preprocessed_contents # implement TF-IDF here

    return sparse_vector

In [47]:
# Separate content and label
contents = [review['content'] for review in all_reviews.values()]
labels = [review['label'] for review in all_reviews.values()]

# Generate 3 sets of features 
all_params = []
all_params.append([True, False, False, True, True]) # 1: stopwords, lemmatization, TFIDF
all_params.append([True, False, True, False, True]) # 2: stopwords, stemming, TFIDF
all_params.append([True, True, False, True, True]) # 4: stopwords, punctuation, lemmatization, TFIDF
# Further optimisations: remove HTML content? (done currently) implement BoW instead of TFIDF?

sets = []

# Preprocess reviews for each feature set
for params in all_params:
    generated_features = generate_features(contents, params)
    # Recombine the features with their corresponding labels
    labeled_features = list(zip(generated_features, labels))
    sets.append(labeled_features)

  soup = BeautifulSoup(content, "html.parser")


In [53]:
# Print a specific review for set 3
print(f"{len(sets[2])} reviews in total")
print(f"{len(sets[2][2][0].split(' '))} features in total")
print(sets[2][2])

4000 reviews in total
28 features in total
('a_solid solid_unremarkable unremarkable_film film_matthau matthau_einstein einstein_wonderful wonderful_my my_favorite favorite_part part_thing thing_would would_make make_go go_way way_see see_wonderful wonderful_scene scene_physicist physicist_play play_badmitton badmitton_i i_love love_sweater sweater_conversation conversation_wait wait_robbins robbins_retrieve retrieve_birdie', 1)
