In [1]:
# Standard library imports
from collections import defaultdict
from pathlib import Path
import string
from typing import Any, Dict, DefaultDict, List, Tuple

# Third-party imports
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.interfaces import TransformedCorpus
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd

# Download the NLTK resources this notebook requests, in the original order.
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource)

# Local imports
from Assignment4 import *

In [2]:
# Directory layout: raw inputs and processed outputs share one data root.
DATA_DIR = Path("/work/data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# SRC is the CSV read into `data`; DST is an output path that the cells shown
# here do not write to.
SRC = RAW_DIR.joinpath("Musical_instruments_reviews.csv")
DST = PROCESSED_DIR.joinpath("stemming_output.csv")

# Passed as `limit` to find_top_terms when ranking terms.
LIMIT = 15

In [3]:
# Read the reviews CSV at SRC into a DataFrame; its "summary" column feeds the
# tokenization step that follows.
data = pd.read_csv(SRC)

In [4]:
# Shared preprocessing: tokenize the review summaries, then drop stopwords.
# Both branches below start from the same `nostops` tokens.
tokenized_reviews = tokenize(data["summary"])
nostops = remove_stopwords(tokenized_reviews)

# Stemmed branch: token dictionary -> bag-of-words corpus -> TF-IDF -> top terms.
stemmed = stem(nostops)
stemmed_dict = make_token_dict(stemmed)
stemmed_bow_corpus = make_bow_corpus(stemmed, stemmed_dict)
stemmed_model, stemmed_corpus_tfidf = make_corpus_tfidf(stemmed_bow_corpus)
stemmed_top_terms_by_doc = find_top_terms_by_doc(stemmed_corpus_tfidf, stemmed_dict)
stemmed_top_terms = find_top_terms(stemmed_top_terms_by_doc, limit=LIMIT)

# Lemmatized branch: same steps, built on its own dictionary.
lemmatized = lemmatize(nostops)
lemmatized_dict = make_token_dict(lemmatized)
lemmatized_bow_corpus = make_bow_corpus(lemmatized, lemmatized_dict)
lemmatized_model, lemmatized_corpus_tfidf = make_corpus_tfidf(lemmatized_bow_corpus)
# The TF-IDF corpus was built from lemmatized_bow_corpus, whose token ids come
# from lemmatized_dict, so the terms must be looked up in lemmatized_dict.
# Using stemmed_dict here would pair lemmatized ids with the wrong vocabulary.
lemmatized_top_terms_by_doc = find_top_terms_by_doc(lemmatized_corpus_tfidf, lemmatized_dict)
lemmatized_top_terms = find_top_terms(lemmatized_top_terms_by_doc, limit=LIMIT)



## Stemmed top term scores

In [5]:
# Score terms from the stopword-filtered tokens and display the top LIMIT.
# lemmatize_=False presumably selects the stemming path (per the section
# heading) -- confirm against score_terms in Assignment4.
# LIMIT replaces the literal 15 so this cell stays in sync with the config cell.
score_terms(nostops, lemmatize_=False, limit=LIMIT)

word
avoid                  1.0
les                    1.0
oversel                1.0
digiverbdigitech       1.0
p                      1.0
par                    1.0
limp                   1.0
perfectfor             1.0
flashback              1.0
useful                 1.0
usefulbut              1.0
agre                   1.0
perrrrrfect            1.0
toughest               1.0
flexablelightweight    1.0
Name: score, dtype: float64

## Lemmatized top term scores

In [6]:
# Score terms from the stopword-filtered tokens and display the top LIMIT.
# lemmatize_=True presumably selects the lemmatization path (per the section
# heading) -- confirm against score_terms in Assignment4.
# LIMIT replaces the literal 15 so this cell stays in sync with the config cell.
score_terms(nostops, lemmatize_=True, limit=LIMIT)

word
mojo                   1.0
par                    1.0
greaaaaaat             1.0
al                     1.0
grail                  1.0
unobtrusive            1.0
allright               1.0
goofed                 1.0
aloha                  1.0
thick                  1.0
giggity                1.0
amaizing               1.0
underwhelmed           1.0
flexablelightweight    1.0
unbelieveable          1.0
Name: score, dtype: float64