In [97]:
import re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from collections import defaultdict
import spacy
# Load the small English pipeline without the parser and NER components;
# the scoring code in this notebook only reads per-token attributes
# (pos_, lemma_, is_stop, is_alpha).
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [3]:
import nltk
from nltk.corpus import sentiwordnet as swn
# nltk.download('sentiwordnet')

In [77]:
import pickle
# Load the vocabulary used below as a mapping from stemmed term to its
# column index in the term matrix.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project's own pipeline.
with open('data/vocab_key.pkl', 'rb') as f:
    vocab = pickle.load(f)  # the `with` block closes f on exit

In [105]:
# Load the document-term matrix; it is indexed below as
# matrix[(doc_row, vocab_column)].
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project's own pipeline.
with open('data/term_matrix.pkl', 'rb') as f:
    matrix = pickle.load(f)  # the `with` block closes f on exit

In [78]:
# Sanity check that the SentiWordNet corpus is available: look up one synset
# by name and print its positive/negative scores.
print(swn.senti_synset('absolutely.r.01'))

<absolutely.r.01: PosScore=0.5 NegScore=0.0>


In [79]:
# load cleaned reviews
# (later cells read the 'review_text', 'star_rating',
# 'product_category_department' and 'upvotes' columns from this frame)
reviews = pd.read_pickle("data/reviews_processed.pkl")

In [100]:
# NOTE(review): vocab_scores is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
vocab_scores = defaultdict(float)
# Stemmer used to map review tokens onto the keys of `vocab`.
stemmer = nltk.PorterStemmer()
# spaCy coarse-grained POS tags of the tokens considered for sentiment scoring.
pos_interest = ['ADJ','NOUN', 'VERB', 'ADV']

def sentiment_score(doc_id, text):
    """Scale one document's term weights in ``matrix`` by SentiWordNet polarity.

    The text is tokenised with the module-level spaCy pipeline ``nlp``. For
    every token that is alphabetic, not a stop word, tagged with a POS in
    ``pos_interest`` and whose Porter stem is a key of ``vocab``, the first
    SentiWordNet synset for the token's lemma and POS is looked up. Its
    polarity (positive score minus negative score) multiplies the entry
    ``matrix[(doc_id, vocab[stem])]`` in place. Tokens with no synset or a
    polarity of exactly 0 leave the matrix untouched.

    Args:
        doc_id: Row of ``matrix`` that belongs to this document.
        text: Raw review text.

    Returns:
        None. ``matrix`` is modified in place.

    Raises:
        IndexError: Propagated from the matrix if ``doc_id`` is not a valid
            row. Previously this was swallowed by a bare ``except``.
    """
    # spaCy coarse POS tag -> single-letter WordNet POS code.
    wordnet_pos = {'ADJ': 'a', 'ADV': 'r', 'VERB': 'v', 'NOUN': 'n'}

    for t in nlp(text):
        if t.is_stop or not t.is_alpha or t.pos_ not in pos_interest:
            continue

        # Use the string tag `pos_`; `t.pos` is spaCy's integer id and never
        # equals 'NOUN', which previously left nouns with a stale or undefined
        # POS code.
        pos = wordnet_pos.get(t.pos_)
        if pos is None:
            # Tag listed in pos_interest but without a WordNet equivalent.
            continue

        t_stem = stemmer.stem(t.text)
        if t_stem not in vocab:
            continue

        # Take the first (most common) sense; skip words SentiWordNet lacks.
        senti_text = next(iter(swn.senti_synsets(t.lemma_, pos)), None)
        if senti_text is None:
            continue

        agg_score = senti_text.pos_score() - senti_text.neg_score()
        # A neutral score would zero the weight, so only non-zero polarity is
        # applied.
        # NOTE(review): a term occurring k times in one document is scaled k
        # times — confirm this is intended.
        if agg_score != 0:
            matrix[(doc_id, vocab[t_stem])] *= agg_score

In [101]:
# update matrix for all docs
# Rows are addressed by position (enumerate) rather than by the DataFrame
# index label, matching how the matrix rows are read back with
# range(NUM_DOCS) further down; for a default 0..n-1 index the two are identical.
# NOTE: sentiment_score multiplies `matrix` in place, so this cell is not
# idempotent — reload `matrix` before re-running it.
for doc_id, review_text in tqdm(enumerate(reviews['review_text']),
                                desc='Updating Matrix Score', total=len(reviews)):
    sentiment_score(doc_id, review_text)

HBox(children=(IntProgress(value=1, bar_style='info', description='Updating Matrix Score', max=1, style=Progre…

  self._set_intXint(row, col, x.flat[0])





In [103]:
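# Persists the sentiment-weighted matrix that the next cell loads from
# 'data/senti_matrix.pkl'. Presumably left disabled to avoid overwriting the
# saved file; re-enable it when the file has to be (re)generated.
# with open('data/senti_matrix.pkl', 'wb') as f:
#     pickle.dump(matrix, f)
#     f.close()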

In [104]:
# Load the previously saved sentiment-weighted matrix.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project's own pipeline.
with open('data/senti_matrix.pkl', 'rb') as f:
    senti_matrix = pickle.load(f)  # the `with` block closes f on exit

In [107]:
# Spot-check the update: print document 0's weight for the stem 'absolut'
# from the term matrix and from the sentiment-weighted matrix.
idx = vocab['absolut']
for label, mat in (("Before: ", matrix), ("After: ", senti_matrix)):
    print(label, mat[(0, idx)])

Before:  0.1798195970823628
After:  0.0899097985411814


In [108]:
# Target: the star rating of each review.
y = reviews['star_rating']

# Predictors taken straight from the reviews frame; the product department
# (instead of the product class) is exposed under the column name 'class'.
X = (
    reviews[['product_category_department', 'upvotes']]
    .rename(columns={'product_category_department': 'class'})
)

In [109]:
# Empty feature frame with one column per vocabulary term (filled in below).
lemmas = pd.DataFrame(columns=list(vocab))

In [110]:
NUM_DOCS = len(X)

# Copy each vocabulary term's column out of the sentiment-weighted matrix,
# one value per document, into the matching column of `lemmas`.
for term in tqdm(lemmas.columns.values, desc='Adding data to columns'):
    col = vocab[term]
    lemmas[term] = [senti_matrix[(row, col)] for row in range(NUM_DOCS)]

HBox(children=(IntProgress(value=0, description='Adding data to columns', max=2000, style=ProgressStyle(descri…




In [111]:
# Give both frames the same 0..n-1 row index so the column-wise concat pairs
# rows by position instead of by their original index labels.
X, lemmas = (frame.reset_index(drop=True) for frame in (X, lemmas))
X_feats = pd.concat([X, lemmas], axis='columns')

In [112]:
# One-hot encode the product class and re-index the result from 0 so it lines
# up with the other feature frames.
prod_class = pd.get_dummies(X['class']).reset_index(drop=True)

In [113]:
# Assemble the final feature table: the remaining columns of X (without the
# raw 'class' column), the per-term sentiment columns, and the class dummies.
# Built from X / lemmas / prod_class instead of dropping 'class' from X_feats
# in place: the in-place drop raised KeyError when this cell was re-run, and a
# label-based drop on the combined frame would also remove a vocabulary term
# column that happens to be named 'class'.
X_feats = pd.concat([X.drop('class', axis=1), lemmas, prod_class], axis=1)

In [114]:
# Write the assembled feature table to disk, without the row index.
X_feats.to_csv("data/senti_features.csv", index=False)