## Imports and Function Declarations

In [59]:
# python version 3.10.6

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True) 
nltk.download('punkt', quiet=True) 

from sklearn.model_selection import train_test_split
from textacy.preprocessing import remove, normalize, replace

import warnings 
import contractions

import gensim.downloader as api
from gensim.models import Word2Vec

import pkg_resources
from symspellpy import SymSpell, Verbosity

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron, LogisticRegression

import torch 
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler

warnings.filterwarnings('ignore')

In [2]:
# GLOBALS 

F_PATH = 'amazon_reviews_us_Jewelry_v1_00.tsv'

STAR_H = 'star_rating'
REVIEW_H = 'review_body'

COLS=[STAR_H, REVIEW_H]

VAL_STARS = {'1', '2', '3', '4', '5'}

WV = api.load('word2vec-google-news-300')

SPELLER = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

SPELLER.load_dictionary(dictionary_path, term_index=0, count_index=1)
SPELLER.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

WNL = WordNetLemmatizer()

In [5]:
# def read_data(f_path=F_PATH):
#    df = pd.read_csv(f_path, sep='\t', usecols=COLS, low_memory=False)
#    df.dropna(inplace=True)
# add in convert dtype to int for reviews?
#    return df

# def get_sample(df, s_size=20000):

#    grouped = df.groupby(STAR_H)
#    rat_dfs = [grouped.get_group(rating).sample(n=s_size) for rating in VAL_STARS]
#    return pd.concat(rat_dfs) 

def gen_clean(text):
    """
    gen text cleanup 
    incl removal: extended ws, html tags, urls
    """
    text = BeautifulSoup(text, "html.parser").text #rm html tags 
    text = replace.urls(text, '')
    text = contractions.fix(text)
    text = remove.punctuation(text)
    text = replace.numbers(text, '')
    text = normalize.whitespace(text).lower()
    text = replace.emojis(text, '')
    toks = rm_stops(text)
    
    return toks if toks else np.nan

def rm_stops(text): 
    """
    remove stop words from text 
    """
    stops = set(stopwords.words("english"))
    sans_stops = [tok for tok in word_tokenize(text) if tok not in stops]
    return sans_stops

def lemmatize(tok_list): 

    lemmas = [WNL.lemmatize(w) for w in tok_list]
    return " ".join(lemmas).strip()

def get_avg(tok_list):
    w2v = []
    skipped = 0
    new_toks = None
    
    for w in tok_list:
        try: 
            w2v.append(WV[w])
        except KeyError: 
            skipped = 1
            break
    
    if skipped: 
        w2v = []
        new_toks = spell_check(" ".join(tok_list))
        skipped = 0
        for w in new_toks: 
            if w not in set(stopwords.words("english")):
                try: w2v.append(WV[w])
                except KeyError: continue
                
    w2v_arr = np.array(w2v) if w2v else np.zeros((1,300))
    
    mean = np.mean(w2v_arr, axis=0)

    return mean, new_toks

def make_avg_vecs(df): 
    vecs = []
    for _, row in df.iterrows(): 
        avg, new_toks = get_avg(row['cl_toks'])
        
        vecs.append(avg)

        if new_toks is not None: 
            row['cl_toks'] = new_toks
            
    df['avg_vecs'] = vecs

    return df
# , torch.tensor(vecs)

def spell_check(text): 
    sugs = SPELLER.lookup_compound(text, max_edit_distance=2)
    term = [sug.term for sug in sugs]
    n_str = "  ".join(term).split()
    return [t for t in n_str if t in SPELLER.words.keys()]



## Question 1: Dataset Generation

In [4]:
# df = read_data()
# sampled = get_sample(df)
# sampled.to_pickle('samp.pkl')
sampled = pd.read_pickle('samp.pkl')
sampled['cl_toks'] = sampled[REVIEW_H].apply(gen_clean)
sampled.dropna(inplace=True)
sampled['cl_lemmas'] = sampled['cl_toks'].apply(lemmatize)


In [15]:
# sampled.to_pickle('samp_w2v_tdif.pkl')

## Question 2: Word Embedding

### **2a examples:**

In [5]:
wv_bracelet = WV.most_similar(negative=["wrist"], positive=['bracelet', 'neck'], topn=1)
wv_girl = WV.most_similar( positive=['girl', 'age'], topn=1)
wv_family = WV.most_similar(negative=['child'], positive=['family'], topn=1)

print(f"Bracelet - Wrist + Neck = {wv_bracelet}")
print(f"Girl + age = {wv_girl}")
print(f"Family - Child = {wv_family}")

Bracelet - Wrist + Neck = [('necklace', 0.5466936826705933)]
Girl + age = [('boy', 0.7243723273277283)]
Family - Child = [('friends', 0.3765709400177002)]


## Question: 2b

embedding size = 300 
window size = 11. 
minimum word count = 10

- Check the semantic similarities for the same two examples in part (a)
- What do you conclude from comparing vectors generated by yourself and the pretrained model? 
- Which of the Word2Vec models seems to encode semantic similarities between words better?

- For the rest of this assignment, use the pretrained “word2vec-google-news-300” Word2Vec features

In [6]:
model = Word2Vec(sentences=sampled[REVIEW_H], vector_size=300, window=11, min_count=10, workers=4)

In [7]:
m_bracelet = model.wv.most_similar(negative=["wrist"], positive=['bracelet', 'neck'], topn=1)
m_girl = model.wv.most_similar(positive=['girl', 'age'], topn=1)
m_family = model.wv.most_similar(negative=['child'], positive=['family'], topn=1)

print(f"Bracelet - Wrist + Neck = {m_bracelet}")
print(f"Girl + Age = {m_girl}")
print(f"Family - Child = {m_family}")

Bracelet - Wrist + Neck = [('necklace', 0.7294794917106628)]
Girl + Age = [('teen', 0.9324922561645508)]
Family - Child = [('co', 0.622450053691864)]


## Question 3: Simple Models
- Preceptron 
- SVM 
- What do you conclude from comparing performances for the models trained using the two different feature types (TF-IDF and your trained Word2Vec features)?

In [7]:
sampled = make_avg_vecs(sampled)

In [16]:
v = TfidfVectorizer(use_idf=False)
feat = v.fit_transform(sampled['cl_lemmas'])
T_train, T_test, W_train, W_test, train_labels, test_labels = train_test_split(feat, sampled['avg_vecs'].tolist(), sampled[STAR_H], test_size=0.2, random_state=42)

In [23]:
p = Perceptron(random_state=42, class_weight='balanced', max_iter=100, n_iter_no_change=3)

p.fit(T_train, train_labels)
t_pred = p.predict(T_test)
print(accuracy_score(test_labels, t_pred))

p.fit(W_train, train_labels)
w2v_pred = p.predict(W_test)
print(accuracy_score(test_labels, w2v_pred))

0.4051538653990493
0.4042531898924193


In [24]:
svm = LinearSVC(penalty='l1', dual=False, random_state=42, max_iter=300)

svm.fit(T_train, train_labels)
t_svm_pred = svm.predict(T_test)
print(accuracy_score(test_labels, t_svm_pred))


svm.fit(W_train, train_labels)
w2v_svm_pred = svm.predict(W_test)
print(accuracy_score(test_labels, w2v_svm_pred))

0.5024768576432325
0.47240430322742055


## Tuning

In [26]:
# from sklearn.model_selection import GridSearchCV
# params = {
#     'penalty':['l1','l2'], 
#     'dual':[True, False], 
#     'C':[0.05, 0.1, 1], 
#     'class_weight':['balanced', None], 
#     'fit_intercept': [True, False]
#     # 'max_iter': [40, 50, 60], 
# }
# svc = LinearSVC(random_state=42)
# # p = Perceptron(random_state=42)
# clf = GridSearchCV(svc, params, scoring='accuracy', n_jobs=-1, verbose=1)
# clf.fit(X_train, train_labels)

# print("Best score: %0.3f" % clf.best_score_)
# print("Best parameters set:")
# best_parameters = clf.best_estimator_.get_params()
# for param_name in sorted(params.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 48 candidates, totalling 240 fits




Best score: 0.468
Best parameters set:
	C: 0.05
	class_weight: None
	dual: True
	fit_intercept: True
	penalty: 'l2'


## Question 4: Feedforward Neural Networks
- multi-layer perceptron 
- 2 hidden layers, 50 and 10 nodes respectively 
- cross entropy loss and choice of hyperparams 
### Part A: 
- generate input features using avg Word2Vec vectors
- train NN 
- report accuracy values on the testing split for the nn

In [63]:
X_train, X_test, train_labels, test_labels = train_test_split(sampled['avg_vecs'].tolist(), sampled[STAR_H].tolist(), test_size=0.2, random_state=42)

In [95]:
class TrainFNN(Dataset):
    
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):

        review = torch.from_numpy(self.data[index])
        label = self.labels[index]
            
        return review, int(label)
    
class TestFNN(Dataset):
    
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
            
        return torch.from_numpy(self.data[index])

In [96]:
train_data = TrainFNN(X_train, train_labels)
test_data = TestFNN(X_test)

num_workers = 0
batch_size = 20
valid_size = 0.2

num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers)
valid_loader = DataLoader(train_data, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers)
test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=num_workers)

In [101]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(300, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 5)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # flatten image input
        # x = x.T
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.dropout(x)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x

# initialize the NN
model = Net()
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [102]:
# number of epochs to train the model
n_epochs = 50

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model.eval() # prep model for evaluation
    for data, target in valid_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

IndexError: Target 5 is out of bounds.