## Imports and Function Declarations

In [1]:
# python version 3.10.6

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True) 
nltk.download('punkt', quiet=True) 

from sklearn.model_selection import train_test_split
from textacy.preprocessing import remove, normalize, replace

import warnings 
import contractions

from gensim.models import Word2Vec

warnings.filterwarnings('ignore')

In [2]:
# GLOBALS 

F_PATH = 'amazon_reviews_us_Jewelry_v1_00.tsv'

STAR_H = 'star_rating'
REVIEW_H = 'review_body'

COLS=[STAR_H, REVIEW_H]

VAL_STARS = {'1', '2', '3', '4', '5'}

WNL = WordNetLemmatizer()

In [3]:
# def read_data(f_path=F_PATH):
#    df = pd.read_csv(f_path, sep='\t', usecols=COLS, low_memory=False)
#    df.dropna(inplace=True)
#    return df

# def get_sample(df, s_size=20000):

#    grouped = df.groupby(STAR_H)
#    rat_dfs = [grouped.get_group(rating).sample(n=s_size) for rating in VAL_STARS]
#    return pd.concat(rat_dfs) 

def gen_clean(text):
    """
    gen text cleanup 
    incl removal: extended ws, html tags, urls
    """
    text = BeautifulSoup(text, "html.parser").text #rm html tags 
    text = replace.urls(text, '')
    text = contractions.fix(text)
    text = remove.punctuation(text)
    text = normalize.whitespace(text)
   
    return text.lower()
   
# def rm_stops(text): 
#    """
#    remove stop words from text 
#    """
#    stops = set(stopwords.words("english"))
#    sans_stops = [tok for tok in word_tokenize(text) if tok not in stops]
#    return " ".join(sans_stops).strip()

# def lemmatize(text): 

#    lemmas = [WNL.lemmatize(w) for w in word_tokenize(text)]
#    return " ".join(lemmas).strip()

def split_sentences(text):
    sentences = text.split('.')
    cl_sents = [gen_clean(sent) for sent in sentences if sent]
    return cl_sentences

In [4]:
# df = read_data()
# sampled = get_sample(df)
# sampled.applymap(gen_clean, na_action='ignore')
# sampled.to_pickle('samp.pkl')
sampled = pd.read_pickle('samp.pkl')


In [5]:
X_train, X_test, train_labels, test_labels = train_test_split(sampled[REVIEW_H], sampled[STAR_H], test_size=0.2, random_state=42)

In [6]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')


### Question: 2a examples

In [7]:
wv_king = wv.most_similar(negative=["man"], positive=['king', 'woman'], topn=5)
wv_girl = wv.most_similar( positive=['girl', 'age'], topn=5)
wv_family = wv.most_similar(negative=['child'], positive=['family'], topn=5)
print(f"King - Man + Woman = {wv_king}")
print(f"Girl + age = {wv_girl}")
print(f"Family - Child = {wv_family}")

King - Man + Woman = [('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]
Girl + age = [('boy', 0.7243723273277283), ('teenager', 0.6263099312782288), ('woman', 0.6046332716941833), ('teenage_girl', 0.6039137244224548), ('daughter', 0.5845543146133423)]
Family - Child = [('friends', 0.3765709400177002), ('clan', 0.3685661852359772), ('close_knit', 0.36716675758361816), ('tight_knit', 0.36474621295928955), ('Altmans', 0.3335089385509491)]


### Question: 2b

embedding size = 300 
window size = 11. 
minimum word count = 10

- Check the semantic similarities for the same two examples in part (a)
- What do you conclude from comparing vectors generated by yourself and the pretrained model? 
- Which of the Word2Vec models seems to encode semantic similarities between words better?

- For the rest of this assignment, use the pretrained “word2vec-google-news-300” Word2Vec features

In [8]:
model = Word2Vec(sentences=sampled[REVIEW_H], vector_size=300, window=11, min_count=10)
# model.save("word2vec.model")

In [11]:
m_king = model.wv.most_similar(negative=["man"], positive=['king', 'woman'], topn=5)
m_girl = model.wv.most_similar( positive=['girl', 'age'], topn=5)
m_family = model.wv.most_similar(negative=['child'], positive=['family'], topn=5)

KeyError: "Key 'king' not present in vocabulary"

### Question 3: Simple Models

In [None]:
v = TfidfVectorizer(use_idf=False)
feat = v.fit_transform(sampled[REVIEW_H])
X_train, X_test, train_labels, test_labels = train_test_split(feat, sampled[STAR_H], test_size=0.2, random_state=42)