# Metadata

```yaml
Course:    DS 5001 
Module:    09 Lab
Topic:     Using GloVe
Author:    R.C. Alvarado
Date:      28 March 2023 (revised)
```

**Purpose:** We use some pretrained word vectors from [the developers of GloVe](https://nlp.stanford.edu/projects/glove/).

# Set Up

In [5]:
data_in = '../data/glove'
db_file = f'{data_in}/glove2.db'

In [3]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.metrics.pairwise import cosine_similarity

# Import GloVe data

In [6]:
with sqlite3.connect(db_file) as db:
    glove = pd.read_sql("SELECT * FROM glove50", db, index_col='term_str')

In [7]:
glove.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581
",",0.013441,0.23682,-0.16899,0.40951,0.63812,0.47709,-0.42852,-0.55641,-0.364,-0.23938,...,-0.080262,0.63003,0.32111,-0.46765,0.22786,0.36034,-0.37818,-0.56657,0.044691,0.30392
.,0.15164,0.30177,-0.16763,0.17684,0.31719,0.33973,-0.43478,-0.31086,-0.44999,-0.29486,...,-6.4e-05,0.068987,0.087939,-0.10285,-0.13931,0.22314,-0.080803,-0.35652,0.016413,0.10216
of,0.70853,0.57088,-0.4716,0.18048,0.54449,0.72603,0.18157,-0.52393,0.10381,-0.17566,...,-0.34727,0.28483,0.075693,-0.062178,-0.38988,0.22902,-0.21617,-0.22562,-0.093918,-0.80375
to,0.68047,-0.039263,0.30186,-0.17792,0.42962,0.032246,-0.41376,0.13228,-0.29847,-0.085253,...,-0.094375,0.018324,0.21048,-0.03088,-0.19722,0.082279,-0.09434,-0.073297,-0.064699,-0.26044


# Remove non-words

There are a lot of useless tokens in the vocabulary. These may be good for generating the features, but we don't need them in our queries.

In [8]:
glove = glove.reset_index()
glove = glove[glove.term_str.str.match(r'^[a-z]+$')]
glove = glove.set_index('term_str')

In [9]:
glove.shape

(326891, 50)

# Define some semantic functions

In [10]:
def get_word_vector(term_str):
    """Get a numpy array from the glove matrix and shape for input into cosine function"""
    wv = glove.loc[term_str].values.reshape(-1, 1).T
    return wv

def get_sims(term_str, n=10):
    """Get the top n words for a given word based on cosine similarity"""
    wv = get_word_vector(term_str)
    sims = cosine_similarity(glove.values, wv)
    return pd.DataFrame(sims, index=glove.index, columns=['score'])\
        .sort_values('score', ascending=False).head(n)

def get_nearest_vector(wv):
    """Get the nearest word vector to a given word vector"""
    sims = cosine_similarity(glove.values, wv)
    return pd.DataFrame(sims, index=glove.index, columns=['score'])\
        .sort_values('score', ascending=False).head(2).iloc[1]

def get_analogy(a, b, c):
    """Infer missing analogical term"""
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        C = get_word_vector(c)
        D = np.add(np.subtract(B, A), C)
        X = get_nearest_vector(D)
        return X.name
    except ValueError as e:
        print(e)
        return None

# Test similarity function

In [11]:
get_sims('queen')

Unnamed: 0_level_0,score
term_str,Unnamed: 1_level_1
queen,1.0
princess,0.851517
lady,0.805061
elizabeth,0.787304
king,0.783904
prince,0.782186
coronation,0.769278
consort,0.76261
royal,0.744286
crown,0.738265


In [14]:
get_sims('king')

Unnamed: 0_level_0,score
term_str,Unnamed: 1_level_1
king,1.0
prince,0.823618
queen,0.783904
ii,0.774623
emperor,0.773625
son,0.766719
uncle,0.762715
kingdom,0.754216
throne,0.753991
brother,0.749241


# Test analogy functions

In [12]:
get_analogy('dog','male','cat')

'female'

In [13]:
get_analogy('male','doctor','female')

'nurse'

In [14]:
get_analogy('queen','female','king')

'male'

In [15]:
get_analogy('female','princess','male')

'duchess'

In [16]:
get_analogy('right','male','left')

'male'

In [17]:
get_analogy('right','left','male')

'male'

In [18]:
get_analogy('left','right','black')

'white'

In [19]:
get_analogy('left','right','white')

'black'

In [20]:
get_analogy('sun','moon','male')

'male'

In [21]:
get_analogy('day','sun','night')

'sky'