# Metadata

```yaml
Course:    DS 5001
Module:    09 Lab
Topic:     Skip Gram Representations
Author:    R.C. Alvarado
Date:      28 March 2023 (revised)
```

**Purpose:** A demonstration of how to create CBOW vector spaces.

# Set Up

In [1]:
data_in = '../data/novels'
data_prefix = 'novels'

In [2]:
OHCO = ['book', 'chapter', 'para_num', 'sent_num', 'token_num']
BAG = OHCO[2:5] # Paragraphs

In [3]:
# Word Embedding window
w = 2

In [4]:
import pandas as pd
import numpy as np
import scipy as sp

# Import Data

In [5]:
TOKENS = pd.read_csv(f'{data_in}/{data_prefix}-TOKENS.csv')

# Create DOCS as lists of tokens

In [6]:
DOCS = TOKENS.groupby(BAG).apply(lambda x: x.term_str.tolist()).reset_index(drop=True)

In [7]:
DOCS.head()

0    [fourth, in, i, the, i, the, keeping, while, h...
1    [narrative, the, spoke, question, am, first, m...
2    [first, of, of, truly, thing, private, was, on...
3    [part, my, how, sorry, i, sentiments, in, mome...
4    [of, lady, i, to, did, to, this, i, the, of, t...
dtype: object

# Create Windows

In [8]:
def get_windows(x):

    global WINDOWS
    
    bag_id  = x.name
    row = x[0]

    # Move through each word in the bag vector
    for i in range(len(row)):
    
        # Slide the window
        anchor = row[i]
        for j in range(-w, w+1):
            a = i + j
            if j != 0 and a >= 0 and a < len(row):
                WINDOWS.append((bag_id, i, anchor, j, row[a])) 

In [9]:
WINDOWS = []
DOCS.to_frame(0).apply(get_windows, 1)
W = pd.DataFrame(WINDOWS, columns=['bag_id', 'window_id', 'anchor', 'dist', 'probe']).set_index(['bag_id', 'window_id', 'anchor', 'dist'])

In [10]:
W.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,probe
bag_id,window_id,anchor,dist,Unnamed: 4_level_1
0,0,fourth,1,in
0,0,fourth,2,i
0,1,in,-1,fourth
0,1,in,1,i
0,1,in,2,the


In [11]:
N_terms = len(W.probe.unique())

# As CBOW / Skipgram

For predictive modeling, i.e. with a shallow NN.

In [12]:
CBOW = W.probe.unstack().fillna('<s>').reset_index().set_index(['bag_id','window_id'])

In [13]:
CBOW.head(20)

Unnamed: 0_level_0,dist,anchor,-2,-1,1,2
bag_id,window_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,fourth,<s>,<s>,in,i
0,1,in,<s>,fourth,i,the
0,2,i,fourth,in,the,i
0,3,the,in,i,i,the
0,4,i,i,the,the,keeping
0,5,the,the,i,keeping,while
0,6,keeping,i,the,while,here
0,7,while,the,keeping,here,june
0,8,here,keeping,while,june,one
0,9,june,while,here,one,when


# As Matrix

For matrix decomposition.

In [14]:
M = W.groupby(['anchor','probe']).probe.count().unstack(fill_value=0)

In [15]:
M

probe,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,...,zoöphagous,zoöphagy,zufalle,zum,zusammen,à,æt,ætat,ça,émeutes
anchor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,1916,0,0,4,4,1,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
aback,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abaft,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandon,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandoned,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
à,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
æt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ætat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ça,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
