# Parsing Example

In [1]:
import torch
print(torch.__version__)

1.13.0


In [2]:
from torch import nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Analogy of parsing

Words > Alphabet

Parsing > How words contructed by consonant and vowel

## Data

Word dataset is borrowed from Kaggle

https://www.kaggle.com/datasets/rtatman/english-word-frequency

In [3]:
df = pd.read_csv('unigram_freq.csv')

In [4]:
df.head()

Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698


We don't need frequency. So cut off count values from dataset

In [5]:
df = df.drop(labels='count',axis=1)

In [6]:
df

Unnamed: 0,word
0,the
1,of
2,and
3,to
4,a
...,...
333328,gooek
333329,gooddg
333330,gooblle
333331,gollgo


# Word embedding

Since we are using analogy between sentence and word, we should set some premise to argue.

We have to show dependency in sentence along words. so we have to assume that there is similiar dependency between alphabets and word.

In [35]:
#Shallow embedding

x = '3'

def alp_encoder(x,dim):
    embedding = np.random.rand(dim).tolist()
    return embedding
#if you want to use consonant/vowel feature, can add if sentence to add feature
b = alp_encoder(x,3)
print(b)

[0.9263152595745104, 0.2520694683983069, 0.5964829206778587]


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333333 entries, 0 to 333332
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   word    333331 non-null  object
dtypes: object(1)
memory usage: 2.5+ MB


In [19]:
a = 'abc'
print(a)
for i in a:
    print(i)

abc
a
b
c


In [20]:
alp_set = []
for i in df.word:
    for j in i:
        alp_set.append(j)
        alp_set = list(set(alp_set))
    if(len(alp_set)==26):
        break

In [21]:
alp_set.sort()

In [22]:
print(alp_set)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [36]:
alp_emb=[]
for i in alp_set:
    alp_emb.append(alp_encoder(i,5))

In [38]:
print(alp_emb[0]) # a's embedding

[0.40183791205796926, 0.5069520578912213, 0.7615309688860742, 0.34113169659305553, 0.4319906368303611]


In [39]:
alpdf = pd.DataFrame(index=alp_set, data = alp_emb)

In [40]:
print(alpdf)

          0         1         2         3         4
a  0.401838  0.506952  0.761531  0.341132  0.431991
b  0.622734  0.240157  0.277063  0.282669  0.063691
c  0.196480  0.317566  0.836407  0.198343  0.267990
d  0.851877  0.537936  0.608156  0.698547  0.145231
e  0.685986  0.420061  0.389564  0.315431  0.650672
f  0.927229  0.500740  0.937563  0.018958  0.865393
g  0.075539  0.538502  0.048975  0.558895  0.939938
h  0.077598  0.210521  0.291303  0.563479  0.044151
i  0.238873  0.956410  0.240673  0.488475  0.726563
j  0.995579  0.299535  0.288383  0.543469  0.802494
k  0.495889  0.727617  0.771185  0.345596  0.510607
l  0.991362  0.914903  0.121190  0.062097  0.963017
m  0.227881  0.773187  0.380889  0.617775  0.958126
n  0.609435  0.452570  0.168773  0.077320  0.442800
o  0.312267  0.914924  0.182331  0.715338  0.751379
p  0.402498  0.974586  0.333488  0.573895  0.009196
q  0.518850  0.758852  0.637033  0.697441  0.115998
r  0.441975  0.038056  0.576257  0.493024  0.965652
s  0.252577 