# Create Embeddings

## Load data

In [6]:
import pathlib
import pandas as pd

def load(path):
    with open(path, 'r') as fp:
        df = fp.read()
        df = df.split('\n')
        df = [line.strip().replace('"', '') for line in df if len(line.strip()) > 0]
        return pd.DataFrame({'title': df, 'y': path.stem})

df = pathlib.Path('./data').glob('*.txt')
df = [load(p) for p in df]
df = pd.concat(df).reset_index(drop=True)
df.shape

(90, 2)

In [82]:
df.head()

Unnamed: 0,title,y
0,Java: The Complete Reference,java
1,Core Java An Integrated Approach (Black Book),java
2,Head First Java,java
3,Effective Java,java
4,Thinking in Java,java


In [7]:
df.groupby(['y']).size()

y
computer_science    10
csharp              10
finance             10
java                10
javascript          10
machine_learning    10
python              10
real_estate         10
web                 10
dtype: int64

## Vectorize

In [8]:
%%time

import itertools
from openai import OpenAI

def embed_func(docs, model='text-embedding-ada-002'):
    docs = [d.replace('\n', ' ') for d in docs]
    res = client.embeddings.create(input=docs, model=model)
    return [d.embedding for d in res.data]

def split_list(input_list, chunk_size):
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]

client = OpenAI()

embeddings = split_list(list(df['title']), 500)
embeddings = map(embed_func, embeddings)
embeddings = itertools.chain(*embeddings)
embeddings = list(embeddings)
len(embeddings)

CPU times: user 208 ms, sys: 56.2 ms, total: 264 ms
Wall time: 681 ms


90

In [10]:
Xy = pd.DataFrame({'vector': embeddings, 'y': df['y']})
Xy.shape

(90, 2)

In [11]:
Xy.head()

Unnamed: 0,vector,y
0,"[0.0022508783731609583, -0.004211004823446274,...",java
1,"[0.008307374082505703, -0.0205343309789896, 0....",java
2,"[0.010946471244096756, 0.010107582435011864, -...",java
3,"[-0.010318228043615818, 0.007110248785465956, ...",java
4,"[-0.001255613286048174, 0.002567031653597951, ...",java


## Compute similarities

In [50]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_avg_similarities(i, j, y0, y1):
    a = np.array(Xy[Xy['y']==y0]['vector'].tolist())
    b = np.array(Xy[Xy['y']==y1]['vector'].tolist())

    c = np.triu(cosine_similarity(a, b), k=1)
    c = np.array([c[i,j] for j in range(c.shape[1]) for i in range(c.shape[0]) if j > i])
    return {
        'i': i,
        'j': j,
        'y0': y0,
        'y1': y1,
        'sim': np.mean(c)
    }

In [62]:
import itertools

clazz = df['y'].unique()
sim_df = [(i, j, y0, y1) for j, y1 in enumerate(clazz) for i, y0 in enumerate(clazz) if j >= i]
sim_df = [get_avg_similarities(i, j, y0, y1) for i, j, y0, y1 in sim_df]
sim_df = pd.DataFrame(sim_df)

In [70]:
sim_mat = np.zeros((len(clazz), len(clazz)))

for i, _ in enumerate(clazz):
    for j, _ in enumerate(clazz):
        if j >= i:
            s = sim_df[(sim_df['i']==i) & (sim_df['j']==j)].iloc[0]['sim']
            sim_mat[i, j] = sim_mat[j, i] = s

sim_mat = pd.DataFrame(sim_mat, columns=clazz, index=clazz)
sim_mat

Unnamed: 0,java,real_estate,javascript,python,finance,machine_learning,csharp,computer_science,web
java,0.856407,0.769469,0.824093,0.805603,0.767229,0.789297,0.810542,0.812209,0.783262
real_estate,0.769469,0.875555,0.767489,0.751921,0.802737,0.750062,0.756452,0.761262,0.750552
javascript,0.824093,0.767489,0.877622,0.806343,0.760471,0.785221,0.801511,0.799254,0.825318
python,0.805603,0.751921,0.806343,0.872754,0.748293,0.829757,0.776803,0.780263,0.772255
finance,0.767229,0.802737,0.760471,0.748293,0.817658,0.757858,0.745366,0.773515,0.748979
machine_learning,0.789297,0.750062,0.785221,0.829757,0.757858,0.868658,0.768335,0.803196,0.764194
csharp,0.810542,0.756452,0.801511,0.776803,0.745366,0.768335,0.867506,0.794327,0.76979
computer_science,0.812209,0.761262,0.799254,0.780263,0.773515,0.803196,0.794327,0.823804,0.786443
web,0.783262,0.750552,0.825318,0.772255,0.748979,0.764194,0.76979,0.786443,0.822367


In [71]:
np.argmax(sim_mat, axis=1)

array([0, 1, 2, 3, 4, 5, 6, 7, 2])

In [83]:
columns = [f'x{i}' for i, _ in enumerate(Xy['vector'].iloc[0])]
_Xy = pd.DataFrame(Xy['vector'].values.tolist(), columns=columns).assign(title=df['title'], y=Xy['y'])
_Xy.shape

(90, 1538)

In [84]:
_Xy.to_csv('./stash/Xy.csv', index=False)