In [None]:
# Embeds cleaned CSVs using OpenAI's text-embedding-ada-002 model.
# Because the model has a token limit, only the first 9000 characters are embedded if the full-text request raises an error.
# Only cleaned text that is longer than 85 characters and is detected as English by langdetect is embedded.

In [None]:
import ast
import math
import os
import time

import numpy as np
import openai
import pandas as pd
import tiktoken
from dotenv import load_dotenv
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from numpy.linalg import norm

DetectorFactory.seed = 0
load_dotenv()

openai.api_key = os.getenv('API_KEY')

In [None]:
def embed(input):
    """Request an embedding for ``input`` from OpenAI and return it.

    The text is sent to ``openai.Embedding.create`` with the
    ``text-embedding-ada-002`` model, and the ``embedding`` field of the
    first item in the response's ``data`` list is returned.

    Any exception raised by the API call propagates to the caller.
    """
    api_result = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=input,
    )
    first_item = api_result['data'][0]
    return first_item['embedding']

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens ``string`` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding`` to select
            the tokenizer.

    Returns:
        The length of the token sequence produced by the encoding.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

In [None]:
# Embed Concatenated Segments
CLEANED_DIR = 'Cleaned By ID'
EMBEDDED_DIR = 'OpenAI Concat Embedded By ID'
MIN_CHARS = 85          # only text strictly longer than this is embedded
FALLBACK_CHARS = 9000   # prefix length retried when embedding the full text fails


def _is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    ``detect`` raises ``LangDetectException`` for text it cannot extract any
    features from; such text is treated as not English so that one bad row
    does not abort the whole run.
    """
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False


def _embed_with_fallback(text):
    """Embed ``text``; if the request fails, retry once on its first FALLBACK_CHARS characters."""
    try:
        return embed(text)
    except Exception:
        # Presumably the token limit was exceeded. `Exception` (rather than a
        # bare `except`) keeps the best-effort retry without swallowing
        # KeyboardInterrupt / SystemExit.
        return embed(text[:FALLBACK_CHARS])


all_clean = 0
all_end = 0
# Listed once up front instead of once per input file.
already_embedded = set(os.listdir(EMBEDDED_DIR))
# Ignore anything that is not a CSV (e.g. notebook checkpoint folders).
csv_files = sorted(
    (name for name in os.listdir(CLEANED_DIR) if name.endswith('.csv')),
    key=len,
)
for num, f in enumerate(csv_files):
    print(num+1, end = '\t')
    # splitext removes only the extension; rstrip('.csv') strips a *character
    # set* and would also eat trailing 'c', 's', 'v' or '.' from the ID itself.
    print('ID: ', os.path.splitext(f)[0].ljust(10), end = '')
    start = time.time()

    if f in already_embedded:
        print("ALREADY EMBEDDED")
        continue

    df = pd.read_csv(os.path.join(CLEANED_DIR, f))
    df = df.dropna(subset=['Cleaned'])
    # Each 'Cleaned' cell is parsed as a Python literal and its items are
    # joined into a single text.
    df['Cleaned'] = df['Cleaned'].apply(lambda x: '. '.join(ast.literal_eval(x)))
    df = df[df['Cleaned'] != '']
    count_cleaned = len(df)
    all_clean += count_cleaned
    print('\tCleaned: ', str(len(df)).ljust(7), end= '')

    # astype(bool) keeps the mask boolean even when the frame is empty.
    english_mask = df['Cleaned'].apply(_is_english).astype(bool)
    df = df[english_mask]
    print('English: ', str(len(df)).ljust(7), end= '')

    df = df[df['Cleaned'].str.len() > MIN_CHARS]
    print('Over 85: ', str(len(df)).ljust(7), end= '')
    all_end += len(df)
    if count_cleaned:
        print('Ratio:', round(len(df) / count_cleaned, 3), end= '')
    else:
        # Nothing survived cleaning, so there is no ratio to compute.
        print('Ratio:', 0)

    df = df.reset_index()
    df['Embedding'] = None
    for i, row in df.iterrows():
        df.at[i, 'Embedding'] = _embed_with_fallback(row['Cleaned'])
        # Progress only: every remaining row already passed the English
        # filter, so language detection is not repeated here.
        print(f'\t{i}')

    # NOTE(review): the embeddings are never written to disk. The save below is
    # commented out, and it targets 'Embedded By ID' while the skip check above
    # reads 'OpenAI Concat Embedded By ID', so a re-run cannot detect finished
    # files. Confirm the intended output folder before enabling.
    # df.drop(columns=['index', 'Segments']).to_csv(f'Embedded By ID/{f}', index = False)
    # print(time.time() - start, 'seconds')

# all_clean stays 0 when every file was skipped or the folder is empty.
if all_clean:
    print('\n\n ---- Total Ratio:', round(all_end / all_clean, 3), '----')
else:
    print('\n\n ---- Total Ratio: n/a (no files were processed) ----')
