References:
- https://pythonhosted.org/PyDrive/quickstart.html

In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive


# gauth = GoogleAuth()
# gauth.CommandLineAuth()

# drive = GoogleDrive(gauth)
gauth = GoogleAuth()
# Try to load saved client credentials
gauth.LoadCredentialsFile("credentials.json")
if gauth.credentials is None:
    # Authenticate if they're not there
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    # Refresh them if expired
    gauth.Refresh()
else:
    # Initialize the saved creds
    gauth.Authorize()
# Save the current credentials to a file
gauth.SaveCredentialsFile("credentials.json")

drive = GoogleDrive(gauth)


In [2]:
# Read file from GDrive in the folder "slides" (with id folder_id)
import os
from dotenv import load_dotenv
load_dotenv()


folder_id = os.environ["FOLDER_ID"]

file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList() 
new_files = []
for file in file_list:
    if not os.path.isfile(f"{file['title']}"): 
        # print("title: %s, id: %s" % (file["title"],file["id"]))
        file.GetContentFile(file["title"])
        new_files.append(file)

# new_files


In [7]:
import PyPDF2


transcriptions = [] 
embeddings = {}

for file in new_files:
    pdfReader = PyPDF2.PdfReader(file["title"])

    count = len(pdfReader.pages)
    output = ""
    embeddings[file["title"]] = {"text":[]}
    for i in range(count):
        pageObj = pdfReader.pages[i]
       
        extr = pageObj.extract_text()
        embeddings[file["title"]]["text"].append(extr)
        output += "\n" + extr
        
    transcriptions.append(output)
print(embeddings)
    

{'02_serverless_ml.pdf': {'text': ['Serverless Machine Learning\nJim Dowling\njdowling@kth.se\n2022-11-04', 'Enterprise AI Value Chain\n1 / 54', 'Modern Enterprise Data and ML Infrastructure\n2 / 54', 'Monolithic ML Pipeline\n3 / 54', 'Problems with Monolithic ML Pipelines\n▶They are often not modular - their components are not modular and cannot be\nindependently scaled or deployed on different hardware (e.g., CPUs for feature engi-\nneering, GPUs for model training).\n▶They are difficult to test - production software needs automated tests to ensure\nfeatures and models are of high quality.\n▶They tightly couple the execution of feature engineering, model training, and infer-\nence steps - running them in the same pipeline program at the same time.\n▶They do not promote reuse of features/models/code. The code for computing fea-\ntures (feature logic) cannot be easily disentangled from its pipeline jungle.\n4 / 54', 'Modularity enables more Robust and Scalable Systems\nModular water pi

In [30]:
print(transcriptions)


['\nServerless Machine Learning\nJim Dowling\njdowling@kth.se\n2022-11-04\nEnterprise AI Value Chain\n1 / 54\nModern Enterprise Data and ML Infrastructure\n2 / 54\nMonolithic ML Pipeline\n3 / 54\nProblems with Monolithic ML Pipelines\n▶They are often not modular - their components are not modular and cannot be\nindependently scaled or deployed on different hardware (e.g., CPUs for feature engi-\nneering, GPUs for model training).\n▶They are difficult to test - production software needs automated tests to ensure\nfeatures and models are of high quality.\n▶They tightly couple the execution of feature engineering, model training, and infer-\nence steps - running them in the same pipeline program at the same time.\n▶They do not promote reuse of features/models/code. The code for computing fea-\ntures (feature logic) cannot be easily disentangled from its pipeline jungle.\n4 / 54\nModularity enables more Robust and Scalable Systems\nModular water pipes in a Google Datacenter. Instead of one

In [23]:
import openai
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
from math import ceil


api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key=api_key)
responses = []

for t in transcriptions:
    context = t
    i = 0
    chunks = []

    for i in range(ceil(len(t)/4097)):
        chunks.append(t[i*4097:i*4097+4097])

    for c in chunks:
        context = c
        question = "The text above is the result of the transcription of slides in the PDF file format. Remove chapter names and slides numbers and rephrase the sentences. Once you do that generate 2 to 3 meaningful questions on the text and the respective answers. Plese reply in the JSON format {'questions':<questions generated>,'answers':<answers generated>}. DO NOT write anything else than the requested JSON and remember to write the full elaborated content and not just one part."
        #question = "The text above is the result of the transcription of slides in the PDF file format. Remove chapter names and slides numbers and rephrase the sentences. Once you do that generate 3 meaningful questions based on the new text and the respective answers. As for the reply, follow the following template FOR EACH pair of question and the respective answer: '[INST] <question> [/INST] <answer>'  and so on, let's call this template a 'block'.  NEVER use newlines other than separating blocks and NEVER write anything that is not formatted as the proposed template. DO NOT write anything else than the requested blocks and make sure everything is formatted correctly."
        # response = openai.Completion.create(
        # engine="gpt-3.5-turbo",
        prompt=f"\nContext: {context}\nQuestion: {question}"
        # )
        # answer = response.choices[0].text.strip()
        # print(answer)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt},
            ]
        )
        
        print(response.choices[0].message.content)
        responses.append(response.choices[0].message.content)


2023-12-19 20:44:14,539 INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
{
  "questions": [
    "What are some problems with monolithic ML pipelines?",
    "What is the purpose of modular code in ML pipelines?",
    "What are some examples of feature engineering steps in a feature pipeline?"
  ],
  "answers": [
    "Some problems with monolithic ML pipelines include lack of modularity, difficulty in testing, tight coupling of pipeline steps, and limited code reuse.",
    "The purpose of modular code in ML pipelines is to make the functionality separated into independent classes or functions that can be easily reused, tested, and understood.",
    "Some examples of feature engineering steps in a feature pipeline are cleaning and validating data, data deduplication and wrangling, feature extraction and aggregations, feature binning, and feature crosses."
  ]
}
2023-12-19 20:44:17,524 INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions 

In [31]:
# Generate instruction set
import json
import pandas as pd

row_result = {"prompt":[],"questions":[],"answers":[]}

for i, r in enumerate(responses):
    try:
        tmp = json.loads(r)
        for j in range(len(tmp["questions"])):
            instr = f"<s> [INST] {tmp['questions'][j]} [/INST] {tmp['answers'][j]} </s>"
            row_result["questions"].append(tmp['questions'][j])
            row_result["answers"].append(tmp['answers'][j])
            row_result["prompt"].append(instr)
            # print(instr)
    except:
        pass

instructions = pd.DataFrame(row_result)
pd.set_option('display.max_colwidth', None)
for index, row in instructions.iterrows():
    print(row["prompt"])

# print(instructions.iloc[0]["Instructions"])


<s> [INST] What are some problems with monolithic ML pipelines? [/INST] Some problems with monolithic ML pipelines include lack of modularity, difficulty in testing, tight coupling of pipeline steps, and limited code reuse. </s>
<s> [INST] What is the purpose of modular code in ML pipelines? [/INST] The purpose of modular code in ML pipelines is to make the functionality separated into independent classes or functions that can be easily reused, tested, and understood. </s>
<s> [INST] What are some examples of feature engineering steps in a feature pipeline? [/INST] Some examples of feature engineering steps in a feature pipeline are cleaning and validating data, data deduplication and wrangling, feature extraction and aggregations, feature binning, and feature crosses. </s>
<s> [INST] What are the two options for computing features in a model training or batch inference pipeline? [/INST] The two options for computing features in a model training or batch inference pipeline are on-deman

In [16]:
import hopsworks

project = hopsworks.login()


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/314521


In [15]:
import pandas as pd

emb = {"source":[],"page":[],"content":[]}
for e in embeddings:
    for idx,t in enumerate(embeddings[e]["text"]):
        emb["source"].append(e)
        emb["page"].append(idx)
        emb["content"].append(t)


embedding_df = pd.DataFrame(emb)

print(embedding_df)
    
    
    

                   source  page  \
0    02_serverless_ml.pdf     0   
1    02_serverless_ml.pdf     1   
2    02_serverless_ml.pdf     2   
3    02_serverless_ml.pdf     3   
4    02_serverless_ml.pdf     4   
..                    ...   ...   
131   01_introduction.pdf    76   
132   01_introduction.pdf    77   
133   01_introduction.pdf    78   
134   01_introduction.pdf    79   
135   01_introduction.pdf    80   

                                               content  
0    Serverless Machine Learning\nJim Dowling\njdow...  
1                    Enterprise AI Value Chain\n1 / 54  
2    Modern Enterprise Data and ML Infrastructure\n...  
3                       Monolithic ML Pipeline\n3 / 54  
4    Problems with Monolithic ML Pipelines\n▶They a...  
..                                                 ...  
131  Negative Log-Likelihood\n▶Likelihood: L(θ|X) =...  
132  Cross-Entropy\n▶Coss-entropy: quantify the dif...  
133  Cross-Entropy - Example\n▶Six tosses of a coin...  
134  Refe

In [17]:
fs = project.get_feature_store()
emb_fg = fs.get_or_create_feature_group(
    name="embeddings",
    version=1,
    primary_key=list(embedding_df), 
    description="Content of each page of each file")
emb_fg.insert(embedding_df)

Connected. Call `.close()` to terminate connection gracefully.
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/314521/fs/314440/fg/349524


Uploading Dataframe: 0.00% |          | Rows 0/136 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/314521/jobs/named/embeddings_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x133fdf810>, None)

In [33]:
fs = project.get_feature_store()
instructions_fg = fs.get_or_create_feature_group(
    name="instructionset",
    version=4,
    primary_key=list(instructions), 
    description="Instruction Set for fine tuning of llms")
instructions_fg.insert(instructions)

Connected. Call `.close()` to terminate connection gracefully.
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/314521/fs/314440/fg/336255


Uploading Dataframe: 100.00% |██████████| Rows 32/32 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: instructionset_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/314521/jobs/named/instructionset_4_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x130aac890>, None)

In [28]:
#print(embedding_df['source'].isin(['02_serverless_ml.pdf','Python']))

fs = project.get_feature_store()

True