In [None]:
import os

# Point the Google client libraries at a service-account key file.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment; the machine-specific path below is only used
# as a fallback when nothing has been configured.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/drizal/parli/keys/bitly-ai-experiments-f0eacba17094.json",
)

# Cloud Storage bucket that the later cells read from and upload to.
BUCKET_NAME = "bitly-enterprise-search-docs"

In [None]:
# Install the client libraries imported by this notebook: Cloud Storage,
# Vertex AI (provides the `vertexai` package) and BeautifulSoup.
# NOTE(review): no versions are pinned, so a re-run may pick up newer releases.
%pip install --upgrade google-cloud-storage
%pip install google-cloud-aiplatform
%pip install bs4

In [None]:
# Connect to Cloud Storage and look up the bucket named by BUCKET_NAME.
from google.cloud import storage

# Client() picks up its credentials from the environment.
storage_client = storage.Client()

# `bucket` is reused by the upload cells further down.
bucket = storage_client.get_bucket(BUCKET_NAME)

print(bucket)

In [None]:
# Load the 'text-bison@latest' text-generation model. `model` is the object
# the smoke test and summarize_file call predict() on.
from vertexai.language_models import TextGenerationModel
model = TextGenerationModel.from_pretrained('text-bison@latest')

In [None]:
# Smoke test: send a trivial prompt and print the raw response object.
prompt = "Tell me a joke:  "
result = model.predict(prompt)
print(result)

In [None]:
# Create the output folder for the cleaned text files. os.makedirs also
# creates the parent "prepared_data" folder, and exist_ok=True makes the cell
# safe to re-run once the folders already exist.
os.makedirs("prepared_data/clean", exist_ok=True)

In [None]:
# Walk through the current directory and its subdirectories,
# collect all .md files and strip any HTML markup they contain,
# then save the cleaned text to a file in the prepared_data/clean directory.

import glob

from bs4 import BeautifulSoup

# Collect every Markdown file below the current directory, at any depth.
md_files = glob.glob("**/*.md", recursive=True)


def clean_text(text):
    """Return `text` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and only
    the text content of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
print("Cleaning up text...")
for md_file in md_files:
    # Read the raw Markdown source.
    with open(md_file, "r") as f:
        text = f.read()
    # Strip any embedded HTML.
    text = clean_text(text)
    print(text)
    # Mirror the source path under prepared_data/clean, changing only the
    # file extension. os.path.splitext touches the final suffix only, whereas
    # str.replace(".md", ".txt") would also rewrite ".md" inside a folder or
    # file name (e.g. "notes.md_archive/readme.md").
    clean_file = os.path.splitext(md_file)[0] + ".txt"
    clean_file = os.path.join("prepared_data/clean", clean_file)
    # Create the destination folder; exist_ok=True makes re-runs harmless.
    os.makedirs(os.path.dirname(clean_file), exist_ok=True)
    # Save the cleaned text.
    with open(clean_file, "w") as f:
        f.write(text)

In [None]:
# Create the folder that receives the generated summaries. exist_ok=True
# makes the cell safe to re-run, and missing parent folders are created too.
os.makedirs("prepared_data/summary_short", exist_ok=True)

In [None]:
# List the cleaned files. glob returns paths in arbitrary, filesystem-dependent
# order, so sort them: this cell and the thread-pool cell below select files
# by position (clean_files[15], clean_files[340:]) and need a stable order.
clean_files = sorted(glob.glob("prepared_data/clean/**/*.txt", recursive=True))

# Load one document as sample input for trying out the summary prompt.
# The guard keeps the cell usable when fewer than 16 files were produced.
text = ""
if len(clean_files) > 15:
    with open(clean_files[15], "r") as f:
        text = f.read()
    print(text)
else:
    print(f"Only {len(clean_files)} cleaned files found; no sample at index 15.")

In [None]:
# Prompt template for document summaries. The {text} placeholder is filled in
# with str.format(text=...) by summarize_file before the prompt is sent.
prompt_summary = """
Objective: Summarize an internal resource document tailored for developers at Bitly, ensuring brevity and retention of all critical information.

Task:
Generate a comprehensive yet succinct summary of the following document text, ensuring all vital information is retained for effective utilization in vector DB and LLM operations.

Document Text: 
{text}

Summary:
"""

In [None]:
# Try the summary prompt on the sample document held in `text`. The template
# must be filled in with str.format first; sending `prompt` here would resend
# the "Tell me a joke" smoke-test prompt instead of a summary request.
result = model.predict(prompt_summary.format(text=text))
print(result)

In [None]:
# Generation settings that summarize_file passes to model.predict().
# top_p / top_k are left commented out, so they are not sent at all.
parameters = {
    "temperature": 0,  # Temperature controls the degree of randomness in token selection.
    "max_output_tokens": 2000,  # Token limit determines the maximum amount of text output.
    # "top_p": 0.8,  # Tokens are selected from most probable to least until the sum of their probabilities equals the top_p value.
    # "top_k": 40,  # A top_k of 1 means the selected token is the most probable among all tokens.
}

In [None]:
def summarize_file(clean_file):
    """Summarize one cleaned text file with the model and save the result.

    The file's text is inserted into the module-level `prompt_summary`
    template and sent to `model.predict` with the module-level `parameters`.
    The returned text is written to the path obtained by replacing "clean"
    with "summary_short" in `clean_file`.

    Parameters:
        clean_file (str): Path of a cleaned text file, such as an entry of
            `clean_files`.

    Returns:
        None. The summary is written to disk and a progress line is printed.
    """
    # Read the document to summarize.
    with open(clean_file, "r") as f:
        text = f.read()

    # Build the prompt and query the model.
    formatted_prompt = prompt_summary.format(text=text)
    summary = model.predict(formatted_prompt, **parameters)

    # NOTE(review): str.replace rewrites every "clean" in the path, not only
    # the directory name. add_summary_to_file derives the summary path the
    # same way, so both must be changed together if this is ever tightened.
    summary_file = clean_file.replace("clean", "summary_short")

    # This function is run on a thread pool, so several workers can reach this
    # point for the same new directory at once. exist_ok=True avoids the
    # FileExistsError an exists()-then-makedirs() check raises when another
    # worker creates the directory in between.
    os.makedirs(os.path.dirname(summary_file), exist_ok=True)

    # Save the summary text.
    with open(summary_file, "w") as f:
        f.write(summary.text)
    print("Completed summarizing file: ", clean_file)



In [None]:
# Show how many cleaned files were found.
print(len(clean_files))

In [None]:
import concurrent.futures

# Summarize files in parallel on up to 20 worker threads. list() drains the
# map so the cell only finishes once every file has been processed (and
# re-raises the first exception a worker hit, if any).
# NOTE(review): the [340:] slice skips the first 340 entries of clean_files —
# presumably a resume offset from an interrupted run; pass clean_files to
# process everything.
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    results = list(executor.map(summarize_file, clean_files[340:]))


In [None]:
# Prepend each document's generated summary to its cleaned text and store the
# combined text under prepared_data/clean_plus_summary.

def add_summary_to_file(clean_file):
    """Write a copy of `clean_file` that starts with its saved summary.

    The summary is read from the path obtained by replacing "clean" with
    "summary_short" in `clean_file`; the combined text is written to the path
    obtained by replacing "clean" with "clean_plus_summary". When the summary
    file is missing, an error line is printed and nothing is written.

    Parameters:
        clean_file (str): Path of a cleaned text file.

    Returns:
        None.
    """
    summary_path = clean_file.replace("clean", "summary_short")
    combined_path = clean_file.replace("clean", "clean_plus_summary")

    # The document itself is read first, so a missing source file raises
    # before the summary is looked at.
    with open(clean_file, "r") as source:
        body = source.read()

    # Guard clause: without a summary there is nothing to combine.
    if not os.path.exists(summary_path):
        print("Error: Summary file does not exist: ", summary_path)
        return

    with open(summary_path, "r") as source:
        summary = source.read()

    # Summary block first, original text after it.
    combined = "".join(
        ["---------- Summary: \n", summary, "\n----------End Summary \n\n", body]
    )

    # Make sure the destination folder exists before writing.
    target_dir = os.path.dirname(combined_path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    with open(combined_path, "w") as target:
        target.write(combined)
    print("Completed adding summary to file: ", clean_file)

In [None]:
# For each file in the clean directory, prepend its summary and save the
# result to the clean_plus_summary directory. The whole list is processed;
# to work on a subset, use an explicit slice such as clean_files[:10]
# (an empty subscript, clean_files[], is not valid Python).

print("Adding summary to files...")
for clean_file in clean_files:
    add_summary_to_file(clean_file)
    

## Upload the prepared files to a new folder (object prefix) in the bucket

In [None]:
# Connect to Cloud Storage again and look up the bucket named by BUCKET_NAME.
# This repeats the connection made near the top of the notebook, so the
# upload section can be run without re-running that cell.
from google.cloud import storage

storage_client = storage.Client()

bucket = storage_client.get_bucket(BUCKET_NAME)

print(bucket)

In [None]:
# Upload every file under prepared_data/clean_plus_summary to the bucket,
# keeping each file's relative path as part of the object name.
# NOTE(review): get_gcs_uri writes flattened names ("/" replaced by "_") into
# metadata.jsonl, so the nested object names created here are not the ones
# the metadata refers to.

import glob
from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)
print(bucket)

# Local files to upload, found recursively.
clean_plus_summary_files = glob.glob("prepared_data/clean_plus_summary/**/*.txt", recursive=True)

for clean_plus_summary_file in clean_plus_summary_files:
    # Object name = fixed prefix + path relative to the local folder.
    file_name = clean_plus_summary_file.replace("prepared_data/clean_plus_summary/", "")
    blob_file_path = f"prepared_data_clean_plus_summary/{file_name}"

    print("Uploading file: ", clean_plus_summary_file, " to blob: ", blob_file_path)

    # Create the blob handle under that name and send the file contents.
    blob = bucket.blob(blob_file_path)
    response = blob.upload_from_filename(clean_plus_summary_file)


https://console.cloud.google.com/storage/browser/bitly-enterprise-search-docs;tab=objects?forceOnBucketsSortingFiltering=true&project=bitly-ai-experiments&prefix=&forceOnObjectsSortingFiltering=false


In [None]:
import os
import concurrent.futures
import time
import json

# Load the text-generation model again so this section can run on its own.
from vertexai.language_models import TextGenerationModel
model = TextGenerationModel.from_pretrained('text-bison@latest')

# Generation settings for the title/description requests. This rebinds the
# `parameters` name used by the summary cells, with max_output_tokens
# lowered from 2000 to 500.
parameters = {
    "temperature": 0,
    "max_output_tokens": 500,
}

# Prompt template, filled in with str.format(text=...). The doubled braces
# become literal "{" / "}" after formatting. The formatted prompt ends with an
# opening "{", and extract_and_save_title_description puts a "{" back in front
# of the reply before parsing it as JSON.
# This also rebinds `prompt`, which earlier held the joke smoke-test prompt.
prompt = """
Objective: Generate a fitting title and description for the following document text. Description should be either 1 or 2 sentences long.
Response format should be JSON in this format: {{"title": "title", "description": "description"}}
Document Text: 
{text}

Response JSON: {{

"""


def extract_and_save_title_description(clean_file):
    """Ask the model for a title and description of a file and save both.

    The file's text is inserted into the module-level `prompt` template and
    sent to `model.predict` with the module-level `parameters`. The reply is
    parsed as JSON; its "title" and "description" values are written to the
    paths obtained by replacing "clean_plus_summary" in `clean_file` with
    "title" and "description" respectively.

    Parameters:
        clean_file (str): Path of a file under the clean_plus_summary folder.

    Returns:
        None. When the reply cannot be parsed, or lacks one of the two
        fields, an error line is printed and nothing is written.
    """
    with open(clean_file, "r") as f:
        text = f.read()

    # Query the model.
    formatted_prompt = prompt.format(text=text)
    response = model.predict(formatted_prompt, **parameters).text

    # The prompt already ends with "{", so the reply normally continues from
    # there and the brace has to be put back. Only add it when the model did
    # not repeat it; otherwise the text would start with "{{" and never parse.
    if not response.lstrip().startswith("{"):
        response = "{" + response

    try:
        parsed = json.loads(response)
        title = parsed["title"]
        description = parsed["description"]
    except (ValueError, KeyError, TypeError):
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        # KeyError: a field is missing. TypeError: the reply is valid JSON but
        # not an object. Anything else (e.g. KeyboardInterrupt) propagates.
        print("Error: Failed to parse response as JSON: ", response)
        return

    # Save the title. exist_ok=True matters because this function runs on a
    # thread pool: two workers may try to create the same folder at once.
    title_file = clean_file.replace("clean_plus_summary", "title")
    os.makedirs(os.path.dirname(title_file), exist_ok=True)
    with open(title_file, "w") as f:
        f.write(title)

    # Save the description.
    description_file = clean_file.replace("clean_plus_summary", "description")
    os.makedirs(os.path.dirname(description_file), exist_ok=True)
    with open(description_file, "w") as f:
        f.write(description)

    print(f"Completed extracting title and description for: {clean_file}")

    # Pause for 10 seconds after each successfully processed file to avoid
    # rate limiting.
    time.sleep(10)



In [None]:
import glob

# Files that already have their summary prepended (written by add_summary_to_file).
clean_plus_summary_files = glob.glob("prepared_data/clean_plus_summary/**/*.txt", recursive=True)

# Trial run on the first file only; raises IndexError when the folder is empty.
extract_and_save_title_description(clean_plus_summary_files[0])

In [None]:
# Re-scan the clean_plus_summary folder for the files to process.
clean_plus_summary_files = glob.glob("prepared_data/clean_plus_summary/**/*.txt", recursive=True)

# Run the extraction on up to 8 worker threads.
# NOTE(review): only list positions 260-299 are processed here — presumably
# one batch of a manually stepped run; confirm the slice before reusing.
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(extract_and_save_title_description, clean_plus_summary_files[260:300]))

In [None]:
# Find the clean_plus_summary files that still lack a title file or a
# description file, so that only those are sent to the model again.

import glob

# Get all files in the clean_plus_summary directory
clean_plus_summary_files = glob.glob("prepared_data/clean_plus_summary/**/*.txt", recursive=True)

outstanding_files = []

for clean_plus_summary_file in clean_plus_summary_files:
    # Replace 'clean_plus_summary' in the filename with 'title' and 'description' to get the title and description filenames.
    title_file = clean_plus_summary_file.replace("clean_plus_summary", "title")
    description_file = clean_plus_summary_file.replace("clean_plus_summary", "description")

    # Both outputs are required: generate_metadata falls back to the file name
    # when either one cannot be read, so a file with only a title is still
    # outstanding.
    if not (os.path.exists(title_file) and os.path.exists(description_file)):
        outstanding_files.append(clean_plus_summary_file)

print (len(outstanding_files))


In [None]:
# Retry the title/description extraction for the files collected in
# outstanding_files, on up to 8 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(extract_and_save_title_description, outstanding_files))

In [None]:
import json
import glob

def generate_metadata(file_name):
    """
    Generate metadata for a given file name by reading previously generated title
    and description from the filesystem.

    The title and description paths are obtained by replacing
    "clean_plus_summary" in `file_name` with "title" and "description".

    Parameters:
        file_name (str): Path of the clean_plus_summary file for which metadata is generated.

    Returns:
        dict: {"title": ..., "description": ...}. When either file is missing
        or cannot be read, both values are set to `file_name`.
    """
    title_file = file_name.replace("clean_plus_summary", "title")
    description_file = file_name.replace("clean_plus_summary", "description")

    try:
        # Read the title and description, dropping surrounding whitespace.
        with open(title_file, "r") as f:
            title = f.read().strip()
        with open(description_file, "r") as f:
            description = f.read().strip()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable file: fall back to the file name for both
        # fields. Only I/O and decoding failures are caught, so unrelated
        # errors (including KeyboardInterrupt) are no longer swallowed.
        return {
            "title": file_name,
            "description": file_name,
        }

    return {
        "title": title,
        "description": description,
    }


def get_gcs_uri(file_name, gcs_bucket_name):
    """
    Build the gs:// URI for a file stored under the
    "prepared_data_clean_plus_summary/" prefix of a bucket.

    Every "/" in `file_name` is turned into "_", so the object name is flat.

    Parameters:
        file_name (str): Path of the file relative to the local clean_plus_summary folder.
        gcs_bucket_name (str): The name of the Google Cloud Storage bucket.

    Returns:
        str: The GCS URI for the file.
    """
    flat_name = "_".join(file_name.split("/"))
    return f"gs://{gcs_bucket_name}/prepared_data_clean_plus_summary/{flat_name}"

def create_metadata_jsonl(clean_plus_summary_files, gcs_bucket_name, output_filename='metadata.jsonl'):
    """
    Write one JSON line of metadata per file in `clean_plus_summary_files`.

    Each line holds an "id" ("doc-<position in the list>"), the "structData"
    returned by generate_metadata, and a "content" object with the MIME type
    "text/plain" and the file's GCS URI.

    Parameters:
        clean_plus_summary_files (list of str): List of file paths to be processed.
        gcs_bucket_name (str): The name of the Google Cloud Storage bucket.
        output_filename (str): The name of the output JSONL file.
    """
    local_prefix = "prepared_data/clean_plus_summary/"

    with open(output_filename, 'w') as jsonl_file:
        for index, source_path in enumerate(clean_plus_summary_files):
            # The URI is built from the path relative to the local folder.
            relative_name = source_path.replace(local_prefix, "")
            uri = get_gcs_uri(relative_name, gcs_bucket_name)

            record = {
                "id": f"doc-{index}",
                "structData": generate_metadata(source_path),
                "content": {
                    "mimeType": "text/plain",  # Adjust MIME type if needed
                    "uri": uri,
                },
            }

            # One JSON object per line.
            jsonl_file.write(json.dumps(record) + '\n')

# Build metadata.jsonl for every file currently in clean_plus_summary.
# NOTE(review): document ids are "doc-<position in this list>", and glob does
# not guarantee a stable order, so ids may differ between runs.

# Get all files in the clean_plus_summary directory
clean_plus_summary_files = glob.glob("prepared_data/clean_plus_summary/**/*.txt", recursive=True)

# Create the JSONL file
create_metadata_jsonl(clean_plus_summary_files, BUCKET_NAME)

In [None]:
# Upload every file in prepared_data/clean_plus_summary to the bucket, all
# under one prefix: the "/" separators of the relative path are replaced with
# "_" so that each object name matches the URI that get_gcs_uri writes into
# metadata.jsonl.
# Relies on `bucket` having been created by an earlier cell.

import glob

# Get all files in the clean_plus_summary directory
clean_plus_summary_files = glob.glob("prepared_data/clean_plus_summary/**/*.txt", recursive=True)

for clean_plus_summary_file in clean_plus_summary_files:
    # Path relative to the local folder ...
    file_name = clean_plus_summary_file.replace("prepared_data/clean_plus_summary/", "")

    # ... flattened into a single name, as the header comment and
    # get_gcs_uri both require.
    file_name = file_name.replace("/", "_")

    # Object name = fixed prefix + flattened file name.
    blob_file_path = "prepared_data_clean_plus_summary/" + file_name

    print("Uploading file: ", clean_plus_summary_file, " to blob: ", blob_file_path)

    # Create the blob handle under that name and send the file contents.
    blob = bucket.blob(blob_file_path)
    blob.upload_from_filename(clean_plus_summary_file)

In [None]:
# Upload every file in prepared_data/clean_plus_summary to the bucket under a
# single prefix, with "/" in the relative path replaced by "_".

import glob
from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)

# Local files to upload, found recursively.
clean_plus_summary_files = glob.glob("prepared_data/clean_plus_summary/**/*.txt", recursive=True)

for clean_plus_summary_file in clean_plus_summary_files:
    # Relative path with the separators flattened into underscores.
    file_name = (
        clean_plus_summary_file
        .replace("prepared_data/clean_plus_summary/", "")
        .replace("/", "_")
    )

    # Object name = fixed prefix + flattened file name.
    blob_file_path = f"prepared_data_clean_plus_summary/{file_name}"

    print("Uploading file: ", clean_plus_summary_file, " to blob: ", blob_file_path)

    # Create the blob handle under that name and send the file contents.
    blob = bucket.blob(blob_file_path)
    blob.upload_from_filename(clean_plus_summary_file)


In [54]:
# Upload metadata.jsonl to the same prefix as the documents
# (prepared_data_clean_plus_summary/). Relies on `bucket` from an earlier cell
# and on metadata.jsonl existing in the current working directory.

# Object name of the metadata file inside the bucket.
blob_file_path = "prepared_data_clean_plus_summary/metadata.jsonl"

print("Uploading file: ", "metadata.jsonl", " to blob: ", blob_file_path)

# create a new blob object and upload the file
blob = bucket.blob(blob_file_path)  # create a blob object with the right name
blob.upload_from_filename("metadata.jsonl")

Uploading file:  metadata.jsonl  to blob:  prepared_data_clean_plus_summary/metadata.jsonl
