**This cell installs vertexai for extracting our data through an LLM, as well as PyPDF2 to read PDF files.**

In [None]:
!pip install vertexai
!pip install PyPDF2



**This cell reads our unstructured PDF data from our raw data warehouse (the raw folder of our GCS bucket), uses Gemini to extract it into a text file in JSON format, and uploads that text file back to our GCS bucket.**

In [None]:
#Import everything we need to read, extract, and store data
import json, os
import time
from PyPDF2 import PdfReader, PdfWriter
from pathlib import Path
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from google.cloud import storage
from google.cloud.storage import transfer_manager

#Initialize variables for GCP information, file path, and LLM info/prompt
#GCP project and region used to initialize Vertex AI
project_id = "group-5-448704"
region = "us-central1"
#Bucket that holds both the raw PDFs and the LLM-generated text files
bucket_name = "football_cs_project_1"
#Blob prefix the raw files are listed from
raw_folder = "initial-loads/2024_weekly_stats/raw/"
#Blob prefix the generated text files are uploaded under
llm_folder = "initial-loads/llm_text/"
#Local directory the generated text files are written to before upload
local_folder = "/content/out-txt/"
model_name = "gemini-2.0-flash-exp"
#Instruction sent to the model together with each PDF
prompt = "Convert the file to JSON format, then return the GAMES (W-L-T), FIRST DOWNS, Rushing, Passing, Penalty, YDS GAINED, Avg per Game, and Sacked"

#Copy our extracted data to our GCS bucket
def copy_to_GCS(local_folder, gcs_folder, file_extension):
  """Upload every local file matching a glob pattern to the GCS bucket.

  Args:
    local_folder: Local directory that is searched recursively.
    gcs_folder: Prefix prepended to each uploaded blob name.
    file_extension: Glob pattern handed to Path.rglob (for example "*.txt").

  The destination bucket is read from the module-level bucket_name.
  The outcome of each upload is printed; nothing is returned.
  """
  #Initialize storage client and bucket
  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_name)
  directory_as_path = Path(local_folder)
  #Collect matching files as paths relative to local_folder. Directories are
  #skipped because only files can be uploaded, and POSIX separators are used
  #because the relative path becomes part of the blob name.
  string_paths = sorted(
    path.relative_to(directory_as_path).as_posix()
    for path in directory_as_path.rglob(file_extension)
    if path.is_file()
  )
  #Report the count as well as the names (the old message printed the list where a count was expected)
  print("Found {} files: {}".format(len(string_paths), string_paths))
  #Nothing to upload, so avoid calling the transfer manager at all
  if not string_paths:
    return
  #Upload our local files to our GCS bucket in the given destination folder
  results = transfer_manager.upload_many_from_filenames(bucket, string_paths, source_directory = local_folder, blob_name_prefix = gcs_folder, max_workers = 5)
  #Failed uploads come back as exception objects in the results list rather than being raised
  for name, result in zip(string_paths, results):
    if isinstance(result, Exception):
      print("Failed to upload {} due to exception {}".format(name, result))
    else:
      print("Uploaded {} to {}.".format(name, bucket.name))

#Extract the data from our unstructured data (in a pdf) and convert it to a text file in JSON format
def extract():
  """Send each raw PDF in the bucket to Gemini and save the reply as a local text file.

  Lists the blobs under raw_folder in bucket_name, passes each one to the
  model together with the module-level prompt, strips the Markdown code
  fences from the reply and writes the remaining text under local_folder.
  Blobs whose output file already exists locally are skipped, so re-running
  the cell does not call the model again for them.
  """
  #Initialize instance of Vertex AI, Gemini, and Storage client
  vertexai.init(project = project_id, location = region)
  model = GenerativeModel(model_name)
  storage_client = storage.Client()
  #List every blob stored under the raw folder
  blobs = storage_client.list_blobs(bucket_name, prefix = raw_folder)
  #Iterate for each blob
  for blob in blobs:
    #Ignore folder placeholder blobs (the raw folder itself or any nested folder)
    if blob.name == raw_folder or blob.name.endswith("/"):
      continue
    #Map the blob name to a local path and replace pdf file extension with txt file extension
    filename = blob.name.replace(raw_folder, local_folder).replace(".pdf", ".txt")
    #Define path for our outputted text file
    out_path = Path(filename)
    #If file already exists, don't make a new one
    if out_path.exists():
      print(f"{filename} already exists")
      continue
    #Reference the PDF by its gs:// URI so it does not have to be downloaded first
    print(f"extracting {blob.name}")
    file_content = Part.from_uri(f"gs://{bucket_name}/{blob.name}", "application/pdf")
    #Generate response from Gemini using content of file and given prompt
    resp = model.generate_content([file_content, prompt])
    #Get rid of the Markdown code fences wrapped around the JSON in the response
    resp_str = str(resp.candidates[0].text).replace("```json", "").replace("```", "")
    print("got resp from LLM")
    #Create the output folder first; on a fresh runtime it does not exist yet and open() would fail
    out_path.parent.mkdir(parents = True, exist_ok = True)
    #Write LLM response into the new file; the with-block closes it even if the write fails
    with open(filename, "w") as out_file:
      out_file.write(resp_str)
    print("wrote file", filename)

#Main driver method for extracting, then copying data
if __name__ == "__main__":
  #First generate any missing text files locally
  extract()
  #Then upload them; "*.txt" (with the dot) matches only files carrying the .txt extension
  copy_to_GCS(local_folder, llm_folder, "*.txt")

/content/out-txt/nfl_weekly_stats.txt already exists
Found ['nfl_weekly_stats.txt'] files.
Uploaded nfl_weekly_stats.txt to football_cs_project_1.
