<a href="https://colab.research.google.com/github/nradich/A_Streaming_Analysis/blob/collab_test/StreamingPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Step 1)**

 - Shifting the project to be focused on books as opposed to streaming service data, as a book API is more accessible.

 - The API call requires a q parameter, could play aound with the LLM generating a list of 10 adjectives or nouns to generate book lookup



In [9]:
import json
import requests
import pandas as pd
from googleapiclient.discovery import build
from google.colab import userdata
userdata.get('book_api_key')

# --- Configuration ---
# Replace 'YOUR_API_KEY_HERE' with your actual Google Books API key.
# You can obtain this from the Google Cloud Console.
API_KEY = userdata.get('book_api_key')

API_SERVICE_NAME = 'books'
API_VERSION = 'v1'

def get_books_service(api_key):
    """
    Builds and returns the Google Books API service client.
    """
    return build(API_SERVICE_NAME, API_VERSION, developerKey=api_key)

def search_recent_books(service, query, language='en', max_results_to_fetch=400):
    """
    Searches for books with a specific query, orders by newest,
    restricting the results to a specified language.
    Uses pagination to fetch up to `max_results_to_fetch` books.

    Returns the results as a list of dictionaries.
    """
    all_books = []
    start_index = 0
    max_results_per_call = 40  # Maximum allowed per call

    print(f"\nSearching for up to {max_results_to_fetch} recent books with query '{query}' in language '{language}'...")

    try:
        while len(all_books) < max_results_to_fetch:
            print(f"Fetching results from index {start_index}...")

            results = service.volumes().list(
                q=query,
                orderBy='newest',
                langRestrict=language,
                startIndex=start_index,  # Use pagination
                maxResults=max_results_per_call
            ).execute()

            books_list = results.get('items', [])

            # If no books were returned, we have reached the end
            if not books_list:
                break

            all_books.extend(books_list)

            # Update the start index for the next page
            start_index += max_results_per_call

        return all_books

    except Exception as e:
        print(f"An error occurred during the API call: {e}")
        return None

def convert_to_dataframe(books_list, query_term):
    """
    Converts a list of book dictionaries from the API into a pandas DataFrame,
    adding a column for the search query used.
    """
    # Create an empty list to store the processed book data
    processed_books = []

    for book in books_list:
        volume_info = book.get('volumeInfo', {})

        # Extract the necessary fields
        book_id = book.get('id', 'N/A')
        title = volume_info.get('title', 'N/A')
        subtitle = volume_info.get('subtitle', 'N/A')
        authors = ', '.join(volume_info.get('authors', ['N/A']))
        publisher = volume_info.get('publisher', 'N/A')
        published_date = volume_info.get('publishedDate', 'N/A')
        description = volume_info.get('description', 'N/A')
        page_count = volume_info.get('pageCount', 'N/A')

        # Extract and format the categories
        categories = ', '.join(volume_info.get('categories', ['N/A']))

        # Create a dictionary for the current book and append to the list
        processed_books.append({
            'Query_Term': query_term, # New column for the search query
            'ID': book_id,
            'Title': title,
            'Subtitle': subtitle,
            'Authors': authors,
            'Publisher': publisher,
            'Published_Date': published_date,
            'Description': description,
            'Page_Count': page_count,
            'Categories': categories
        })

    # Create the DataFrame from the list of dictionaries
    df = pd.DataFrame(processed_books)
    return df

def main():
    """
    Main function to run the script.
    """
    if API_KEY == 'YOUR_API_KEY_HERE':
        print("Please replace 'YOUR_API_KEY_HERE' with your actual API key.")
        return None

    books_service = get_books_service(API_KEY)

    # Search for recent books in English (e.g., from the year 2025)
    # The script will now attempt to fetch up to 120 books.
    # Define your search queries here.
    # The LLM can generate this list for you.
    search_queries = ['fiction', 'fantasy', 'adventure', 'thriller', 'Croatia']
    all_new_books = []

    for query_term in search_queries:
      print(f"\nSearching for books with query: '{query_term}'")
      books_list = search_recent_books(books_service, query=query_term, language='en', max_results_to_fetch=40)

    if books_list:
        # Convert the list of book data to a DataFrame
        books_df = convert_to_dataframe(books_list,query_term )

        print(f"\n--- Retrieved {len(books_df)} books ---")
        print("\n--- Books DataFrame ---")
        print(books_df)
        print("\n--- DataFrame Info ---")
        books_df.info()

        # Return the DataFrame so it can be used for further analysis
        return books_df

    else:
        print("\nNo books data was returned from the API.")
        return None

if __name__ == "__main__":
    books_df = main()


Searching for books with query: 'fiction'

Searching for up to 40 recent books with query 'fiction' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'fantasy'

Searching for up to 40 recent books with query 'fantasy' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'adventure'

Searching for up to 40 recent books with query 'adventure' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'thriller'

Searching for up to 40 recent books with query 'thriller' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'Croatia'

Searching for up to 40 recent books with query 'Croatia' in language 'en'...
Fetching results from index 0...

--- Retrieved 40 books ---

--- Books DataFrame ---
   Query_Term            ID  \
0     Croatia  cHpDTbdQFkQC   
1     Croatia  UxSnm-mUp40C   
2     Croatia  sfcpsAoSoewC   
3     Croatia  HJexhW3C0TIC   
4     Croatia  J

In [12]:
import json
import requests
import pandas as pd
from googleapiclient.discovery import build
from google.colab import userdata
userdata.get('book_api_key')

# --- Configuration ---
API_KEY = userdata.get('book_api_key')
API_SERVICE_NAME = 'books'
API_VERSION = 'v1'

def get_books_service(api_key):
    """
    Builds and returns the Google Books API service client.
    """
    return build(API_SERVICE_NAME, API_VERSION, developerKey=api_key)

def search_recent_books(service, query, language='en', max_results_to_fetch=400):
    """
    Searches for books with a specific query, orders by newest,
    restricting the results to a specified language.
    Uses pagination to fetch up to `max_results_to_fetch` books.

    Returns the results as a list of dictionaries.
    """
    all_books = []
    start_index = 0
    max_results_per_call = 40  # Maximum allowed per call

    print(f"\nSearching for up to {max_results_to_fetch} recent books with query '{query}' in language '{language}'...")

    try:
        while len(all_books) < max_results_to_fetch:
            print(f"Fetching results from index {start_index}...")

            results = service.volumes().list(
                q=query,
                orderBy='newest',
                langRestrict=language,
                startIndex=start_index,
                maxResults=max_results_per_call
            ).execute()

            books_list = results.get('items', [])

            # If no books were returned, we have reached the end
            if not books_list:
                break

            all_books.extend(books_list)

            # Update the start index for the next page
            start_index += max_results_per_call

        return all_books

    except Exception as e:
        print(f"An error occurred during the API call: {e}")
        return None

def convert_to_dataframe(books_list):
    """
    Converts a list of book dictionaries from the API into a pandas DataFrame.
    """
    # Create an empty list to store the processed book data
    processed_books = []

    for book_data in books_list:
        book = book_data['book']
        query_term = book_data['query_term']

        volume_info = book.get('volumeInfo', {})

        # Extract the necessary fields
        book_id = book.get('id', 'N/A')
        title = volume_info.get('title', 'N/A')
        subtitle = volume_info.get('subtitle', 'N/A')
        authors = ', '.join(volume_info.get('authors', ['N/A']))
        publisher = volume_info.get('publisher', 'N/A')
        published_date = volume_info.get('publishedDate', 'N/A')
        description = volume_info.get('description', 'N/A')
        page_count = volume_info.get('pageCount', 'N/A')
        categories = ', '.join(volume_info.get('categories', ['N/A']))

        # Create a dictionary for the current book and append to the list
        processed_books.append({
            'Query_Term': query_term,
            'ID': book_id,
            'Title': title,
            'Subtitle': subtitle,
            'Authors': authors,
            'Publisher': publisher,
            'Published_Date': published_date,
            'Description': description,
            'Page_Count': page_count,
            'Categories': categories
        })

    # Create the DataFrame from the list of dictionaries
    df = pd.DataFrame(processed_books)
    return df

def main():
    """
    Main function to run the script.
    """
    if API_KEY == 'YOUR_API_KEY_HERE':
        print("Please replace 'YOUR_API_KEY_HERE' with your actual API key.")
        return None

    books_service = get_books_service(API_KEY)
    search_queries = ['fiction', 'fantasy', 'adventure', 'thriller', 'Croatia']
    all_books_data = []

    for query_term in search_queries:
        print(f"\nSearching for books with query: '{query_term}'")
        books_list = search_recent_books(books_service, query=query_term, language='en', max_results_to_fetch=10)

        if books_list:
            # Store the book data along with the query term
            for book in books_list:
                all_books_data.append({'book': book, 'query_term': query_term})

    if all_books_data:
        # Convert the list of all book data to a DataFrame once
        books_df = convert_to_dataframe(all_books_data)

        print(f"\n--- Retrieved {len(books_df)} books ---")
        print("\n--- Books DataFrame ---")
        print(books_df)
        print("\n--- DataFrame Info ---")
        books_df.info()

        return books_df
    else:
        print("\nNo books data was returned from the API.")
        return None

if __name__ == "__main__":
    books_df = main()


Searching for books with query: 'fiction'

Searching for up to 10 recent books with query 'fiction' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'fantasy'

Searching for up to 10 recent books with query 'fantasy' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'adventure'

Searching for up to 10 recent books with query 'adventure' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'thriller'

Searching for up to 10 recent books with query 'thriller' in language 'en'...
Fetching results from index 0...

Searching for books with query: 'Croatia'

Searching for up to 10 recent books with query 'Croatia' in language 'en'...
Fetching results from index 0...

--- Retrieved 200 books ---

--- Books DataFrame ---
    Query_Term            ID  \
0      fiction  O4maBAAAQBAJ   
1      fiction  pqIJEAAAQBAJ   
2      fiction  yv11BgAAQBAJ   
3      fiction  J4QUEAAAQBAJ   
4      fic

In [16]:
query_counts = books_df.groupby('Query_Term')['ID'].count()
query_counts

Unnamed: 0_level_0,ID
Query_Term,Unnamed: 1_level_1
Croatia,40
adventure,40
fantasy,40
fiction,40
thriller,40


In [7]:
"#books_df_28_rows = books_df
books_df_28_rows

Unnamed: 0,ID,Title,Subtitle,Authors,Publisher,Published_Date,Description,Page_Count,Categories
0,SC4MEAAAQBAJ,Book Traces,Nineteenth-Century Readers and the Future of t...,Andrew M. Stauffer,University of Pennsylvania Press,2021-02-05,"In most college and university libraries, mate...",224.0,Literary Criticism
1,G1ePDwAAQBAJ,Standards for Internal Control in the Federal ...,,United States Government Accountability Office,Lulu.com,2019-03-24,Policymakers and program managers are continua...,88.0,Reference
2,j9M7EAAAQBAJ,Win Me Something,,Kyle Lucia Wu,Tin House Books,2021-11-02,"A NPR, Electric Lit, and Entropy Best Book of ...",241.0,Fiction
3,bU0lAQAAIAAJ,The World Book Encyclopedia,,,,1989,An encyclopedia designed to meet the needs of ...,554.0,Encyclopedias and dictionaries
4,-5GoQgAACAAJ,Essentials of Glycobiology,,Ajit Varki,,1999-01-01,Glycobiology has its roots in the nineteenth c...,653.0,Science
5,vlUoYAAACAAJ,Blood Groups and Red Cell Antigens,,Laura Dean,,2005,,,Blood group antigens
6,uIqsEAAAQBAJ,One Italian Summer,A Novel,Rebecca Serle,Simon and Schuster,2023-03-07,"""A moving and unforgettable exploration of the...",288.0,Fiction
7,Ro7zAgAAQBAJ,From Dissertation to Book,,William Germano,University of Chicago Press,2014-02-27,How to transform a thesis into a publishable w...,178.0,Language Arts & Disciplines
8,paeQnQEACAAJ,Webvision,The Organization of the Retina and Visual System,"Helga Kolb, Eduardo Fernandez, Ralph Nelson",,2007,,,
9,3WE1HD5AUDAC,The Tibetan Book Of Living And Dying,A Spiritual Classic from One of the Foremost I...,Sogyal Rinpoche,Random House,2012-02-29,Over 3 million copies sold ‘I couldn’t give th...,468.0,"Body, Mind & Spirit"


In [2]:
#adds column for the ingested at time
books_df['ingested_at'] =  pd.Timestamp.now(tz='America/Los_Angeles')

In [5]:
from datetime import datetime
import pytz
# Get the current datetime in the 'America/Los_Angeles' timezone
pst_datetime = datetime.now(pytz.timezone('America/Los_Angeles'))
# Format the date as a string
today = pst_datetime.strftime('%Y-%m-%d')
# Dynamically create the filename with the PST date
DATA_FILE = f'books_2025_data_{today}.csv'
books_df

Unnamed: 0,ID,Title,Subtitle,Authors,Publisher,Published_Date,Description,Page_Count,Categories,ingested_at
0,Y3cWEQAAQBAJ,Project 2025:,The BluePrint: Everything You Need To Know Abo...,John Madison,A.W Publishing,2024-07-27,Project 2025: Democracy at Risk: Trump and the...,157,Biography & Autobiography,2025-08-07 14:16:15.258476-07:00
1,YgY4K2lPDNYC,2025,Scenarios of U.S. and Global Society Reshaped ...,"Joseph Francis Coates, John B. Mahaffie, Andy ...",OakHill Press,1996,"Tapping the worlds of science and technology, ...",536,Science,2025-08-07 14:16:15.258476-07:00
2,A7Dc0AEACAAJ,Project 2025,A Hope for All Americans Comes 2025,Malachi Muller,Independently Published,2024-08-17,What if the future of America is being shaped ...,0,Political Science,2025-08-07 14:16:15.258476-07:00
3,uL0uzgEACAAJ,Zeitgeist 2025,Countdown to the Secret Destiny of America? th...,Thomas R. Horn,Defender,2021-07-15,Before and after the presidency of Donald Trum...,,,2025-08-07 14:16:15.258476-07:00
4,JWcpEQAAQBAJ,HOROSCOPE 2025,,Alina Rubí,Alina A Rubi,2024-10-13,"""Horoscope 2025"" gives you the precise informa...",222,"Body, Mind & Spirit",2025-08-07 14:16:15.258476-07:00
...,...,...,...,...,...,...,...,...,...,...
195,8SwLzgEACAAJ,2021-2025 Five Year Planner,2021-2025 Monthly Planner 5 Years Organizer | ...,Evalzoe P. Stella,,2020-10-25,Take control of your life today with this beau...,150,,2025-08-07 14:16:15.258476-07:00
196,IffYzQEACAAJ,2021-2025 Five Year Planner,Watercolor Flowers Cover Monthly and Weekly Pl...,Emma Olivia Family,,2020-08-18,2021-2025 Planner | Emma Olivia Family. Our ne...,135,,2025-08-07 14:16:15.258476-07:00
197,a6InzgEACAAJ,2021-2025 Five Year Planner,2021-2025 Monthly Planner 5 Years Organizer | ...,Evaneio Creations,,2020-10-25,Take control of your life today with this beau...,150,,2025-08-07 14:16:15.258476-07:00
198,h8MozgEACAAJ,2021-2025 Monthly Planner,Grey Marble 5 Year Motivational Organizer and ...,Simple Planners,,2020-09-27,Promotional Limited Time Offer 2021-2025 Month...,149,,2025-08-07 14:16:15.258476-07:00


In [6]:
from google.colab import drive
import pandas as pd # Make sure pandas is imported
from datetime import date
# 1. Mount your Google Drive
drive.mount('/content/drive')

# 2. Define the path and filename within your Drive
# Replace 'Your_Folder' with the name of the folder you want to save to.
file_path = f'/content/drive/My Drive/AIAnalysis/{DATA_FILE}.csv'



# 3. Save the DataFrame to the specified path
books_df.to_csv(file_path, index=False)

Mounted at /content/drive


In [24]:
df = pd.read_csv(file_path)
df

Unnamed: 0,ID,Title,Subtitle,Authors,Publisher,Published_Date,Description,Page_Count,Categories,ingested_at
0,Y3cWEQAAQBAJ,Project 2025:,The BluePrint: Everything You Need To Know Abo...,John Madison,A.W Publishing,2024-07-27,Project 2025: Democracy at Risk: Trump and the...,157.0,Biography & Autobiography,2025-08-06 17:10:38.934987-07:00
1,YgY4K2lPDNYC,2025,Scenarios of U.S. and Global Society Reshaped ...,"Joseph Francis Coates, John B. Mahaffie, Andy ...",OakHill Press,1996,"Tapping the worlds of science and technology, ...",536.0,Science,2025-08-06 17:10:38.934987-07:00
2,A7Dc0AEACAAJ,Project 2025,A Hope for All Americans Comes 2025,Malachi Muller,Independently Published,2024-08-17,What if the future of America is being shaped ...,0.0,Political Science,2025-08-06 17:10:38.934987-07:00
3,uL0uzgEACAAJ,Zeitgeist 2025,Countdown to the Secret Destiny of America? th...,Thomas R. Horn,Defender,2021-07-15,Before and after the presidency of Donald Trum...,,,2025-08-06 17:10:38.934987-07:00
4,JWcpEQAAQBAJ,HOROSCOPE 2025,,Alina Rubí,Alina A Rubi,2024-10-13,"""Horoscope 2025"" gives you the precise informa...",222.0,"Body, Mind & Spirit",2025-08-06 17:10:38.934987-07:00
...,...,...,...,...,...,...,...,...,...,...
195,4eAnzgEACAAJ,2021-2025 5 Year Monthly Planner,Fantastic Five-Year Organizer with 60 Months S...,Vanguard Planners,,2020-09-11,Promotional Price & Limited Time Offer 2021-20...,149.0,,2025-08-06 17:10:38.934987-07:00
196,Z1spzgEACAAJ,2021-2025 Five Year Planner,2021-2025 Monthly Planner 5 Years Organizer | ...,Evaneio Creations,,2020-10-25,Take control of your life today with this beau...,150.0,,2025-08-06 17:10:38.934987-07:00
197,gGjT0AEACAAJ,AutoCAD 2025,3D Drawing and Modeling (Mixed Units),Ascent - Center for Technical Knowledge,"Ascent, Center for Technical Knowledge",2024-07-23,The AutoCAD(R) 2025: 3D Drawing and Modeling g...,0.0,Computers,2025-08-06 17:10:38.934987-07:00
198,QCUozgEACAAJ,Get Shit Done 2021-2025 Planner,Motivational 5 Year Monthly Organizer and Sche...,Vanguard Planners,,2020-09-06,Promotional Limited Time Offer 2021-2025 Month...,149.0,,2025-08-06 17:10:38.934987-07:00


In [8]:
import pandas as pd
import glob

# Use a wildcard to match all files that follow the naming convention
file_pattern = '/content/drive/My Drive/AIAnalysis/*.csv'

# Get a list of all matching filenames
all_files = glob.glob(file_pattern)

# Create an empty list to hold the DataFrames
df_list = []

# Loop through each filename, read the CSV, and append the DataFrame to the list
for filename in all_files:
    df = pd.read_csv(filename)
    df_list.append(df)

# Concatenate all DataFrames in the list into a single, master DataFrame
master_df = pd.concat(df_list, ignore_index=True)

# --- Check the DataFrame before cleaning ---
print(f"Successfully loaded and combined {len(all_files)} files.")
print(f"The initial DataFrame has {len(master_df)} rows, before removing duplicates.")
print("\n--- Initial Master DataFrame (before cleaning) ---")
print(master_df.head()) # Print the head to inspect the data

# # --- Clean the DataFrame after inspecting ---
# # Remove any duplicates that might have been created
# master_df.drop_duplicates(subset=['ID'], inplace=True)

# print(f"\n--- Final DataFrame (after removing duplicates) ---")
# print(f"The final DataFrame has {len(master_df)} unique rows.")
# print(master_df.head())

Successfully loaded and combined 2 files.
The initial DataFrame has 400 rows, before removing duplicates.

--- Initial Master DataFrame (before cleaning) ---
             ID           Title  \
0  Y3cWEQAAQBAJ   Project 2025:   
1  YgY4K2lPDNYC            2025   
2  A7Dc0AEACAAJ    Project 2025   
3  uL0uzgEACAAJ  Zeitgeist 2025   
4  JWcpEQAAQBAJ  HOROSCOPE 2025   

                                            Subtitle  \
0  The BluePrint: Everything You Need To Know Abo...   
1  Scenarios of U.S. and Global Society Reshaped ...   
2                A Hope for All Americans Comes 2025   
3  Countdown to the Secret Destiny of America? th...   
4                                                NaN   

                                             Authors                Publisher  \
0                                       John Madison           A.W Publishing   
1  Joseph Francis Coates, John B. Mahaffie, Andy ...            OakHill Press   
2                                     Malachi Mulle

In [10]:
master_df.sort_values(by = "ID")

Unnamed: 0,ID,Title,Subtitle,Authors,Publisher,Published_Date,Description,Page_Count,Categories,ingested_at
51,-LdozgEACAAJ,2025 Post-Covid Scenarios,Latin America and the Caribbean,"Pepe Zhang, Peter Engelke",,2021-04-29,,,,2025-08-06 17:10:38.934987-07:00
249,-LdozgEACAAJ,2025 Post-Covid Scenarios,Latin America and the Caribbean,"Pepe Zhang, Peter Engelke",,2021-04-29,,,,2025-08-07 14:16:15.258476-07:00
226,-NwXSBGJo1wC,"The Future of North America, 2025",Outlook and Recommendations,Armand B. Peschard-Sverdrup,CSIS,2008-08-28,,360.0,Business & Economics,2025-08-07 14:16:15.258476-07:00
25,-NwXSBGJo1wC,"The Future of North America, 2025",Outlook and Recommendations,Armand B. Peschard-Sverdrup,CSIS,2008-08-28,,360.0,Business & Economics,2025-08-06 17:10:38.934987-07:00
90,-_rVCgAAQBAJ,Sentencing Fragments,"Penal Reform in America, 1975-2025",Michael H. Tonry,Oxford University Press,2016,Cover -- Contents -- Preface -- Acknowledgment...,315.0,Law,2025-08-06 17:10:38.934987-07:00
...,...,...,...,...,...,...,...,...,...,...
349,zd3_0AEACAAJ,Adobe Indesign 2025 Guide for Beginners,Mastering the Art of Creative Design for Publi...,Nava Asher,Independently Published,2024-11-24,Unlock the full potential of Adobe InDesign wi...,0.0,,2025-08-07 14:16:15.258476-07:00
341,zdLdzgEACAAJ,2021-2025 Five Year Monthly Planner,Large 5 Year Monthly Planner 2021-2025|60 Mont...,All YourPlanners,,2020-10-28,2021-2025 Five Year Planner 2021- 2025 5 Year ...,131.0,,2025-08-07 14:16:15.258476-07:00
137,zdLdzgEACAAJ,2021-2025 Five Year Monthly Planner,Large 5 Year Monthly Planner 2021-2025|60 Mont...,All YourPlanners,,2020-10-28,2021-2025 Five Year Planner 2021- 2025 5 Year ...,131.0,,2025-08-06 17:10:38.934987-07:00
54,zxxBEQAAQBAJ,BRL 2025 Baseball Rules and Regulations,,"Babe Ruth League, Inc.",Human Kinetics,2025-01-20,Thumbing through pages and pages of rules to f...,0.0,Sports & Recreation,2025-08-06 17:10:38.934987-07:00


##**Step 2)**

- Read in the Personsas Dataset from Nvidia

In [None]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
#don't need to authenticate as it is public
personas_df = pd.read_parquet("hf://datasets/nvidia/Nemotron-Personas/data/train-00000-of-00001.parquet")
#take only a small section of the DF for trial
small_personas_df = personas_df[:5]

In [None]:
small_personas_df

Unnamed: 0,uuid,persona,professional_persona,sports_persona,arts_persona,travel_persona,culinary_persona,skills_and_expertise,skills_and_expertise_list,hobbies_and_interests,hobbies_and_interests_list,career_goals_and_ambitions,sex,age,marital_status,education_level,bachelors_field,occupation,city,state,zipcode,country
0,df6b2b96-a938-48b0-83d8-75bfed059a3d,"A disciplined, sociable visionary, Jonathan ba...","A retired manufacturing manager, Jonathan now ...","An avid golfer, Jonathan plays weekly at the W...","A history enthusiast, Jonathan often leads tou...","A seasoned, meticulous planner, Jonathan favor...","A fan of hearty, Midwestern comfort food, Jona...",Jonathan's organizational skills and disciplin...,"['project management', 'budgeting and financia...",Jonathan enjoys a mix of social and solitary a...,"['golfing', 'woodworking', 'coin collecting', ...",After retiring from his career in manufacturin...,Male,72,widowed,high_school,,not_in_workforce,Wickliffe,OH,44092,USA
1,3b5691bf-07cd-4e58-b85b-cff62faba2fd,"Quintin, a 40-year-old logistician from Conver...","Quintin Pete Johnson, a logistician, combines ...","Quintin Pete Johnson, a dedicated fan of the S...",They appreciate the gritty realism of Texas ar...,"Quintin, a meticulous planner, balances family...",They delight in preparing complex Tex-Mex dish...,"Quintin Pete Johnson, a logistician from Conve...","['supply chain management', 'inventory control...",Quintin's balanced social nature extends to hi...,"['board games', 'art appreciation', 'history',...",Quintin aspires to become a director of logist...,Male,40,married_present,bachelors,arts_humanities,logistician,Converse,TX,78109,USA
2,8d6e788b-b0cf-42c1-9448-782fd12c6afe,"Ashley, a passionate community advocate, balan...","Ashley, an aspiring union representative, exce...","Ashley, a dedicated Detroit Lions fan, maintai...","Ashley, a self-proclaimed 'Motown music enthus...","Ashley, a budget-conscious traveler, dreams of...","Ashley, a skilled home cook, loves preparing h...",Ashley has developed strong organizational ski...,"['organizational skills', 'proficient in micro...",Ashley enjoys exploring new music and attendin...,"['exploring new music', 'cooking soul food', '...",Ashley aspires to become a union representativ...,Female,23,never_married,high_school,,laborer_or_freight_stock_or_material_mover,Detroit,MI,48219,USA
3,4617ca2c-673a-4a1b-a6cf-e171d542e113,"Stephanie, always the first to volunteer, bala...","Stephanie, a customer service representative, ...","Stephanie, a die-hard Minnesota Vikings fan, p...","Stephanie, an avid reader and amateur painter,...","Stephanie, despite her love for the outdoors, ...","Stephanie, a self-taught cook, enjoys experime...",Stephanie's ability to balance curiosity and p...,"['customer service', 'data analysis', 'multita...",Stephanie's outgoing nature and curiosity lead...,"['hiking', 'fishing', 'cooking', 'reading', 'h...",Stephanie enjoys her job as a customer service...,Female,41,married_present,some_college,,customer_service_representative,Littlefork,MN,56653,USA
4,21a01219-bace-4f40-9cca-79de787781d2,"Sonia, a 70-year-old retiree, is a vibrant, im...","Sonia, a retired organizer with a creative sou...","Sonia, though not athletic, enjoys watching ba...","Sonia, a passionate artist, finds inspiration ...","Sonia, a seasoned traveler, plans meticulous i...","Sonia, an avid cook, delights in preparing com...",Sonia has honed her organizational skills over...,"['event planning', 'group coordination', 'pain...",Sonia enjoys spending her free time creating a...,"['art creation', 'reading (poetry, biographies...","Though retired, Sonia still harbors ambitions ...",Female,70,married_present,9th_12th_no_diploma,,no_occupation,Cayucos,CA,93430,USA


##**Step 3**

  - Encode the personsas dataset so that it can be feed to a model for fine tunning
  - Looks like hugging face providers autoTokenizers for some of the models
  - Could be next task, creating a tokenizer

  - Will depend on the model how to tokenize it, ie for T5 it is text to text

  - Encodings appear to be too large to fine tune to the LLM, so will need to pivot to the books dataset, and store the personsas in a vector db and prompt the model

  - Could do then do an agent mode and see if the book is available or go and get the prices for each them.

 - Build an API, then train a chatbot on the documentation. Google gemini is pretty good at working with the google books api.



In [None]:
import torch
import transformers as tr

In [None]:
#import transformer library
#then it has the autoencoder, for the speicfic model, call it model checkpoint
model_name = "gpt2"
tokenizer = tr.AutoTokenizer.from_pretrained( model_name)

In [None]:
#need to convert the personsas dataset into a hugging face dataset
# Convert your pandas DataFrame directly into a Hugging Face Dataset object.
# This is the most efficient and recommended way to do it.
from datasets import Dataset
hf_dataset = Dataset.from_pandas(small_personas_df)

In [None]:
columns_to_include = [col for col in hf_dataset.column_names ]
columns_to_include

['uuid',
 'persona',
 'professional_persona',
 'sports_persona',
 'arts_persona',
 'travel_persona',
 'culinary_persona',
 'skills_and_expertise',
 'skills_and_expertise_list',
 'hobbies_and_interests',
 'hobbies_and_interests_list',
 'career_goals_and_ambitions',
 'sex',
 'age',
 'marital_status',
 'education_level',
 'bachelors_field',
 'occupation',
 'city',
 'state',
 'zipcode',
 'country']

In [None]:
#going to take a small segment of the personsas dataset to see if this pattern works for the tokenization
def combine_columns_for_tokenization (dataset, columns_to_include ):
  """Takes a hugging face dataset,a long with list of column name. Then iterates through the dataset and combines each row into
  one"""
  combined_text = []

  # Get the list of columns to include
  # You might want to exclude some columns like 'uuid', 'id', etc.

  # Iterate through each row in the batch of examples
  for i in range(len(dataset[columns_to_include[0]])):
      # Build the text string for the current row
      row_string_parts = []
      for column in columns_to_include:
          # Get the column name and its value for the current row
          column_name = column.replace('_', ' ').title()  # Formats 'hobbies_and_interests' to 'Hobbies And Interests'
          column_value = dataset[column][i]

          # Append the formatted string to the parts list
          row_string_parts.append(f"{column_name}: {column_value}")

      # Join all the parts for the row into a single string
      #Will leave in the period just to see how it performs
      combined_text.append(". ".join(row_string_parts) + ".")

  return {"text": combined_text}

# Apply the function to the dataset to create the new 'text' column
columns_to_include = [col for col in hf_dataset.column_names ]
combined_dataset = hf_dataset.map(
    combine_columns_for_tokenization,
    batched=True,
    fn_kwargs={"columns_to_include": columns_to_include} # <-- The magic line
)
combined_dataset

In [None]:
combined_text

["Uuid: df6b2b96-a938-48b0-83d8-75bfed059a3d. Persona: A disciplined, sociable visionary, Jonathan balances practicality with curiosity, leaving a lasting impact on his community through his organized, competitive approach. Professional Persona: A retired manufacturing manager, Jonathan now excels as a community developer, leveraging his organizational skills and competitive nature to drive sustainable growth in Wickliffe. Sports Persona: An avid golfer, Jonathan plays weekly at the Wickliffe Country Club and cheers for the Cleveland Browns, maintaining his competitive spirit even in leisure. Arts Persona: A history enthusiast, Jonathan often leads tours at the Lake County Historical Society, sharing stories about local pioneers and their impact on the region's development. Travel Persona: A seasoned, meticulous planner, Jonathan favors international destinations with rich histories, like Edinburgh and Dublin, where he can explore ancestral roots and enjoy a round of golf at prestigiou

In [None]:
# The maximum sequence length for the GPT-2 model.
# The maximum sequence length for the GPT-2 model.
max_length = 512

# Load the tokenizer
tokenizer = tr.AutoTokenizer.from_pretrained("gpt2")

# --- THIS IS THE FIX ---
# Set the padding token to be the same as the end-of-sentence token.
tokenizer.pad_token = tokenizer.eos_token

# Define the tokenization function
def tokenize_function(examples):
    """
    This function tokenizes a batch of text from the 'text' column of the dataset.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True, # Will now work correctly
        max_length=max_length,
    )

# Apply the tokenization function to the entire dataset using `map`.
tokenized_dataset = combined_dataset.map(tokenize_function, batched=True)

print("\nTokenized Dataset Structure:")
print(tokenized_dataset)
print("-" * 20)
print("First item in the tokenized dataset:")
print(tokenized_dataset[0])

In [None]:
# Assuming you've stored the tokenized dataset in `tokenized_dataset`

# Get the first item
first_item = tokenized_dataset[0]

# Print a few key details for verification
print("--- Verification of First Tokenized Item ---")
print("Original Text:\n", first_item['text'][:300] + "...") # Print first 300 chars of original text
print("\nNumber of Tokens (input_ids length):", len(first_item['input_ids']))
print("Number of Attention Mask values:", len(first_item['attention_mask']))

# Print the first 20 token IDs to see what they look like
print("\nFirst 20 Input IDs:", first_item['input_ids'][:20])

# Print the last 20 attention mask values to check for padding
# You should see 1s, followed by 0s if the text was shorter than max_length.
print("Last 20 Attention Mask values:", first_item['attention_mask'][-20:])

# To get the actual tokens (words/subwords) back from the IDs:
# This is a useful step for sanity-checking.
# You might need to install 'sentencepiece' if it's not already installed for your tokenizer.
decoded_text = tokenizer.decode(first_item['input_ids'], skip_special_tokens=True)
print("\nDecoded Text (for verification):", decoded_text[:300] + "...")

--- Verification of First Tokenized Item ---
Original Text:
 Uuid: df6b2b96-a938-48b0-83d8-75bfed059a3d. Persona: A disciplined, sociable visionary, Jonathan balances practicality with curiosity, leaving a lasting impact on his community through his organized, competitive approach. Professional Persona: A retired manufacturing manager, Jonathan now excels as ...

Number of Tokens (input_ids length): 512
Number of Attention Mask values: 512

First 20 Input IDs: [52, 27112, 25, 47764, 21, 65, 17, 65, 4846, 12, 64, 24, 2548, 12, 2780, 65, 15, 12, 5999, 67]
Last 20 Attention Mask values: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Decoded Text (for verification): Uuid: df6b2b96-a938-48b0-83d8-75bfed059a3d. Persona: A disciplined, sociable visionary, Jonathan balances practicality with curiosity, leaving a lasting impact on his community through his organized, competitive approach. Professional Persona: A retired manufacturing manager, Jonathan now excels as ...


In [None]:
# The maximum sequence length for the GPT-2 model.
# Adjust this based on the length of your personas.
# 1024 is the max for gpt2, but smaller values can save memory.
max_length = 512

def tokenize_function(examples):
    """
    This function takes a batch of text and returns the tokenized output.
    """
    # The tokenizer will convert the text into input_ids, attention_mask, etc.
    # `truncation=True`: Ensures sequences longer than max_length are cut.
    # `padding=True`: Pads shorter sequences to max_length for batching.
    # `return_tensors="pt"`: Returns PyTorch tensors. We'll use this later.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )

# Apply the tokenization function to the entire dataset using `map`.
# `batched=True` tells the function to process items in batches, which is faster.
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

print("\nTokenized Dataset Structure:")
print(tokenized_dataset)
print("-" * 20)
print("First item in the tokenized dataset:")
print(tokenized_dataset[0])
print(f"Shape of input_ids for the first item: {tokenized_dataset[0]['input_ids'].shape}")

Step 5)
Make sure can hookup to the transformers library and get an LLM

Transformers is the hugging face API, would specifgy the model there

In [None]:
!pip install transformers sentence-transformers torch # torch is the backend, sentence-transformers for embeddings

In [None]:

# 1. Choose a model ID from Hugging Face Hub
# For a smaller, fast example: "gpt2"
# For something more capable (but larger): "distilgpt2", "microsoft/DialoGPT-small",
# or for more recent, look into instruction-tuned models like "google/gemma-2b-it" (requires agreement)
# or "meta-llama/Llama-2-7b-chat-hf" (requires agreement)

# Let's start with GPT-2 for a quick demonstration
model_name = "gpt2"

# Check if GPU is available and set device
device = 0 if torch.cuda.is_available() else -1 # 0 for first GPU, -1 for CPU

# Option 1: Using the `pipeline` API (simplest for common tasks)
# This handles tokenizer and model loading automatically for many tasks
print(f"Loading model '{model_name}' using pipeline...")
generator = tr.pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.float16, # Use float16 for memory efficiency on GPU
    device=device
)
print("Model loaded via pipeline!")

# Example usage with pipeline:
prompt = "Given a persona who loves action movies and sci-fi, which streaming service would they choose?"
output = generator(prompt, max_new_tokens=50, num_return_sequences=1)
print("\nLLM Prediction (Pipeline):")
print(output[0]['generated_text'])