In [1]:
import os
import time
import json
from dotenv import load_dotenv
import pandas as pd
from google import genai


In [None]:
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY)

In [13]:

# uses exponential backoff when calling gemini api
def generate_text(prompt, model="gemini-2.0-flash-lite-preview-02-05", max_retries=5, initial_wait=2):

    attempt = 0
    wait_time = initial_wait

    while attempt < max_retries:
        try:
            response = client.models.generate_content(
                model=model,
                contents=[prompt]
            )
            return response.text
        except Exception as e:
            print(f"Error encountered: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            wait_time *= 2  # backoff
            attempt += 1

    raise Exception("Max retries reached. The request could not be completed.")

# def validate_profile(profile):
#     required_keys = ["bio", "movies", "tv", "religion", "music", "sports", "books", "politics"]
    
#     # checking all keys present or not
#     for key in required_keys:
#         if key not in profile:
#             return False
    
#     # checking range of numeric entries
#     for key in ["bio", "movies", "tv", "religion", "music", "sports", "books", "politics"]:
#         value = profile[key]
#         if not isinstance(value, str):
#             return False
        
#     return True

def generate_profiles(n_profiles=500):
    prompt = f"""
You are a profile generator. Create {n_profiles} unique dating profiles.
Return them as a valid JSON array of objects. Each object has these fields, create creative bio but only choose one value from list of options from below:
    "bio": (string) a short creative biography (1-2 sentences),
    "movies": ("Action", "Comedy", "Drama", "Horror", "Science Fiction"),
    "religion": ("Christianity", "Islam", "Hinduism", "Buddhism", "Judaism"),
    "music": ("Rock", "Pop", "Hip-Hop", "Jazz", "Classical"),
    "sports": ("Soccer", "Basketball", "Tennis", "Cricket", "Baseball"),
    "books": ("Fantasy", "Mystery", "Romance", "Science Fiction", "Non-Fiction"),
    "politics": ("Liberalism", "Conservatism", "Socialism", "Libertarianism", "Communism")
Example for one item in the array:
{{
    "bio": "Coffee fanatic. Amateur traveler. Always up for a new adventure!",
    "movies": "Action",
    "religion": "Islam",
    "music": "Rock",
    "sports": "Basketball",
    "books": "Fantasy",
    "politics": "Conservatism" 
}}
"""
    response_text = generate_text(prompt).replace('json', '').replace("```", "")
    print(response_text)
    try:
        # converting JSON string into list of dictionaries
        raw_profiles = json.loads(response_text)
        print(raw_profiles)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}. Raw Response: {response_text}")
        return []
    
    # valid_profiles = []
    # for p in raw_profiles:
    #     if validate_profile(p):
    #         valid_profiles.append(p)

    # print(f"Generated {len(valid_profiles)} valid profiles out of requested {n_profiles}.")
    return raw_profiles

if __name__ == "__main__":
    total_needed = 10
    batch_size = 10

    all_profiles = []
    total_collected = 0
 
    while total_collected <= total_needed:
        needed_now = min(batch_size, total_needed - total_collected)
        batch = generate_profiles(batch_size)
        all_profiles.extend(batch)
        total_collected = len(all_profiles)
        time.sleep(2)
    
    print(all_profiles)
    df = pd.DataFrame(all_profiles)
    df.rename(
        columns={
            "bio": "Bios",
            "movies": "Movies",
            "tv": "TV",
            "religion": "Religion",
            "music": "Music",
            "sports": "Sports",
            "books": "Books",
            "politics": "Politics"
        },
        inplace=True
    )
    
    print(df.head(10))  
    print(f"\nTotal valid profiles generated: {len(df)}")
    
    df.to_csv("dating_profiles.csv", index=False)
    print("\nSaved to 'dating_profiles.csv'.")



[
  {
    "bio": "Avid stargazer with a penchant for philosophical debates. Seeking someone to share sunsets and meaningful conversations.",
    "movies": "Science Fiction",
    "religion": "Buddhism",
    "music": "Jazz",
    "sports": "Tennis",
    "books": "Non-Fiction",
    "politics": "Liberalism"
  },
  {
    "bio": "Lover of spontaneous road trips and the perfect slice of pizza. Life's too short for boring dates!",
    "movies": "Comedy",
    "religion": "Christianity",
    "music": "Pop",
    "sports": "Baseball",
    "books": "Romance",
    "politics": "Conservatism"
  },
  {
    "bio": "Bookworm and history buff searching for my own fairytale ending. Let's explore museums and old bookstores together.",
    "movies": "Drama",
    "religion": "Judaism",
    "music": "Classical",
    "sports": "Soccer",
    "books": "Fantasy",
    "politics": "Socialism"
  },
  {
    "bio": "World traveler with a taste for spicy food and adrenaline rushes. Ready to experience the best life has 

In [15]:
df = pd.read_csv('dating_profiles.csv')
df.head()

Unnamed: 0,Bios,Movies,Religion,Music,Sports,Books,Politics
0,Avid stargazer with a penchant for philosophic...,Science Fiction,Buddhism,Jazz,Tennis,Non-Fiction,Liberalism
1,Lover of spontaneous road trips and the perfec...,Comedy,Christianity,Pop,Baseball,Romance,Conservatism
2,Bookworm and history buff searching for my own...,Drama,Judaism,Classical,Soccer,Fantasy,Socialism
3,World traveler with a taste for spicy food and...,Action,Islam,Hip-Hop,Basketball,Science Fiction,Libertarianism
4,Always up for a good laugh and a competitive g...,Comedy,Hinduism,Rock,Cricket,Mystery,Communism
