In [6]:
import csv
import random
import itertools

def generate_chitchat_dataset(target_count=11119):
    # Base question categories
    base_questions = [
        # Personal preferences
        "What's your favorite type of music?", "Do you prefer coffee or tea?", "What's your favorite season and why?",
        "Are you a morning person or a night owl?", "What's your favorite way to spend a weekend?",
        "Do you prefer books or movies?", "What's your favorite type of cuisine?", "Are you more of an introvert or extrovert?",
        "What's your favorite color?", "Do you prefer the beach or the mountains?", "What's your favorite holiday?",
        "Are you a dog person or a cat person?", "What's your favorite type of weather?", "Do you prefer sweet or savory snacks?",
        "What's your favorite time of day?", "Do you like spicy food?", "What's your favorite genre of movies?",
        "Are you more creative or analytical?", "What's your favorite way to exercise?", "Do you prefer indoor or outdoor activities?",
        
        # Hobbies and interests
        "What hobbies do you enjoy?", "Do you play any musical instruments?", "What's the last book you read?",
        "Do you enjoy cooking?", "What sports do you like to watch or play?", "Are you into gardening?",
        "Do you collect anything?", "What's your favorite board game?", "Do you enjoy photography?",
        "Are you interested in astronomy?", "Do you like to paint or draw?", "What crafts do you enjoy?",
        "Do you enjoy hiking?", "Are you into video games?", "Do you like puzzles?", "What's your favorite outdoor activity?",
        "Do you enjoy dancing?", "Are you interested in history?", "Do you like to write?", "What's your favorite way to be creative?",
        
        # Travel and places
        "What's the most beautiful place you've ever visited?", "Where would you love to travel next?",
        "Do you prefer traveling alone or with others?", "What's your favorite city?", "Have you ever been camping?",
        "What's the furthest you've traveled from home?", "Do you prefer road trips or flying?", "What's your dream vacation destination?",
        "Have you ever been to a different country?", "What's the best trip you've ever taken?",
        "Do you like to plan trips in detail or be spontaneous?", "What's your favorite type of accommodation when traveling?",
        "Have you ever been on a cruise?", "What's the most interesting cultural experience you've had?",
        "Do you prefer city breaks or nature getaways?", "What's the best food you've tried while traveling?",
        "Have you ever been backpacking?", "What's your favorite mode of transportation?", "Do you like to visit museums when you travel?",
        "What's the most adventurous thing you've done while traveling?",
        
        # Food and drinks
        "What's your favorite comfort food?", "Can you cook?", "What's the strangest food you've ever tried?",
        "Do you have any dietary restrictions?", "What's your go-to breakfast?", "Do you enjoy trying new restaurants?",
        "What's your favorite dessert?", "Do you like to bake?", "What's your favorite pizza topping?",
        "Do you prefer home-cooked meals or eating out?", "What's your favorite snack?", "Do you drink alcohol?",
        "What's your favorite type of bread?", "Do you like seafood?", "What's your favorite fruit?",
        "Do you enjoy spicy food?", "What's your favorite vegetable?", "Do you have a favorite restaurant?",
        "What's your favorite ice cream flavor?", "Do you like to try exotic foods?",
        
        # Entertainment and media
        "What's the last movie you watched?", "Do you have a favorite TV show?", "What type of music do you listen to when you're sad?",
        "Do you prefer comedies or dramas?", "What's your favorite podcast?", "Do you still listen to the radio?",
        "What's the best concert you've ever been to?", "Do you like reality TV shows?", "What's your favorite streaming service?",
        "Do you prefer subtitles or dubbing for foreign films?", "What's your favorite documentary?", "Do you like stand-up comedy?",
        "What's your favorite animated movie?", "Do you enjoy musicals?", "What's the scariest movie you've ever seen?",
        "Do you like to binge-watch series?", "What's your favorite classic movie?", "Do you prefer watching alone or with others?",
        "What's your favorite genre of music?", "Do you like live theater?",
        
        # Daily life
        "How do you like to start your day?", "What's your typical evening routine?", "Do you make your bed every morning?",
        "How do you handle stress?", "What's your biggest pet peeve?", "Do you prefer to plan your day or go with the flow?",
        "What's the first thing you do when you get home?", "How do you stay motivated?", "What's your favorite way to relax?",
        "Do you prefer calling or texting?", "How do you organize your living space?", "What's your morning beverage of choice?",
        "Do you enjoy grocery shopping?", "How do you deal with difficult people?", "What's your biggest time waster?",
        "Do you prefer to multitask or focus on one thing?", "How do you make important decisions?", "What's your favorite day of the week?",
        "Do you enjoy cleaning?", "How do you stay connected with friends and family?",
        
        # Conversational starters
        "How's your day going?", "What's new with you?", "How was your weekend?", "What have you been up to lately?",
        "How are you feeling today?", "What's on your mind?", "Tell me something interesting about your day",
        "What's the highlight of your week so far?", "How do you usually spend your free time?", "What's been keeping you busy?",
        
        # Childhood and memories
        "What's your favorite childhood memory?", "What was your favorite toy as a child?", "Did you have any pets growing up?",
        "What was your favorite subject in school?", "What did you want to be when you grew up?", "What's the best gift you received as a child?",
        "Did you have a favorite teacher?", "What was your favorite cartoon?", "Did you enjoy playing outside as a child?",
        "What was your favorite game to play with friends?", "Did you have a favorite bedtime story?", "What was your least favorite chore as a child?",
        "Did you collect anything as a child?", "What was your favorite birthday party?", "Did you have an imaginary friend?",
        "What was your favorite family tradition?", "What scared you the most as a child?", "Did you enjoy school trips?",
        "What was your favorite playground activity?", "Did you have a special blanket or stuffed animal?",
        
        # Opinions and thoughts
        "What do you think about social media?", "Do you believe in luck?", "What's your opinion on remote work?",
        "Do you think people are generally good?", "What's your take on climate change?", "Do you believe in aliens?",
        "What do you think about artificial intelligence?", "Do you think money can buy happiness?", "What's your opinion on online dating?",
        "Do you believe in fate or free will?", "What do you think about meditation?", "Do you think technology makes life better or worse?",
        "What's your opinion on self-help books?", "Do you believe in ghosts?", "What do you think about modern art?",
        "Do you think people should follow their dreams?", "What's your opinion on small talk?", "Do you believe in karma?",
        "What do you think about minimalism?", "Do you think the world is getting better or worse?"
    ]
    
    # Question templates for generating variations
    templates = [
        "What's your favorite {item}?", "How do you feel about {item}?", "What do you think about {item}?",
        "Tell me about {item}", "What's your experience with {item}?", "How would you describe {item}?",
        "What's interesting about {item}?", "What's your take on {item}?", "How important is {item} to you?",
        "What role does {item} play in your life?", "What's challenging about {item}?", "What's rewarding about {item}?",
        "How do you approach {item}?", "What's your relationship with {item}?", "How has {item} influenced you?",
        "What's surprising about {item}?", "What's your goal with {item}?", "How do you stay motivated with {item}?",
        "What advice would you give about {item}?", "How do you share {item} with others?"
    ]
    
    # Items/topics for templates
    items = [
        "exercise", "meditation", "reading", "cooking", "gardening", "photography", "volunteering",
        "learning", "creativity", "adventure", "friendship", "family", "work", "hobbies", "travel",
        "music", "art", "sports", "technology", "nature", "pets", "games", "puzzles", "crafts",
        "dancing", "singing", "writing", "drawing", "hiking", "swimming", "cycling", "running",
        "yoga", "chess", "cards", "movies", "books", "podcasts", "concerts", "festivals",
        "museums", "libraries", "parks", "beaches", "mountains", "cities", "communities",
        "traditions", "memories", "dreams", "goals", "challenges", "success", "happiness",
        "kindness", "patience", "courage", "honesty", "loyalty", "independence", "growth",
        "discovery", "innovation", "balance", "peace", "wisdom", "curiosity", "wonder",
        "solitude", "companionship", "routine", "spontaneity", "comfort", "risk", "change"
    ]
    
    # Hypothetical questions
    hypothetical_questions = [
        "If you could have dinner with anyone, who would it be?",
        "If you won the lottery, what would you do first?",
        "If you could live anywhere in the world, where would you choose?",
        "If you could have any superpower, what would it be?",
        "If you could time travel, where would you go?",
        "If you could be any animal, what would you choose?",
        "If you had to eat one food for the rest of your life, what would it be?",
        "If you could learn any skill instantly, what would it be?",
        "If you could meet your future self, what would you ask?",
        "If you could change one thing about the world, what would it be?",
        "If you were stranded on a desert island, what three things would you want?",
        "If you could be invisible for a day, what would you do?",
        "If you could read minds, would you want to?",
        "If you could fly or breathe underwater, which would you choose?",
        "If you had to give up one of your senses, which would it be?",
        "If you could live in any historical period, when would you choose?",
        "If you could speak any language fluently, which would it be?",
        "If you could have any job in the world, what would it be?",
        "If you could eliminate one problem from the world, what would it be?",
        "If you could have been born in any country, which would you choose?"
    ]
    
    # Generate all possible combinations
    all_questions = set()
    
    # Add base questions
    for q in base_questions:
        all_questions.add(q.strip())
    
    # Add hypothetical questions
    for q in hypothetical_questions:
        all_questions.add(q.strip())
    
    # Generate template combinations
    for template, item in itertools.product(templates, items):
        question = template.format(item=item)
        all_questions.add(question.strip())
    
    # Additional specific questions to increase variety
    additional_questions = [
        "What makes you laugh the most?", "What's something you're proud of?", "What motivates you?",
        "What's the best compliment you've ever received?", "What's something new you learned recently?",
        "What's your favorite way to spend a rainy day?", "What's something you've always wanted to try?",
        "What's the most beautiful thing you've ever seen?", "What's your dream job?", "What's your biggest fear?",
        "How do you define success?", "What's your favorite app?", "What's your take on online shopping?",
        "What does happiness mean to you?", "What's your philosophy on work-life balance?",
        "What's your favorite cultural tradition?", "How do you stay hopeful during difficult times?",
        "What's something positive you see happening in the world?", "What's a social issue you care about?",
        "How do you think we can make the world better?", "What's the kindest thing someone has done for you?",
        "What's your love language?", "What makes a good friend?", "How do you prefer to meet new people?",
        "What's the best advice you've ever received?", "What's the most important quality in a relationship?",
        "How do you handle disagreements?", "What's your biggest relationship deal-breaker?",
        "Do you believe in soulmates?", "How do you show appreciation to others?", "What's your approach to making friends?",
        "What's your morning routine like?", "How do you unwind after work?", "What's your guilty pleasure?",
        "What's your favorite season activity?", "How do you celebrate achievements?", "What's your backup plan?",
        "What's your hidden talent?", "What's your biggest accomplishment?", "What's your favorite quote?",
        "What's your spirit animal?", "What's your superpower in friendships?", "What's your biggest weakness?",
        "What's your favorite childhood book?", "What's your go-to karaoke song?", "What's your favorite emoji?",
        "What's your ideal Saturday?", "What's your favorite memory from this year?", "What's your biggest goal?",
        "What's your favorite thing about yourself?", "What's your most used phone app?", "What's your comfort movie?",
        "What's your favorite thing about your hometown?", "What's your ideal vacation length?", "What's your favorite podcast topic?",
        "What's your go-to conversation starter?", "What's your favorite way to exercise?", "What's your ideal work environment?",
        "What's your favorite type of weather for outdoor activities?", "What's your preferred method of communication?",
        "What's your favorite time to be creative?", "What's your approach to trying new things?", "What's your favorite way to learn?",
        "What's your ideal evening?", "What's your favorite thing about technology?", "What's your approach to goal setting?",
        "What's your favorite way to celebrate?", "What's your go-to stress reliever?", "What's your favorite childhood memory?",
        "What's your biggest inspiration?", "What's your favorite way to help others?", "What's your ideal day off?",
        "What's your favorite thing about your current life stage?", "What's your approach to work-life balance?",
        "What's your favorite way to stay connected with friends?", "What's your go-to comfort food when sick?",
        "What's your favorite thing about learning new skills?", "What's your ideal creative space?", "What's your favorite way to start the week?",
        "What's your approach to making big decisions?", "What's your favorite thing about your personality?",
        "What's your go-to method for staying organized?", "What's your favorite way to treat yourself?",
        "What's your ideal social gathering size?", "What's your favorite thing about different cultures?",
        "What's your approach to handling change?", "What's your favorite way to stay motivated?", "What's your ideal learning environment?"
    ]
    
    for q in additional_questions:
        all_questions.add(q.strip())
    
    # Convert to list and ensure we have enough questions
    questions_list = list(all_questions)
    
    if len(questions_list) < target_count:
        print(f"Warning: Only generated {len(questions_list)} unique questions, less than target {target_count}")
        target_count = len(questions_list)
    
    # Shuffle and take the required number
    random.shuffle(questions_list)
    return questions_list[:target_count]

def categorize_question(question):
    question_lower = question.lower()
    
    if any(word in question_lower for word in ['favorite', 'prefer', 'like', 'love']):
        return 'preferences'
    elif any(word in question_lower for word in ['feel', 'think', 'opinion', 'believe']):
        return 'opinions'
    elif any(word in question_lower for word in ['if you could', 'would you', 'imagine']):
        return 'hypothetical'
    elif any(word in question_lower for word in ['childhood', 'growing up', 'when you were']):
        return 'memories'
    elif any(word in question_lower for word in ['food', 'eat', 'cook', 'restaurant']):
        return 'food'
    elif any(word in question_lower for word in ['travel', 'trip', 'vacation', 'visit']):
        return 'travel'
    elif any(word in question_lower for word in ['movie', 'music', 'book', 'show', 'watch']):
        return 'entertainment'
    elif any(word in question_lower for word in ['friend', 'relationship', 'family', 'social']):
        return 'relationships'
    elif any(word in question_lower for word in ['work', 'job', 'career', 'professional']):
        return 'career'
    elif any(word in question_lower for word in ['day', 'morning', 'evening', 'routine']):
        return 'daily_life'
    else:
        return 'general'

# Generate the dataset
print("Generating chitchat dataset...")
chitchat_questions = generate_chitchat_dataset(11119)

# Verify no duplicates
unique_count = len(set(q.lower().strip() for q in chitchat_questions))
print(f"Total questions generated: {len(chitchat_questions)}")
print(f"Unique questions: {unique_count}")
print(f"Duplicates: {len(chitchat_questions) - unique_count}")

# Create CSV file
with open('chitchat_dataset_11119_2.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'question', 'category'])
    
    for i, question in enumerate(chitchat_questions, 1):
        category = categorize_question(question)
        writer.writerow([i, question, category])

print("Dataset saved as 'chitchat_dataset_11119.csv'")
print("\nFirst 10 questions:")
for i, question in enumerate(chitchat_questions[:10], 1):
    print(f"{i}. {question}")

# Count categories
category_counts = {}
for question in chitchat_questions:
    category = categorize_question(question)
    category_counts[category] = category_counts.get(category, 0) + 1

print(f"\nCategory distribution:")
for category, count in sorted(category_counts.items()):
    print(f"{category}: {count} questions")

Generating chitchat dataset...
Total questions generated: 1775
Unique questions: 1775
Duplicates: 0
Dataset saved as 'chitchat_dataset_11119.csv'

First 10 questions:
1. How do you share festivals with others?
2. Tell me about drawing
3. How do you feel about goals?
4. What's challenging about crafts?
5. What's your experience with memories?
6. What's your goal with spontaneity?
7. What's your biggest relationship deal-breaker?
8. Do you like to write?
9. Have you ever been camping?
10. What's challenging about dreams?

Category distribution:
career: 19 questions
daily_life: 23 questions
entertainment: 53 questions
food: 40 questions
general: 992 questions
hypothetical: 168 questions
memories: 1 questions
opinions: 171 questions
preferences: 180 questions
relationships: 106 questions
travel: 22 questions
