<a href="https://colab.research.google.com/github/mosomo82/COMP_SCI_5530/blob/main/Project_Customer_Churn/src/Generating_Customer_Interaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import google.generativeai as genai
import requests
import io
import time
import os
import random
import json
import re

# --- 1. CONFIGURATION ---

# The Gemini API key is read from the environment so the secret never lives in
# the notebook or in version control. Set it before running this cell, e.g.
#   os.environ["GEMINI_API_KEY"] = "<your key>"
# NOTE(review): a literal key was previously committed in this cell; treat it
# as compromised and revoke it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

# GitHub URL for your clean_data.csv file
DATA_URL = "https://raw.githubusercontent.com/mosomo82/COMP_SCI_5530/refs/heads/main/Project_Customer_Churn/clean_data/clean_data_text_generation.csv"

# Number of customers to sample from EACH churn class (churned / not churned)
N_SAMPLES = 5

# Output file name
OUTPUT_FILE = "output.csv"

# --- 2. CONFIGURE GEMINI API ---

# Fail fast with a clear message instead of sending requests without a key.
if not GEMINI_API_KEY:
    print("Error: The GEMINI_API_KEY environment variable is not set.")
    raise SystemExit(1)

try:
    genai.configure(api_key=GEMINI_API_KEY)
except AttributeError:
    print("Error: The Google Generative AI library is not installed correctly or the API key is invalid.")
    # SystemExit with a non-zero status; the bare exit() helper is meant for
    # interactive shells and reports success.
    raise SystemExit(1)

# Set up the model # higher temp for more "creative" and realistic chat logs
generation_config = {
    "temperature": 0.8,
    "top_p": 1,
    #"top_k": 1,
    # The recorded run of this cell shows empty responses with finish_reason 2
    # (MAX_TOKENS) at a 2048-token budget, so the budget is raised.
    # NOTE(review): presumably the model's internal "thinking" tokens count
    # against this limit -- confirm against the Gemini API documentation.
    "max_output_tokens": 8192,
}

safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]

model = genai.GenerativeModel(model_name="gemini-2.5-flash",
                              generation_config=generation_config,
                              safety_settings=safety_settings)

# --- 3. DYNAMIC PROMPT CREATION FUNCTION ---

import random  # <-- CRITICAL: Added importa

def create_prompt(customer_row):
    """Build the LLM prompt for one customer's synthetic survey and chat log.

    The prompt embeds the customer's profile and asks the model for a JSON
    object with exactly three fields: "SatisfactionScore", "SurveyComment" and
    "ChatLog". Churned customers (Churn == 1) get a negative scenario built
    around one randomly chosen complaint; every other customer gets a positive
    or neutral scenario built around one randomly chosen point of praise.

    Args:
        customer_row: Mapping-like record (e.g. a pandas row) providing the
            keys 'customerID', 'Contract_Encoded', 'InternetService',
            'OnlineSecurity', 'TechSupport', 'MonthlyCharges',
            'AutomaticPayment' and 'Churn'.

    Returns:
        The prompt text as a single string.
    """

    # Base profile string
    profile = f"""
    - CustomerID: {customer_row['customerID']}
    - Contract: {customer_row['Contract_Encoded']}
    - Internet Service: {customer_row['InternetService']}
    - Online Security: {customer_row['OnlineSecurity']}
    - Tech Support Add-on: {customer_row['TechSupport']}
    - Monthly Charges: {customer_row['MonthlyCharges']}
    - Payment Method: {customer_row['AutomaticPayment']}
    """

    churned = customer_row['Churn'] == 1

    if churned:
        # Complaints that are plausible for any churned customer.
        reasons = [
            f"frustration with high MonthlyCharges of ${customer_row['MonthlyCharges']}",
            f"unreliable {customer_row['InternetService']} internet service",
            "a billing dispute",
        ]

        # Complaints that only make sense given the services they actually have.
        if customer_row['OnlineSecurity'] == 'No':
            reasons.append("concerns about a lack of online security")

        if customer_row['TechSupport'] == 'Yes':
            reasons.append("difficulty getting help (even though I pay for TechSupport)")
        elif customer_row['TechSupport'] == 'No':
            reasons.append("frustration about not having tech support when I needed it")

        if customer_row['AutomaticPayment'] == 0:
            reasons.append("frustration with the manual payment process")
        else:
            reasons.append("an issue with an automatic payment")

        chosen_reason = random.choice(reasons)

        churn_flag = 1
        comment_instruction = (
            "A brief, negative survey comment that reflects the satisfaction "
            "score and the chosen reason."
        )
        chatlog_instruction = f"The chat log should reflect the customer's {chosen_reason}."
        tone_instruction = "The customer should be reasonably unhappy and end the chat unsatisfied."

    else:  # Churn == 0
        # Topics that are plausible for any retained customer.
        reasons = [
            f"the reliable {customer_row['InternetService']} service",
            "a simple billing question that got resolved",
        ]
        if customer_row['OnlineSecurity'] == 'Yes':
            reasons.append("appreciating the included OnlineSecurity")
        if customer_row['TechSupport'] == 'Yes':
            reasons.append("a quick technical question that got resolved easily")
        if customer_row['AutomaticPayment'] == 1:
            reasons.append("the convenience of automatic payments")

        chosen_reason = f"praising {random.choice(reasons)}"

        churn_flag = 0
        comment_instruction = (
            "A brief, positive or neutral survey comment reflecting the "
            "satisfaction score."
        )
        chatlog_instruction = (
            f"The chat log should be about a neutral or positive topic, like {chosen_reason}."
        )
        tone_instruction = (
            "The customer should be reasonably pleasant or neutral and end the chat satisfied."
        )

    # One shared template: the numbered list holds exactly the three JSON
    # fields (each named by its key), and the tone requirement is stated
    # separately so it is not mistaken for a fourth field.
    prompt = f"""
        You are a data generation bot. Your task is to generate a realistic, multi-turn customer support chat log.
        The customer (Churn={churn_flag}) just left a Customer Satisfaction (CSAT) survey.

        CUSTOMER PROFILE:
        {profile}

        INSTRUCTIONS:
        Generate a JSON object with the following 3 fields.

        1. "SatisfactionScore": Their satisfaction score (a numerical CSAT score 1-5).
        2. "SurveyComment": {comment_instruction}
        3. "ChatLog": {chatlog_instruction}

        {tone_instruction}

        Provide your output *only* in the following JSON format:
        {{"SatisfactionScore": "...", "SurveyComment": "...", "ChatLog": "..."}}

        """

    return prompt

# --- 4. LOAD AND SAMPLE DATA ---

print(f"Loading data from {DATA_URL}...")
try:
    # Without a timeout, requests can wait indefinitely on a stalled connection.
    response = requests.get(DATA_URL, timeout=30)
    response.raise_for_status()  # Raise an error for bad responses
    df = pd.read_csv(io.StringIO(response.text))
    print(f"Successfully loaded {len(df)} total customers.")
except requests.exceptions.RequestException as e:
    print(f"Error: Failed to fetch data from GitHub. {e}")
    # Non-zero status signals the failure; nothing below can run without df.
    raise SystemExit(1)

# Create a balanced sample: the same number of churned (Churn == 1) and
# retained (Churn == 0) customers. DataFrame.sample raises if asked for more
# rows than exist, so the per-class size is capped at the smaller class.
per_class = min(
    N_SAMPLES,
    int((df['Churn'] == 1).sum()),
    int((df['Churn'] == 0).sum()),
)
if per_class < N_SAMPLES:
    print(f"Warning: Only {per_class} customers per class are available (requested {N_SAMPLES}).")

# Fixed random_state keeps the sample reproducible across runs.
df_churn_yes = df[df['Churn'] == 1].sample(n=per_class, random_state=42)
df_churn_no = df[df['Churn'] == 0].sample(n=per_class, random_state=42)
df_sample = pd.concat([df_churn_yes, df_churn_no])

print(f"Created a balanced sample of {len(df_sample)} customers.")

# --- 5. GENERATE LOGS AND SAVE TO FILE ---

def _parse_model_json(text):
    """Extract and decode the JSON object contained in a model reply.

    Accepts a bare JSON object, one wrapped in a Markdown code fence (with or
    without a ``json`` tag), or one surrounded by extra prose.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        json.JSONDecodeError: If no valid JSON object can be decoded.
    """
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', text, flags=re.DOTALL)
    candidate = fenced.group(1) if fenced else text

    # Drop any prose before the first '{' or after the last '}'.
    start, end = candidate.find('{'), candidate.rfind('}')
    if start != -1 and end > start:
        candidate = candidate[start:end + 1]

    # strict=False tolerates literal newlines inside string values, which
    # multi-turn chat logs commonly contain.
    data = json.loads(candidate, strict=False)
    if not isinstance(data, dict):
        raise json.JSONDecodeError("Expected a JSON object", candidate, 0)
    return data


all_feedback = []
total = len(df_sample)
MAX_RETRIES = 3

print(f"\nStarting generation of {total} survey responses...")

for _, row in df_sample.iterrows():
    # 1. Create the dynamic prompt
    prompt_text = create_prompt(row)
    customer_id = row['customerID']

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # 2. Call the API, then 3. clean and parse the JSON response
            response = model.generate_content(prompt_text)
            data = _parse_model_json(response.text)
        except json.JSONDecodeError:
            problem = "Failed to decode JSON"
        except Exception as e:
            # Empty responses and dropped connections are often transient, so
            # they get the same bounded retry as malformed JSON.
            problem = f"Error generating content ({e})"
        else:
            # 4. Store the result
            all_feedback.append({
                'CustomerID': customer_id,
                'SatisfactionScore': data.get('SatisfactionScore'),
                'SurveyComment': data.get('SurveyComment'),
                'InteractionText': data.get('ChatLog'),
                'OriginalChurnStatus': row['Churn']
            })

            print(f"({len(all_feedback)}/{total}) Generated survey for {customer_id} (Score={data.get('SatisfactionScore')})")
            break  # Success: stop retrying this customer

        if attempt < MAX_RETRIES:
            print(f"Warning: {problem} for {customer_id} on attempt {attempt}. Retrying...")
            time.sleep(2)  # Wait a bit before retrying
        else:
            print(f"Error: {problem} for {customer_id} after {MAX_RETRIES} retries. Skipping.")

    # IMPORTANT: Pause between customers to respect API rate limits
    # (e.g., 60 requests/minute), whether or not this customer succeeded.
    time.sleep(1)


# --- 6. SAVE FINAL FILE ---

print("\n...Generation complete.")

# Collect every generated record into one table and write it out as UTF-8 CSV
# (the row index is not written).
df_output = pd.DataFrame(all_feedback)
df_output.to_csv(
    OUTPUT_FILE,
    index=False,
    encoding='utf-8',
)

# Report how many rows were written, then preview the first few in the notebook.
print(f"\nSuccessfully created '{OUTPUT_FILE}' with {len(df_output)} entries.")
display(df_output.head())

Loading data from https://raw.githubusercontent.com/mosomo82/COMP_SCI_5530/refs/heads/main/Project_Customer_Churn/clean_data/clean_data_text_generation.csv...
Successfully loaded 7032 total customers.
Created a balanced sample of 10 customers.

Starting generation of 10 survey responses...
Error generating content for 6302-JGYRJ: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.
(1/10) Generated survey for 2320-JRSDE (Score=1)
Error generating content for 2332-EFBJY: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.
Error generating content for 1624-WOIWJ: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Unnamed: 0,CustomerID,SatisfactionScore,SurveyComment,InteractionText,OriginalChurnStatus
0,2320-JRSDE,1,Ridiculous charges for absolutely no service. ...,Customer: I just submitted my cancellation for...,1
1,9391-EOYLI,1,My bill suddenly increased without proper noti...,"Agent: Hello, thank you for contacting TechCon...",1
2,2683-JXWQQ,4,My billing question was handled quickly and ef...,"Customer: Hi, I have a quick question about my...",0
3,5732-IKGQH,5,Extremely helpful support! My billing question...,"Customer: Hi there, I just received my latest ...",0
4,1421-HCERK,5,"Extremely satisfied with the service, especial...","Customer: Hi, I just wanted to quickly mention...",0
