# Manage Session Records

This notebook provides utilities to manage session records in DynamoDB/CSV:

- **Clear all session data** - Delete all records from session_* tables and files
- **Delete a session record** (by session_id) from all related tables
- **Update a record** in a selected table by session_id
- **List sessions** for viewing
- **Reassign Session IDs** - Fix session ID assignments based on (user_id, test_date, boyfriend_name)

**Note:** When a record is deleted, `Summary_Sessions` is automatically updated.


In [14]:
import sys
from pathlib import Path
from typing import Dict, Any, Optional
import pandas as pd

# Add project root to path
project_root = Path().resolve().parent.parent
sys.path.insert(0, str(project_root))

from src.adapters.database.database_handler import DatabaseHandler
from src.utils.session_id_generator import generate_session_id, find_existing_session_id
from src.utils.summary_updater import update_summary_after_delete


## Clear All Session Data

**WARNING:** This will delete ALL records from all session_* tables and CSV files. This action cannot be undone!

Use this when you want to start fresh with clean data.


In [None]:
def _delete_all_dynamodb_items(table, key_name: str) -> int:
    """
    Delete every item of a DynamoDB table, following scan pagination.

    Args:
        table: Table object exposing ``scan(**kwargs)`` and ``batch_writer()``.
        key_name: Name of the attribute used to build each delete key.

    Returns:
        Number of delete requests issued.
    """
    deleted_count = 0
    scan_kwargs = {}
    while True:
        response = table.scan(**scan_kwargs)
        items = response.get("Items", [])

        if items:
            with table.batch_writer() as batch:
                for item in items:
                    batch.delete_item(Key={key_name: item[key_name]})
                    deleted_count += 1

        # A scan returns one page at a time; continue from where it stopped
        # instead of rescanning the table from the beginning.
        last_key = response.get("LastEvaluatedKey")
        if not last_key:
            return deleted_count
        scan_kwargs["ExclusiveStartKey"] = last_key


def clear_all_session_data(use_dynamodb: bool = True, confirm: bool = False):
    """
    Clear all session data from both DynamoDB and CSV files.

    This function:
    1. Deletes all records from session_* tables in DynamoDB (if use_dynamodb=True)
    2. Deletes all session_*.csv files from the data folder
    3. Clears Summary_Sessions table/file

    Args:
        use_dynamodb: If True, clear DynamoDB tables; if False, only clear CSV files
        confirm: Must be True to actually perform the deletion (safety check)

    Returns:
        True if every step succeeded; False if confirm was not given or if any
        table or file could not be cleared.
    """
    if not confirm:
        print("=" * 60)
        print("SAFETY CHECK")
        print("=" * 60)
        print("[WARNING] This will delete ALL session data!")
        print("To proceed, call this function with confirm=True")
        print("=" * 60)
        return False

    print("=" * 60)
    print("Clearing All Session Data")
    print("=" * 60)
    print(f"Backend: {'DynamoDB' if use_dynamodb else 'CSV only'}")
    print("=" * 60)

    session_tables = [
        "session_responses",
        "session_gtk_responses",
        "session_feedback",
        "session_toxicity_rating",
        "session_insights",
    ]
    had_errors = False

    # Clear DynamoDB tables
    if use_dynamodb:
        print("\n[1] Clearing DynamoDB tables...")
        from src.adapters.database.database_handler import DatabaseHandler

        db_handler = DatabaseHandler(db_read_allowed=True, db_write_allowed=True)

        try:
            dynamodb = db_handler.backend.dynamodb
            # Session tables are keyed by 'id'; Summary_Sessions by 'summary_id'.
            targets = [(table_name, "id") for table_name in session_tables]
            targets.append(("Summary_Sessions", "summary_id"))

            dynamodb_errors = False
            for table_name, key_name in targets:
                try:
                    deleted_count = _delete_all_dynamodb_items(dynamodb.Table(table_name), key_name)
                    print(f"  [OK] Cleared {table_name}: {deleted_count} records deleted")
                except Exception as e:
                    dynamodb_errors = True
                    print(f"  [WARNING] Could not clear {table_name}: {e}")

            if dynamodb_errors:
                had_errors = True
                print("[WARNING] Some DynamoDB tables could not be cleared")
            else:
                print("[OK] DynamoDB tables cleared")
        except Exception as e:
            had_errors = True
            print(f"[ERROR] Error clearing DynamoDB: {e}")
        finally:
            # Single close point, reached on success and on failure alike.
            db_handler.close()

    # Clear CSV files
    print("\n[2] Clearing CSV files...")
    from pathlib import Path

    project_root = Path().resolve().parent.parent
    data_dir = project_root / "data"

    csv_names = [f"{table_name}.csv" for table_name in session_tables]
    csv_names.append("Summary_Sessions.csv")

    deleted_files = []
    for csv_name in csv_names:
        csv_file = data_dir / csv_name
        if not csv_file.exists():
            continue
        try:
            csv_file.unlink()
            deleted_files.append(csv_file.name)
            print(f"  [OK] Deleted {csv_file.name}")
        except Exception as e:
            had_errors = True
            print(f"  [ERROR] Could not delete {csv_file.name}: {e}")

    print(f"\n[OK] CSV files cleared: {len(deleted_files)} files deleted")

    print("\n" + "=" * 60)
    if had_errors:
        print("[WARNING] Session data cleared with errors - see messages above")
    else:
        print("[SUCCESS] All session data cleared!")
    print("=" * 60)
    return not had_errors


In [None]:
def initialize_summary_sessions_manual(use_dynamodb: bool = True):
    """
    Run the project's Summary_Sessions initializer against the chosen backend.

    Intended for use after all session data has been cleared, so that
    Summary_Sessions exists again with its default values.

    Args:
        use_dynamodb: If True, use DynamoDB; if False, use CSV
    """
    rule = "=" * 60
    backend_label = "DynamoDB" if use_dynamodb else "CSV"
    for banner_line in (
        rule,
        "Initializing Summary_Sessions",
        rule,
        f"Backend: {backend_label}",
        rule,
    ):
        print(banner_line)

    from src.adapters.database.database_handler import DatabaseHandler
    from src.utils.summary_initializer import initialize_summary_sessions

    # The same flag drives both read and write access of the handler.
    db_handler = DatabaseHandler(db_read_allowed=use_dynamodb, db_write_allowed=use_dynamodb)

    try:
        if initialize_summary_sessions(db_handler):
            outcome = "\n[SUCCESS] Summary_Sessions initialized successfully!"
        else:
            outcome = "\n[ERROR] Failed to initialize Summary_Sessions"
        print(outcome)
    except Exception as e:
        print(f"\n[ERROR] Error: {e}")
    finally:
        # Always release the handler, whatever the initializer did.
        db_handler.close()


In [None]:
# Initialize Summary_Sessions with default values
# Uncomment to run
# initialize_summary_sessions_manual(use_dynamodb=USE_DYNAMODB)


In [None]:
# Clear all session data
# WARNING: This will delete ALL session records!
# Uncomment and set confirm=True to proceed
# clear_all_session_data(use_dynamodb=USE_DYNAMODB, confirm=False)  # Set to True to actually delete


## Configuration

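Set `USE_DYNAMODB = True` to use DynamoDB, or `False` to use CSV files. Run this cell before any cell that passes `USE_DYNAMODB` to a function, including the commented-out examples in the sections above.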


In [2]:
# Configuration
# Backend switch passed by the example cells in this notebook as
# `use_dynamodb=` / `db_write_allowed=`.
USE_DYNAMODB = True  # Set to False to use CSV instead


## Delete a Session

Deletes a session record from all related tables by session_id and updates Summary_Sessions.

**Note:** 
- You only need to provide the `session_id` (which is a hash of user_id and boyfriend_name)
- The `session_id` value is stored in the `id` column in all database tables
- You can get the `id` value from the `list_sessions()` function output


In [3]:
def delete_session(session_id, db_write_allowed: bool = True) -> bool:
    """
    Delete a session record from all related tables based on session_id.
    Also updates Summary_Sessions table.

    Args:
        session_id: The session ID value (stored in the 'id' column in database tables).
                    Can be int or str (will be converted to int for comparison).
        db_write_allowed: If True, use DynamoDB; if False, use CSV

    Returns:
        True if the record was deleted from at least one table (even when the
        follow-up Summary_Sessions update reports failure); False if the
        session_id is invalid, the record is not found, nothing could be
        deleted, or an unexpected error occurred.

    Note:
        The session_id parameter corresponds to the 'id' column in all tables.
        You can get this value from list_sessions() output.
    """
    # Validate before touching the backend so bad input never opens a connection.
    try:
        session_id_int = int(session_id)
    except (ValueError, TypeError):
        print(f"[ERROR] session_id must be a number (int or string representation of int). Got: {type(session_id)} = {session_id}")
        return False

    print("=" * 60)
    print("Deleting Session Record")
    print("=" * 60)
    print(f"Session ID: {session_id_int}")
    print(f"Backend: {'DynamoDB' if db_write_allowed else 'CSV'}")
    print("=" * 60)

    db_handler = DatabaseHandler(db_write_allowed=db_write_allowed)

    try:
        # Load the record to get its values before deleting
        print(f"\n[1] Loading record with session_id {session_id_int} from session_responses...")
        session_responses = db_handler.load_table("session_responses")

        if session_responses.empty:
            print("[ERROR] session_responses table is empty")
            return False

        # Compare on a numeric view of 'id' so string-typed ids still match.
        id_values = pd.to_numeric(session_responses["id"], errors='coerce')
        record = session_responses[id_values == session_id_int]

        if record.empty:
            print(f"[ERROR] Record with session_id={session_id_int} not found")
            return False

        # Get values for summary update. Missing/NaN values count as 0 so a
        # record with blank metrics can still be deleted (int(NaN) would raise).
        row = record.iloc[0]
        raw_toxic_score = row.get("toxic_score", 0)
        raw_filter_violations = row.get("filter_violations", 0)
        deleted_toxic_score = 0.0 if pd.isna(raw_toxic_score) else float(raw_toxic_score)
        deleted_filter_violations = 0 if pd.isna(raw_filter_violations) else int(raw_filter_violations)

        print("[OK] Found record:")
        print(f"     Session ID: {session_id_int}")
        print(f"     User ID: {row.get('user_id')}")
        print(f"     Boyfriend Name: {row.get('boyfriend_name')}")
        print(f"     Toxic Score: {deleted_toxic_score}")
        print(f"     Filter Violations: {deleted_filter_violations}")

        # Delete the record from all related tables
        print(f"\n[2] Deleting records with session_id {session_id_int} from all tables...")
        tables_to_delete = [
            "session_responses",
            "session_gtk_responses",
            "session_feedback",
            "session_toxicity_rating",
            "session_insights",
        ]

        deleted_count = 0
        for table_name in tables_to_delete:
            try:
                if db_handler.delete_record(table_name, session_id_int, id_column="id"):
                    deleted_count += 1
                    print(f"[OK] Deleted from {table_name}")
                else:
                    print(f"[INFO] No record found in {table_name} (may not exist for this session)")
            except Exception as e:
                print(f"[WARNING] Could not delete from {table_name}: {e}")

        if deleted_count == 0:
            print("[ERROR] Failed to delete any records")
            return False

        # Update Summary_Sessions
        print("\n[3] Updating Summary_Sessions...")
        update_success = update_summary_after_delete(
            db_handler=db_handler,
            deleted_toxic_score=deleted_toxic_score,
            deleted_filter_violations=deleted_filter_violations,
        )

        print("\n" + "=" * 60)
        if update_success:
            print("[SUCCESS] Session record deleted and Summary_Sessions updated!")
        else:
            # Do not claim the summary was updated when the helper said it was not.
            print("[WARNING] Record deleted but Summary_Sessions update failed")
        print("=" * 60)
        return True

    except Exception as e:
        print(f"\n[ERROR] Error deleting session: {e}")
        import traceback
        print(f"[ERROR] Traceback: {traceback.format_exc()}")
        return False
    finally:
        # Single close point for every exit path above.
        db_handler.close()


In [None]:
# Example: Delete a session
# Uncomment and modify the session_id below to delete a session
# Note: session_id is the value stored in the 'id' column in the database tables
# You can get this value from the list_sessions() function output (shown as "Session ID")

# delete_session(
#     session_id="12",  # This is the 'id' column value from the table
#     db_write_allowed=USE_DYNAMODB
# )

Deleting Session Record
Session ID: 12
Backend: DynamoDB
[INFO] Running locally, looking for credentials in file
[OK] AWS DynamoDB connection established

[1] Loading record with session_id 12 from session_responses...
[OK] Found record:
     Session ID: 12
     User ID: 60b7e7a2-0c76-4593-a6e6-e45fef8cf874
     Boyfriend Name: Test1
     Toxic Score: 0.26752
     Filter Violations: 0

[2] Deleting records with session_id 12 from all tables...
[OK] Deleted record with id=12 from session_responses
[OK] Deleted from session_responses
[OK] Deleted record with id=12 from session_gtk_responses
[OK] Deleted from session_gtk_responses
[OK] Deleted record with id=12 from session_feedback
[OK] Deleted from session_feedback
[OK] Deleted record with id=12 from session_toxicity_rating
[OK] Deleted from session_toxicity_rating
[INFO] No record found in session_insights (may not exist for this session)

[3] Updating Summary_Sessions...
[ERROR] Error updating DynamoDB record: Float types are not supp

True

## Update a Record

Updates a record in a specific table by session_id.


In [5]:
def update_session_record(
    table_name: str,
    session_id: int,
    update_data: Dict[str, Any],
    db_write_allowed: bool = True
) -> bool:
    """
    Update a record in a specific table by session_id.

    Args:
        table_name: Name of the table to update
        session_id: Session ID of the record to update (int, or a string
                    representation of an int, as accepted by delete_session)
        update_data: Dictionary of fields to update; must not be empty
        db_write_allowed: If True, use DynamoDB; if False, use CSV

    Returns:
        True if update was successful, False otherwise
    """
    # Validate inputs before opening a backend connection.
    try:
        session_id_int = int(session_id)
    except (ValueError, TypeError):
        print(f"[ERROR] session_id must be a number (int or string representation of int). Got: {type(session_id)} = {session_id}")
        return False

    if not update_data:
        print("[ERROR] update_data is empty - nothing to update")
        return False

    print("=" * 60)
    print("Updating Session Record")
    print("=" * 60)
    print(f"Table: {table_name}")
    print(f"Session ID: {session_id_int}")
    print(f"Backend: {'DynamoDB' if db_write_allowed else 'CSV'}")
    print("=" * 60)

    db_handler = DatabaseHandler(db_write_allowed=db_write_allowed)

    try:
        # Check if record exists
        print(f"\n[1] Checking if record exists in {table_name}...")
        table_data = db_handler.load_table(table_name)

        if table_data.empty:
            print(f"[ERROR] Table '{table_name}' is empty")
            return False

        # Match on a numeric view of 'id' (same approach as delete_session) so
        # ids loaded as strings still compare equal to the requested id.
        id_values = pd.to_numeric(table_data["id"], errors='coerce')
        record = table_data[id_values == session_id_int]

        if record.empty:
            print(f"[ERROR] Record with session_id={session_id_int} not found in {table_name}")
            return False

        print(f"[OK] Found record in {table_name}")
        print(f"     Current values: {dict(record.iloc[0].head(5))}...")

        # Update the record
        print("\n[2] Updating record...")
        print(f"     Fields to update: {list(update_data.keys())}")

        db_handler.update_record(
            table_name=table_name,
            key_dict={"id": session_id_int},
            update_dict=update_data
        )

        print("[OK] Record updated successfully!")

        print("\n" + "=" * 60)
        print("[SUCCESS] Record updated!")
        print("=" * 60)
        return True

    except Exception as e:
        print(f"\n[ERROR] Error updating record: {e}")
        import traceback
        print(f"[ERROR] Traceback: {traceback.format_exc()}")
        return False
    finally:
        # Single close point for every exit path above.
        db_handler.close()


In [None]:
# Example: Update a record
# Uncomment and modify the values below to update a record

# update_session_record(
#     table_name="session_responses",
#     session_id=123456789,
#     update_data={
#         "toxic_score": 0.75,
#         "filter_violations": 2
#     },
#     db_write_allowed=USE_DYNAMODB
# )


## List Sessions

Lists all sessions or sessions for a specific user.


In [None]:
def list_sessions(user_id: Optional[str] = None, db_write_allowed: bool = True) -> None:
    """
    Print the sessions stored in session_responses, optionally for one user.

    Args:
        user_id: Optional user ID to filter by. If None, lists all sessions.
        db_write_allowed: If True, use DynamoDB; if False, use CSV
    """
    rule = "=" * 60
    print(rule)
    print("Listing Sessions")
    print(rule)
    print(f"Filter: user_id = {user_id}" if user_id else "Filter: All sessions")
    print(rule)

    # (label, column) pairs, in the order they are printed for each session.
    displayed_fields = (
        ("Session ID", "id"),
        ("User ID", "user_id"),
        ("Name", "name"),
        ("Boyfriend Name", "boyfriend_name"),
        ("Toxic Score", "toxic_score"),
        ("Filter Violations", "filter_violations"),
        ("Language", "language"),
        ("Session Start", "session_start_time"),
    )

    db_handler = DatabaseHandler(db_write_allowed=db_write_allowed)

    try:
        all_sessions = db_handler.load_table("session_responses")

        if all_sessions.empty:
            print("[INFO] No sessions found")
            db_handler.close()
            return

        # Narrow to one user only when a (truthy) user_id was supplied.
        matching = all_sessions[all_sessions["user_id"] == user_id] if user_id else all_sessions

        if matching.empty:
            print(f"[INFO] No sessions found for user_id={user_id}")
            db_handler.close()
            return

        print(f"\n[OK] Found {len(matching)} session(s):\n")

        position = 0
        for _, row in matching.iterrows():
            position += 1
            print(f"Session {position}:")
            for label, column in displayed_fields:
                print(f"  {label}: {row.get(column)}")
            print()

        db_handler.close()

    except Exception as e:
        print(f"\n[ERROR] Error listing sessions: {e}")
        import traceback
        print(f"[ERROR] Traceback: {traceback.format_exc()}")
        db_handler.close()


In [None]:
# Example: List all sessions
# Uses the backend selected by USE_DYNAMODB, so the Configuration cell must
# have been run before this one.
list_sessions(db_write_allowed=USE_DYNAMODB)


## Reassign Session IDs and Recalculate Summary_Sessions

This section:
1. Groups records by (user_id, test_date, boyfriend_name) combination
2. Assigns the same session_id to all records with matching combinations across all CSV files
3. Recalculates Summary_Sessions based on the updated data


In [1]:
import sys
from pathlib import Path
import pandas as pd
from decimal import Decimal
from datetime import datetime
import hashlib

# Add project root to path
project_root = Path().resolve().parent.parent
sys.path.insert(0, str(project_root))

from src.utils.session_id_generator import generate_session_id
from src.utils.constants import DATE_FORMAT, CSV_SEPARATOR

# Configuration
# Folder (under the project root) holding the session CSV files and the
# backup/ directory created by the save functions below.
DATA_DIR = project_root / "data"
# Table name -> CSV filename inside DATA_DIR; drives which files are loaded,
# saved, and reported on by the reassignment helpers.
CSV_FILES = {
    "session_responses": "session_responses.csv",
    "session_gtk_responses": "session_gtk_responses.csv",
    "session_feedback": "session_feedback.csv",
    "session_toxicity_rating": "session_toxicity_rating.csv",
    "session_insights": "session_insights.csv",
}


In [2]:
def generate_session_id_with_date(user_id: str, boyfriend_name: str, test_date: str) -> int:
    """
    Derive a deterministic session_id from user_id, boyfriend_name and test_date.

    user_id and boyfriend_name are lower-cased and stripped, test_date is only
    stripped; the three parts are joined with underscores and hashed with
    SHA-256.

    Args:
        user_id: Unique identifier for the user
        boyfriend_name: Name of the boyfriend being rated
        test_date: Test date (session date)

    Returns:
        An integer in the range 1 .. 2**31 - 2
    """
    parts = (
        str(user_id).lower().strip(),
        str(boyfriend_name).lower().strip(),
        str(test_date).strip(),
    )
    digest = hashlib.sha256("_".join(parts).encode('utf-8')).hexdigest()

    # First 8 hex digits (32 bits) reduced modulo 2**31 - 1.
    candidate = int(digest[:8], 16) % (2**31 - 1)

    # 0 is never handed out as an id; map it to 1.
    return candidate if candidate != 0 else 1


In [3]:
def get_test_date_column(df: pd.DataFrame) -> str:
    """Return the name of the first known date column in df, or None if absent."""
    # Priority order: session_start_time is preferred because it is more
    # consistent across tables.
    candidates = ('session_start_time', 'test_date', 'timestamp')
    return next((name for name in candidates if name in df.columns), None)

def normalize_date(date_value):
    """
    Normalize a date value to a string formatted with DATE_FORMAT.

    Args:
        date_value: A pandas Timestamp, a date-like string, or a missing value.

    Returns:
        None when date_value is missing (per pd.isna); the DATE_FORMAT-formatted
        string when the value can be parsed; otherwise the stripped string form
        of the input, unchanged.
    """
    if pd.isna(date_value):
        return None
    # Fallback representation, used when the value cannot be parsed.
    date_str = str(date_value).strip()
    try:
        if isinstance(date_value, pd.Timestamp):
            return date_value.strftime(DATE_FORMAT)
        # Parse the string and re-emit it in the canonical format.
        return pd.to_datetime(date_str).strftime(DATE_FORMAT)
    except Exception:
        # Best-effort by design: unparseable input falls back to the raw
        # string. `Exception` (not a bare except) keeps KeyboardInterrupt and
        # SystemExit from being swallowed.
        return date_str


In [4]:
def detect_csv_separator(file_path):
    """
    Detect a CSV file's separator from its first line.

    Args:
        file_path: Path of the CSV file to inspect.

    Returns:
        ';' if the first line contains a semicolon, else ',' if it contains a
        comma, else CSV_SEPARATOR. CSV_SEPARATOR is also returned when the
        file cannot be read.
    """
    # Only the file access can fail, so only that part is guarded.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
    except Exception:
        # Best-effort fallback; `Exception` rather than a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return CSV_SEPARATOR

    # A semicolon wins whenever present, even if commas are more frequent.
    if ';' in first_line:
        return ';'
    if ',' in first_line:
        return ','
    return CSV_SEPARATOR  # Default

def load_and_prepare_data():
    """
    Read every CSV listed in CSV_FILES from DATA_DIR.

    Returns:
        Dict mapping table name to the loaded DataFrame. Files that are
        missing or fail to load are reported and left out of the result.
    """
    rule = "=" * 60
    print(rule)
    print("Loading CSV Files")
    print(rule)

    frames = {}

    for table_name, filename in CSV_FILES.items():
        csv_path = DATA_DIR / filename
        if csv_path.exists():
            try:
                # Separators are detected per file rather than assumed.
                detected_sep = detect_csv_separator(csv_path)
                frame = pd.read_csv(csv_path, sep=detected_sep)
                print(f"[OK] Loaded {filename}: {len(frame)} records (separator: '{detected_sep}')")
                frames[table_name] = frame
            except Exception as e:
                print(f"[ERROR] Failed to load {filename}: {e}")
        else:
            print(f"[WARNING] File not found: {filename}")

    return frames


In [5]:
def create_session_mapping(data: dict) -> dict:
    """
    Create a mapping of (user_id, test_date, boyfriend_name) -> session_id.

    Every table in `data` that has user_id, boyfriend_name and a date column
    contributes its unique combinations. Rows where any of the three values is
    missing (NaN/None/blank) are skipped with a warning.

    Args:
        data: Dict mapping table name to DataFrame.

    Returns:
        Dictionary mapping (user_id, test_date, boyfriend_name) tuples to session_id
    """
    print("\n" + "=" * 60)
    print("Creating Session ID Mapping")
    print("=" * 60)

    session_mapping = {}

    # Process each table to extract unique combinations
    for table_name, df in data.items():
        if df.empty:
            continue

        # Locate the required columns case-insensitively, ignoring padding.
        user_id_col = None
        boyfriend_name_col = None

        for col in df.columns:
            col_lower = col.lower().strip()
            if col_lower == 'user_id':
                user_id_col = col
            elif col_lower == 'boyfriend_name':
                boyfriend_name_col = col

        if not user_id_col or not boyfriend_name_col:
            print(f"[WARNING] {table_name} missing required columns. Available columns: {list(df.columns[:10])}")
            continue

        # Get test_date column
        date_col = get_test_date_column(df)
        if not date_col:
            print(f"[WARNING] {table_name} has no date column (test_date, session_start_time, or timestamp)")
            continue

        print(f"\nProcessing {table_name}...")

        # Extract unique combinations
        for idx, row in df.iterrows():
            raw_user_id = row[user_id_col]
            raw_boyfriend_name = row[boyfriend_name_col]
            test_date = normalize_date(row[date_col])

            # str(NaN) / str(None) would give the truthy strings "nan"/"None",
            # so missing cells must be detected before converting to text.
            user_id = "" if pd.isna(raw_user_id) else str(raw_user_id).strip()
            boyfriend_name = "" if pd.isna(raw_boyfriend_name) else str(raw_boyfriend_name).strip()

            if not user_id or not boyfriend_name or not test_date:
                print(f"[WARNING] Row {idx} in {table_name} has missing values, skipping")
                continue

            key = (user_id, test_date, boyfriend_name)

            # Generate session_id if not already in mapping
            if key not in session_mapping:
                session_id = generate_session_id_with_date(user_id, boyfriend_name, test_date)
                session_mapping[key] = session_id
                print(f"  Created mapping: ({user_id[:8]}..., {test_date[:10]}..., {boyfriend_name}) -> {session_id}")

    print(f"\n[OK] Created {len(session_mapping)} unique session mappings")
    return session_mapping


In [6]:
def update_session_ids(data: dict, session_mapping: dict):
    """
    Update session IDs in all dataframes based on the mapping.

    Args:
        data: Dict mapping table name to DataFrame (not modified).
        session_mapping: Dict mapping (user_id, test_date, boyfriend_name)
            tuples to the session_id to write into the 'id' column.

    Returns:
        Dict mapping table name to an updated copy of the DataFrame. Tables
        that are empty or lack the needed columns are returned unchanged.
        Rows with a missing value, or with no entry in the mapping, keep
        their current id.
    """
    print("\n" + "=" * 60)
    print("Updating Session IDs")
    print("=" * 60)

    updated_data = {}

    for table_name, df in data.items():
        if df.empty:
            updated_data[table_name] = df
            continue

        print(f"\nUpdating {table_name}...")
        df = df.copy()  # never write into the caller's frame

        # Locate the required columns case-insensitively, ignoring padding.
        user_id_col = None
        boyfriend_name_col = None

        for col in df.columns:
            col_lower = col.lower().strip()
            if col_lower == 'user_id':
                user_id_col = col
            elif col_lower == 'boyfriend_name':
                boyfriend_name_col = col

        if not user_id_col or not boyfriend_name_col:
            print(f"[WARNING] {table_name} missing required columns, skipping. Available: {list(df.columns[:10])}")
            updated_data[table_name] = df
            continue

        # Get test_date column
        date_col = get_test_date_column(df)
        if not date_col:
            print(f"[WARNING] {table_name} has no date column, skipping")
            updated_data[table_name] = df
            continue

        # Update IDs
        updated_count = 0
        skipped_count = 0

        for idx, row in df.iterrows():
            raw_user_id = row[user_id_col]
            raw_boyfriend_name = row[boyfriend_name_col]
            test_date = normalize_date(row[date_col])

            # str(NaN) / str(None) would give the truthy strings "nan"/"None",
            # so missing cells must be detected before converting to text.
            user_id = "" if pd.isna(raw_user_id) else str(raw_user_id).strip()
            boyfriend_name = "" if pd.isna(raw_boyfriend_name) else str(raw_boyfriend_name).strip()

            if not user_id or not boyfriend_name or not test_date:
                skipped_count += 1
                continue

            key = (user_id, test_date, boyfriend_name)

            if key in session_mapping:
                new_id = session_mapping[key]
                old_id = row.get('id')
                df.at[idx, 'id'] = new_id
                if old_id != new_id:
                    updated_count += 1
            else:
                skipped_count += 1

        print(f"  Updated: {updated_count} records")
        if skipped_count > 0:
            print(f"  Skipped: {skipped_count} records (missing values or no matching session mapping)")

        updated_data[table_name] = df

    return updated_data


In [7]:
def save_updated_csvs(updated_data: dict, backup: bool = True):
    """
    Save updated dataframes to their CSV files in DATA_DIR.

    Files are written with CSV_SEPARATOR, the same separator
    save_summary_sessions uses, instead of pandas' default comma.

    Args:
        updated_data: Dict mapping table name to DataFrame. Tables not listed
            in CSV_FILES are ignored.
        backup: If True, copy each existing file to DATA_DIR/backup/
            as "<filename>.backup" before overwriting it. A file whose backup
            fails is NOT overwritten.
    """
    import shutil

    print("\n" + "=" * 60)
    print("Saving Updated CSV Files")
    print("=" * 60)

    backup_dir = DATA_DIR / "backup"
    if backup:
        # Create backup directory
        backup_dir.mkdir(exist_ok=True)
        print(f"[INFO] Backups will be saved to: {backup_dir}")

    for table_name, df in updated_data.items():
        if table_name not in CSV_FILES:
            continue

        filename = CSV_FILES[table_name]
        file_path = DATA_DIR / filename

        # Backup original if requested
        if backup and file_path.exists():
            try:
                shutil.copy2(file_path, backup_dir / f"{filename}.backup")
                print(f"[OK] Backed up {filename}")
            except Exception as e:
                # Without a backup the original must stay untouched.
                print(f"[ERROR] Failed to back up {filename}, not overwriting it: {e}")
                continue

        # Save updated file
        try:
            df.to_csv(file_path, sep=CSV_SEPARATOR, index=False)
            print(f"[OK] Saved {filename} ({len(df)} records)")
        except Exception as e:
            print(f"[ERROR] Failed to save {filename}: {e}")


In [8]:
def recalculate_summary_sessions(data: dict):
    """
    Recalculate the Summary_Sessions row from session_responses.

    Rows whose toxic_score or filter_violations is not numeric are excluded.
    The input DataFrame is not modified.

    Args:
        data: Dict mapping table name to DataFrame; must contain
            "session_responses" with columns id, toxic_score, filter_violations.

    Returns:
        Dict with the summary fields (summary_id is always 1, the max_id_*
        fields are always 0), or None when session_responses is absent, empty,
        lacks a required column, or has no valid rows.
    """
    print("\n" + "=" * 60)
    print("Recalculating Summary_Sessions")
    print("=" * 60)

    if "session_responses" not in data:
        print("[ERROR] session_responses not found in data")
        return None

    df = data["session_responses"]

    if df.empty:
        print("[WARNING] session_responses is empty")
        return None

    # 'id' is required too: it is what unique sessions are counted on.
    required_cols = ['toxic_score', 'filter_violations', 'id']
    if not all(col in df.columns for col in required_cols):
        print(f"[ERROR] session_responses missing required columns: {required_cols}")
        return None

    # Convert into local series so the caller's frame is left untouched.
    toxic_scores = pd.to_numeric(df['toxic_score'], errors='coerce')
    filter_violations = pd.to_numeric(df['filter_violations'], errors='coerce')

    # Keep only rows where both metrics are valid numbers.
    valid_rows = toxic_scores.notna() & filter_violations.notna()

    if not valid_rows.any():
        print("[WARNING] No valid records after cleaning")
        return None

    toxic_scores = toxic_scores[valid_rows]
    filter_violations = filter_violations[valid_rows]

    # Calculate statistics
    sum_toxic_score = float(toxic_scores.sum())
    max_toxic_score = float(toxic_scores.max())
    min_toxic_score = float(toxic_scores.min())
    avg_toxic_score = float(toxic_scores.mean())

    sum_filter_violations = int(filter_violations.sum())
    avg_filter_violations = float(filter_violations.mean())

    # Count unique sessions (unique id values) among the valid rows
    count_guys = int(df.loc[valid_rows, 'id'].nunique())

    last_update_date = datetime.now().strftime(DATE_FORMAT)

    summary_data = {
        'summary_id': 1,
        'sum_toxic_score': sum_toxic_score,
        'max_toxic_score': max_toxic_score,
        'min_toxic_score': min_toxic_score,
        'avg_toxic_score': avg_toxic_score,
        'sum_filter_violations': sum_filter_violations,
        'avg_filter_violations': avg_filter_violations,
        'count_guys': count_guys,
        # max_id_* fields are kept for backward compatibility and fixed at 0.
        'max_id_session_responses': 0,
        'max_id_gtk_responses': 0,
        'max_id_feedback': 0,
        'max_id_session_toxicity_rating': 0,
        'last_update_date': last_update_date,
    }

    print("[OK] Calculated Summary_Sessions:")
    print(f"  count_guys: {count_guys}")
    print(f"  avg_toxic_score: {avg_toxic_score:.6f}")
    print(f"  sum_toxic_score: {sum_toxic_score:.6f}")
    print(f"  sum_filter_violations: {sum_filter_violations}")

    return summary_data


In [9]:
def save_summary_sessions(summary_data: dict):
    """Write the single-row Summary_Sessions table to its CSV file.

    Nothing happens when ``summary_data`` is None. Otherwise, if
    ``DATA_DIR / "Summary_Sessions.csv"`` already exists it is first copied
    to ``DATA_DIR / "backup" / "Summary_Sessions.csv.backup"`` (the backup
    name is fixed, so an earlier backup is overwritten), and the CSV is then
    rewritten from ``summary_data`` using ``CSV_SEPARATOR`` and no index
    column.

    Args:
        summary_data: Column name -> value mapping for the one summary row,
            or None to skip saving.
    """
    if summary_data is None:
        return

    divider = "=" * 60
    print("\n" + divider)
    print("Saving Summary_Sessions")
    print(divider)

    summary_path = DATA_DIR / "Summary_Sessions.csv"

    # Keep a copy of the previous summary before it is overwritten.
    if summary_path.exists():
        import shutil

        backup_folder = DATA_DIR / "backup"
        backup_folder.mkdir(exist_ok=True)
        shutil.copy2(summary_path, backup_folder / "Summary_Sessions.csv.backup")
        print("[OK] Backed up Summary_Sessions.csv")

    # One dict -> one-row frame, written with the project-wide separator.
    pd.DataFrame([summary_data]).to_csv(summary_path, sep=CSV_SEPARATOR, index=False)
    print("[OK] Saved Summary_Sessions.csv")


In [10]:
def show_changes_summary(data: dict, updated_data: dict, session_mapping: dict):
    """Print a per-table count of session IDs that would change.

    For every table name in ``CSV_FILES`` that is present in both ``data``
    and ``updated_data`` (and non-empty in both, with an ``id`` column in
    both), the ``id`` columns are compared row by row and the number of
    differing rows is printed.

    Rows are compared by position rather than by index label, so a frame
    whose index was rebuilt is still comparable. A row whose ``id`` is
    missing (NaN) in both frames is treated as unchanged; a plain ``!=``
    would count it as changed because NaN never equals NaN. When the two
    frames have different row counts a note is printed instead of a count.

    Args:
        data: Table name -> DataFrame before reassignment.
        updated_data: Table name -> DataFrame after reassignment.
        session_mapping: The session mapping; only its length is reported.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Changes Summary")
    print(banner)

    print(f"\nTotal unique sessions: {len(session_mapping)}")

    print("\nSession ID changes per table:")
    for table_name in CSV_FILES.keys():
        if table_name not in data or table_name not in updated_data:
            continue

        old_df = data[table_name]
        new_df = updated_data[table_name]

        if old_df.empty or new_df.empty:
            continue
        if 'id' not in old_df.columns or 'id' not in new_df.columns:
            continue

        total = len(old_df)
        if len(new_df) != total:
            # A row-by-row comparison is meaningless (and would raise) here.
            print(
                f"  {table_name}: row count changed "
                f"({total} -> {len(new_df)}); ID changes not compared"
            )
            continue

        # Compare positionally so differing index labels cannot raise.
        old_ids = old_df['id'].reset_index(drop=True)
        new_ids = new_df['id'].reset_index(drop=True)

        # NaN != NaN is True; do not report "missing before and after" as a change.
        missing_in_both = old_ids.isna() & new_ids.isna()
        changes = int(((old_ids != new_ids) & ~missing_in_both).sum())
        print(f"  {table_name}: {changes}/{total} records updated")

    print("\n" + banner)

def reassign_session_ids_and_recalculate(dry_run: bool = False):
    """
    Main function to reassign session IDs and recalculate Summary_Sessions.

    Runs the pipeline in order: load data, build the session mapping, apply
    it, print the change summary, and (unless ``dry_run``) save the updated
    CSVs, recalculate Summary_Sessions and save it. The function stops early
    with an ``[ERROR]`` message when no data or no mapping is produced.

    The success banner is only printed when Summary_Sessions was actually
    recalculated. If ``recalculate_summary_sessions`` returns None, a warning
    is printed instead, because at that point the session CSVs have been
    saved but the summary has not been refreshed.

    Args:
        dry_run: If True, only show what would be changed without saving files

    Returns:
        None in every case; progress is reported via printed messages.
    """
    banner = "=" * 60
    print(banner)
    print("Reassign Session IDs and Recalculate Summary_Sessions")
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
    print(banner)

    # Step 1: Load data
    data = load_and_prepare_data()
    if not data:
        print("[ERROR] No data loaded")
        return

    # Step 2: Create session mapping
    session_mapping = create_session_mapping(data)
    if not session_mapping:
        print("[ERROR] No session mappings created")
        return

    # Step 3: Update session IDs
    updated_data = update_session_ids(data, session_mapping)

    # Step 4: Show summary
    show_changes_summary(data, updated_data, session_mapping)

    if dry_run:
        print("\n[INFO] Dry run completed. No files were modified.")
        print("Set dry_run=False to apply changes.")
        return

    # Step 5: Save updated CSVs
    save_updated_csvs(updated_data, backup=True)

    # Step 6: Recalculate Summary_Sessions
    summary_data = recalculate_summary_sessions(updated_data)
    if summary_data is None:
        # Do not claim success: the CSVs above were written, the summary was not.
        print("\n" + banner)
        print("[WARNING] Session CSVs were saved, but Summary_Sessions could not be "
              "recalculated and was NOT updated")
        print(banner)
        return

    # Step 7: Save Summary_Sessions
    save_summary_sessions(summary_data)

    print("\n" + banner)
    print("[SUCCESS] All operations completed!")
    print(banner)


In [12]:
# Run the reassignment process.
# Preview first: with dry_run=True the function prints the planned changes and
# returns before any file is written, so this cell is safe under "Run All".
reassign_session_ids_and_recalculate(dry_run=True)  # Preview changes
# Once the preview looks right, apply it by running the line below instead:
# reassign_session_ids_and_recalculate(dry_run=False)  # Apply changes


Reassign Session IDs and Recalculate Summary_Sessions
Loading CSV Files
[OK] Loaded session_responses.csv: 12 records (separator: ',')
[OK] Loaded session_gtk_responses.csv: 12 records (separator: ',')
[OK] Loaded session_feedback.csv: 10 records (separator: ',')
[OK] Loaded session_toxicity_rating.csv: 12 records (separator: ',')
[OK] Loaded session_insights.csv: 7 records (separator: ',')

Creating Session ID Mapping

Processing session_responses...
  Created mapping: (3013b931..., 2025-03-28..., Berke) -> 1231198605
  Created mapping: (3013b931..., 2025-03-28..., Cemberk) -> 694732827
  Created mapping: (be4b46dc..., 2025-03-29..., Aaron) -> 700972892
  Created mapping: (177bc767..., 2025-03-26..., toksik kaan) -> 1342029424
  Created mapping: (177bc767..., 2025-03-26..., ozan) -> 1315277317
  Created mapping: (1477a67f..., 2025-03-27..., bok) -> 1353492132
  Created mapping: (44ed9413..., 2025-10-21..., sergei) -> 1022067251
  Created mapping: (0ee1c646..., 2025-03-27..., lucas) ->

In [None]:
# Example: List sessions for a specific user
# Uncomment and modify the user_id below

# list_sessions(user_id="your_user_id_here", db_write_allowed=USE_DYNAMODB)
