# Imports

In [None]:
# Standard library imports
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

# Third-party imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import platform

# Configuration
# IPython magic: render matplotlib figures inline in the notebook output.
# (This line is only valid inside IPython/Jupyter, not in a plain script.)
%matplotlib inline
# Apply seaborn's "whitegrid" theme to the plots created below.
sns.set_style("whitegrid")
# Display every column, and up to 100 rows, when a DataFrame is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Constants

In [None]:
# Locations of the raw export and of the cleaned copy written by this notebook
DATA_DIR = Path("data")
INPUT_FILENAME = DATA_DIR.joinpath("PORTRAIT_last.csv")
OUTPUT_FILENAME = DATA_DIR.joinpath("PORTRAIT_last_updated.csv")

# Header texts used to locate columns in the export
RESPONDENT_ID = "respondent_id"
USER_CODE = "Código de usuario"
GENDER = " ".join([
    "¿Con qué género se identifica más usted?",
    "(Selecciona la opción que más te identifique)",
])

# User codes whose rows are dropped entirely
USERS_TO_REMOVE = set(
    "1XIH2 B1Q2C KY12C QO12D S1HA2 XZ21K 21WYJ B21DT".split()
)

# Duplicate submissions to drop, as (user_code, respondent_id) pairs
DUPLICATES_TO_REMOVE = dict([
    ("1H2GG", "118898041284"),
    ("IC21Y", "118919025758"),
    ("B2I1M", "118915917238"),
    ("1Y2ZF", "118877646327"),
])

# Functions

In [None]:
def find_column_index(header_row: pd.Series, target_value: str) -> int:
    """Return the position of the first header cell equal to target_value.

    Args:
        header_row: Pandas Series holding one row of header texts
        target_value: Exact header text to look for

    Returns:
        int: Zero-based position of the first matching cell

    Raises:
        ValueError: If no cell of header_row equals target_value
    """
    headers = header_row.tolist()
    try:
        position = headers.index(target_value)
    except ValueError as e:
        message = (
            f"Header value '{target_value}' not found in DataFrame header.\n"
            f"Available headers: {headers}"
        )
        raise ValueError(message) from e
    return position

def clean_user_code(
    df: pd.DataFrame,
    user_col_idx: int,
    header_rows: int = 3,
) -> None:
    """Strip and upper-case the user codes of the data rows, in place.

    Only rows from ``header_rows`` onward are touched; the header rows above
    them are left as they are. Missing codes stay missing instead of being
    turned into the literal string "NAN" by the string conversion.

    Args:
        df: DataFrame to modify in place
        user_col_idx: Positional index of the user code column
        header_rows: Number of leading rows that hold headers rather than
            participant data (defaults to 3, the layout used in this notebook)
    """
    raw_codes = df.iloc[header_rows:, user_col_idx]
    cleaned = raw_codes.astype(str).str.strip().str.upper()
    # astype(str) would convert NaN/None into text; restore the original
    # missing value wherever the raw cell was missing.
    df.iloc[header_rows:, user_col_idx] = cleaned.where(raw_codes.notna(), raw_codes)

def remove_duplicate_users(
    df: pd.DataFrame,
    user_col_idx: int,
    dupes: Dict[str, str]
) -> pd.DataFrame:
    """Drop the rows that match a listed (user code, respondent ID) pair.

    A row is removed only when its user code AND its respondent ID (first
    column, compared as text) belong to the same entry of ``dupes``. Checking
    the two values independently would also drop a listed user's row that
    happens to carry the respondent ID of a different entry.

    Args:
        df: Input DataFrame
        user_col_idx: Positional index of the user code column
        dupes: Dictionary mapping user codes to the respondent ID to remove

    Returns:
        A copy of ``df`` without the matching rows
    """
    pairs_to_drop = {(code, str(resp_id)) for code, resp_id in dupes.items()}
    user_codes = df.iloc[:, user_col_idx]
    respondent_ids = df[df.columns[0]].astype(str)

    is_listed_duplicate = pd.Series(
        [pair in pairs_to_drop for pair in zip(user_codes, respondent_ids)],
        index=df.index,
        dtype=bool,  # keeps the mask boolean even when df has no rows
    )
    return df[~is_listed_duplicate].copy()

# Data loading

In [None]:
# Ensure data directory exists
# (this only creates a missing folder; the read below still fails if the
# input CSV itself is absent)
DATA_DIR.mkdir(exist_ok=True)

# Read the raw data
# header=None keeps the export's header rows as ordinary rows so that they can
# be addressed by position in the cells below.
print(f"Reading data from {INPUT_FILENAME}...")
df = pd.read_csv(INPUT_FILENAME, header=None)

# Remove last 8 rows (typically metadata)
# NOTE(review): the count is hard-coded; confirm it matches the current export.
df = df.iloc[:-8, :]

# Add sequence numbers as first row
# The new row 0 holds 1..number_of_columns; every original row moves down by one.
seq_df = pd.DataFrame([range(1, df.shape[1] + 1)])
df = pd.concat([seq_df, df], ignore_index=True)

# Display initial data info
print(f"Initial data shape: {df.shape}")
df.head(3)

# Data Cleaning

In [None]:
# Find column indices (row 1 holds the question texts after the sequence row
# was prepended in the loading cell)
try:
    user_col_idx = find_column_index(df.iloc[1], USER_CODE)
    gender_col_idx = find_column_index(df.iloc[1], GENDER)
    print(f"Found columns: user_code at index {user_col_idx}, gender at index {gender_col_idx}")
except ValueError as e:
    # Report the problem in the cell output, then stop: nothing below can run
    # without these two columns.
    print(f"Error finding required columns: {e}")
    raise

# Clean and standardize user codes
clean_user_code(df, user_col_idx)

# Remove duplicate users
df = remove_duplicate_users(df, user_col_idx, DUPLICATES_TO_REMOVE)

# Remove users in the exclusion list
# .copy() makes the filtered frame independent, so the in-place assignment
# below does not trigger pandas' SettingWithCopyWarning.
df = df[~df.iloc[:, user_col_idx].isin(USERS_TO_REMOVE)].copy()

# EXTRA CASES
# Change username 02E1T to O2E1T (leading zero typed instead of the letter O)
df.iloc[3:, user_col_idx] = df.iloc[3:, user_col_idx].replace("02E1T", "O2E1T")

# Save processed data (no index/header: the header rows are part of the data)
df.to_csv(OUTPUT_FILENAME, index=False, header=False)
print(f"Processed data saved to {OUTPUT_FILENAME}")
# The first three rows are header levels, not participants
print(f"Number of participants: {len(df) - 3}")

# Display the cleaned data
df.head()

# Data Analysis

In [None]:
# Example analysis: Basic statistics
if len(df) > 3:  # Ensure we have data
    # Get the data rows (skip first 3 header rows)
    data_rows = df.iloc[3:]

    # Example: Count by gender if gender column exists
    # (gender_col_idx is always set by the cleaning cell when it succeeds, so
    # this check only matters if that cell was changed or skipped)
    if gender_col_idx is not None:
        # Counts are keyed by the raw cell values found in the gender column
        gender_counts = data_rows.iloc[:, gender_col_idx].value_counts()
        print("Gender distribution:")
        print(gender_counts)

        # Simple visualization
        # The figure is created first so the pandas bar plot draws into it.
        plt.figure(figsize=(10, 5))
        gender_counts.plot(kind='bar')
        plt.title('Gender Distribution')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Read updated csv and initialize needed variables

In [None]:
# Re-read the cleaned export written by the cleaning cell
df = pd.read_csv(OUTPUT_FILENAME, header=None)

# The first three rows are header levels rather than participant data:
#   row 0 - header_row
#   row 1 - subheader_row (e.g. "Durante las últimas dos semanas…", "Código de usuario")
#   row 2 - subsubheader_row (e.g. "Little interest…", "Ninguna de las anteriores")
header_row, subheader_row, subsubheader_row = (df.iloc[level] for level in range(3))

# Positions of the user code and gender columns, looked up by question text
user_col_index = find_column_index(subheader_row, USER_CODE)
sex_col_index = find_column_index(subheader_row, GENDER)

# Define all test configurations
# Keys used by the code in this notebook:
#   "header"          - question text searched in the subheader row to find the
#                       first column of the questionnaire
#   "end_marker"      - text that locates the last column
#   "end_marker_row"  - "subsubheader" searches the sub-subheader row for the
#                       end marker; any other value searches the subheader row
#   "end_adjustment"  - optional offset added to the end column index
#   "range"           - (min, max) accepted by the response validation, inclusive
#   "description"     - label used in printed reports
# STAI, BFI and ASSIST carry extra keys that are explained next to them.
TEST_CONFIG = {
    "PHQ": {
        "header": "Durante las últimas dos semanas, ¿con qué frecuencia ha tenido molestias debido a los siguientes problemas?",
        "end_marker": "Si ha marcado cualquiera de los problemas, ¿Qué tanta dificultad le han dado estos problemas para hacer su trabajo, encargarse de las tareas del hogar, o llevarse bien con otras personas?",
        "end_marker_row": "subheader",
        "end_adjustment": -1,  # The marker is the question after PHQ, so step back one column
        "range": (0, 3),
        "description": "Patient Health Questionnaire"
    },
    "BAI": {
        "header": "En el cuestionario hay una lista de síntomas comunes de la ansiedad. Lea cada uno de los ítems atentamente, e indique cuanto le ha afectado en la última semana incluyendo hoy:",
        "end_marker": "Con sudores, fríos o calientes.",
        "end_marker_row": "subsubheader",
        # NOTE(review): BAI items are commonly scored 0-3; confirm that 0-4 matches
        # how this survey codes the answers.
        "range": (0, 4),
        "description": "Beck Anxiety Inventory"
    },
    "OCI": {
        "header": "Escoge la opción que mejor describe CUÁNTO malestar o molestia te ha producido esta experiencia durante el último mes.",
        "end_marker": "Tener con frecuencia pensamientos repugnantes y que le cueste librarse de ellos.",
        "end_marker_row": "subsubheader",
        "range": (0, 4),
        "description": "Obsessive-Compulsive Inventory"
    },
    "STAI": {
        "header": "Lea cada frase y señale la opción que indique mejor cómo se siente en general, en la mayoría de las ocasiones. No hay respuestas buenas ni malas. No emplee demasiado tiempo en cada frase y conteste señalando la respuesta que mejor describa cómo se siente usted generalmente.",
        "end_marker": "Cuando pienso sobre asuntos y preocupaciones actuales me pongo tenso y agitado.",
        "end_marker_row": "subsubheader",
        # Positions (0-based, relative to the first STAI column) whose score is
        # replaced by 3 - score when the totals are computed
        "reverse_items": [0, 5, 6, 9, 12, 15, 18],  # 0-based item numbers
        "range": (0, 3),
        "description": "State-Trait Anxiety Inventory"
    },
    "BFI": {
        "header": "Por favor, valore cada afirmación del cuestionario en una escala del 1 al 5, donde 1 significa \"Muy en desacuerdo\" y 5 \"Muy de acuerdo\".",
        "end_marker": "Es sofisticado en arte, música o literatura.",
        "end_marker_row": "subsubheader",
        "range": (1, 5),
        "description": "Big Five Inventory",
        # 1-based question numbers belonging to each trait; they are converted to
        # absolute column indices once the first BFI column is known
        "subscales": {
            "Extraversion": [1, 6, 11, 16, 21, 26, 31, 36],
            "Agreeableness": [2, 7, 12, 17, 22, 27, 32, 37, 42],
            "Conscientiousness": [3, 8, 13, 18, 23, 28, 33, 38, 43],
            "Neuroticism": [4, 9, 14, 19, 24, 29, 34, 39],
            "Openness": [5, 10, 15, 20, 25, 30, 35, 40, 41, 44]
        },
        # 1-based question numbers scored as 6 - answer within each trait
        "reverse_scored_items": {
            "Extraversion": [6, 21, 31],
            "Agreeableness": [2, 12, 27, 37],
            "Conscientiousness": [8, 18, 23, 43],
            "Neuroticism": [9, 24, 34],
            "Openness": [35, 41]
        }
    },
    "ASSIST": {
        # "header"/"end_header" are looked up in the subheader row,
        # "first_sub"/"last_sub" in the sub-subheader row
        "header": "A lo largo de la vida, ¿cuál de las siguientes sustancias ha consumido alguna vez? (solo que consumió sin receta médica)",
        "end_header": "¿Alguna vez ha consumido alguna droga por vía inyectada? (solo las que consumió sin receta médica)",
        "first_sub": "Ninguna de las anteriores",
        "last_sub": "Response",
        # Question number -> set of response codes accepted by the ASSIST check
        "allowed_codes": {
            2: {0, 2, 3, 4, 6},
            3: {0, 3, 4, 5, 6},
            4: {0, 4, 5, 6, 7},
            5: {0, 5, 6, 7, 8},
            6: {0, 6, 3},
            7: {0, 6, 3}
        }
    }
}

def find_test_indices(header_lookup, test_config):
    """Return the (start, end) column positions of one questionnaire.

    The start column is where ``test_config["header"]`` appears in
    ``header_lookup``. The end column is where ``test_config["end_marker"]``
    appears in the module-level ``subsubheader_row`` (when "end_marker_row" is
    "subsubheader") or ``subheader_row`` (otherwise), shifted by the optional
    "end_adjustment".

    Raises:
        ValueError: If either text cannot be found (raised by find_column_index)
    """
    first_col = find_column_index(header_lookup, test_config["header"])

    # Pick the header level that contains the end marker
    marker_row = (
        subsubheader_row
        if test_config.get("end_marker_row") == "subsubheader"
        else subheader_row
    )
    last_col = find_column_index(marker_row, test_config["end_marker"])

    # A missing "end_adjustment" means no shift
    last_col += test_config.get("end_adjustment", 0)

    return first_col, last_col

# Locate the column span of every questionnaire
test_indices = {}
for test_name, config in TEST_CONFIG.items():
    if test_name != "ASSIST":
        # Standard questionnaires: one contiguous span of columns.
        # A questionnaire that cannot be found only produces a warning here.
        try:
            start, end = find_test_indices(subheader_row, config)
        except ValueError as e:
            print(f"Warning: Could not find {test_name} test: {e}")
        else:
            test_indices[f"{test_name.lower()}_start_index"] = start
            test_indices[f"{test_name.lower()}_end_index"] = end
            print(f"Found {config['description']} ({test_name}) at columns {start}-{end}")
        continue

    # ASSIST needs four anchors: two question texts and two option texts
    assist_start_index = find_column_index(subheader_row, config["header"])
    assist_end_index = find_column_index(subheader_row, config["end_header"])
    assist_first_sub_idx = find_column_index(subsubheader_row, config["first_sub"])
    assist_last_sub_idx = find_column_index(subsubheader_row, config["last_sub"])

    test_indices.update(
        assist_start_index=assist_start_index,
        assist_end_index=assist_end_index,
        assist_first_sub_idx=assist_first_sub_idx,
        assist_last_sub_idx=assist_last_sub_idx,
    )

    # Nine substance columns start right after the first ASSIST column; their
    # names are read from the sub-subheader row (row 2)
    first_substance_index = assist_start_index + 1
    base_cols = [first_substance_index + step for step in range(9)]
    sub_names = df.iloc[2, base_cols].tolist()
    assist_allowed = config["allowed_codes"]

# Add standard column indices
test_indices.update({
    "user_col_index": find_column_index(subheader_row, "Código de usuario"),
    "sex_col_index": find_column_index(
        subheader_row,
        "¿Con qué género se identifica más usted? (Selecciona la opción que más te identifique)",
    ),
})

# Expose every index as a notebook variable (e.g. phq_start_index).
# At the top level of a cell, locals() is the global namespace.
locals().update(test_indices)

# BFI question numbers (1-based) converted to absolute column indices
if 'bfi_start_index' in test_indices:
    bfi_subscales = {
        trait: [test_indices['bfi_start_index'] + (number - 1) for number in numbers]
        for trait, numbers in TEST_CONFIG["BFI"]["subscales"].items()
    }
    reverse_scored_items = TEST_CONFIG["BFI"]["reverse_scored_items"]

# STAI reverse items (0-based)
stai_items_to_reverse = TEST_CONFIG["STAI"]["reverse_items"]



In [None]:
def get_test_description(test_name, test_config, start_idx, end_idx):
    """Build the multi-line header text printed before a test is validated.

    The text lists the test name, its description (or a placeholder), the
    column span and the valid range (defaulting to (0, 3)); every line,
    including the last, ends with a newline.
    """
    lines = [
        f"# Test {test_name}",
        f"- Description: {test_config.get('description', 'No description available')}",
        f"- Columns: {start_idx} to {end_idx}",
        f"- Valid range: {test_config.get('range', (0, 3))}",
    ]
    return "\n".join(lines) + "\n"

def validate_test_responses(df, test_name, start_idx, end_idx, valid_range, start_row=3):
    """
    Check every response of one test against an inclusive numeric range.

    Cells in columns start_idx..end_idx (inclusive) and rows start_row..end are
    inspected row by row. A cell is reported when it is empty, not convertible
    to a number, or outside valid_range.

    Returns: tuple of (passed, issues, detailed_report)
    """
    min_val, max_val = valid_range
    issues = []
    detailed_report = [f"# Test {test_name}"]

    # Refuse to run when either bound points past the last column
    n_columns = len(df.columns)
    if start_idx >= n_columns or end_idx >= n_columns:
        error_msg = f"  - Error: Column indices out of bounds (max: {n_columns-1})"
        issues.append(error_msg)
        detailed_report.append(error_msg)
        return False, issues, detailed_report

    detailed_report.append(f"- Validating columns {start_idx} to {end_idx} (range: {min_val}-{max_val})")

    valid_count = 0
    total_cells = 0

    for row_idx in range(start_row, len(df)):
        for col_idx in range(start_idx, end_idx + 1):
            total_cells += 1
            cell = df.iloc[row_idx, col_idx]

            # Decide what, if anything, is wrong with this cell
            if pd.isna(cell):
                problem = " is empty"
            else:
                try:
                    value = float(cell)
                    in_range = min_val <= value <= max_val
                except (ValueError, TypeError):
                    problem = f": Non-numeric value: {cell}"
                else:
                    if in_range:
                        valid_count += 1
                        problem = None
                    else:
                        problem = f": Value {value} out of range"

            if problem is None:
                continue

            # Row/column numbers are reported 1-based; the label comes from row 0
            header_val = df.iloc[0, col_idx]
            issue = f"  - Row {row_idx+1}, Column {col_idx+1} (header: {header_val}){problem}"
            issues.append(issue)
            detailed_report.append(issue)

    passed = not issues
    if passed:
        detailed_report.append(f"\n✓ All {valid_count} values are valid")
    else:
        detailed_report.append(f"\n✗ Found {len(issues)} issues in {total_cells} cells")

    return passed, issues, detailed_report

def validate_all_tests(df, test_configs, test_indices):
    """Validate every non-ASSIST test and collect one result record per test.

    For each test with known start/end indices the description and the full
    validation report are printed. Tests without indices are skipped with a
    message; ASSIST is ignored because it follows different validation rules.

    Returns:
        dict mapping test name to a record with the keys "passed", "issues",
        "num_issues", "columns", "valid_range" and "report".
    """
    results = {}

    for test_name, config in test_configs.items():
        if test_name == "ASSIST":
            continue

        prefix = test_name.lower()
        start_idx = test_indices.get(f"{prefix}_start_index")
        end_idx = test_indices.get(f"{prefix}_end_index")
        if start_idx is None or end_idx is None:
            print(f"Skipping {test_name}: Missing indices")
            continue

        print(f"\n=== Validating {test_name} ===")
        valid_range = config.get("range", (0, 3))

        # Header block describing what is about to be checked
        print(get_test_description(test_name, config, start_idx, end_idx))

        passed, issues, detailed_report = validate_test_responses(
            df, test_name, start_idx, end_idx, valid_range
        )
        print("\n".join(detailed_report))

        results[test_name] = dict(
            passed=passed,
            issues=issues,
            num_issues=len(issues),
            columns=f"{start_idx}-{end_idx}",
            valid_range=valid_range,
            report=detailed_report,
        )

        if passed:
            status = "✓ PASSED"
        else:
            status = f"✗ FAILED ({len(issues)} issues)"
        print(f"\n{status}")

    return results

# Validate every configured questionnaire
print("=== Starting Validation ===\n")
validation_results = validate_all_tests(df, TEST_CONFIG, test_indices)

# Summary: first the tests that passed, then the ones that failed
print("\n=== Final Validation Summary ===")
print("PASSED TESTS:")
passed_tests = [name for name in validation_results if validation_results[name]["passed"]]
if not passed_tests:
    print("No tests passed validation.")
for name in passed_tests:
    print(f"✓ {name} (columns {validation_results[name]['columns']})")

print("\nFAILED TESTS:")
failed_tests = [name for name in validation_results if not validation_results[name]["passed"]]
if not failed_tests:
    print("All tests passed validation!")
for name in failed_tests:
    result = validation_results[name]
    print(f"✗ {name} (columns {result['columns']}): {result['num_issues']} issues")

# TEST ASSIST

In [None]:
# Validate the ASSIST answers of every participant row (rows 0-2 are headers).
# The loop raises ValueError on the first inconsistency and also normalises
# some follow-up answers in df (see below).
for r in range(3, df.shape[0]):
    # Special case if "Ninguna de las anteriores"
    # When that cell is filled in and equals 0, every column from
    # first_substance_index up to (not including) assist_end_index must be empty.
    if pd.notna(df.iat[r, assist_first_sub_idx]) and int(df.iat[r, assist_first_sub_idx]) == 0:
        if not df.iloc[r][first_substance_index:assist_end_index].isna().all():
            subset = df.iloc[r][first_substance_index:assist_end_index]
            not_na = subset[~subset.isna()]
            raise ValueError(f"Not all values of row {r} between columns {first_substance_index} and {assist_end_index} are NaN: {not_na.to_dict()}")
        continue
    # NOTE(review): this scan starts at assist_start_index, while base_cols (used
    # for scoring) starts at assist_start_index + 1 - confirm which is intended.
    for c in range(assist_start_index, assist_start_index + 9):
        val = df.iat[r, c]
        header = df.iat[2, c]
        if pd.notna(val):
            if header != "Otro (especifique)":
                # If the "next" question (9 columns to the right) is 0, zero out
                # the three follow-ups at +18, +27 and +36 columns.
                # NOTE(review): int() raises if that cell is empty - confirm the
                # export always fills it for a selected substance.
                if int(df.iat[r, c + 9]) == 0:
                    df.iat[r, c + 18] = df.iat[r, c + 27] = df.iat[r, c + 36] = 0
                # Checking all the values for each specific substance (i.e. every next 9 columns)
                for qnum, allowed_vals in assist_allowed.items():
                    sc = c + 9 * (qnum - 1)  # substance column
                    resp = int(df.iat[r, sc])
                    if resp in allowed_vals:
                        continue
                    else:
                        raise ValueError(
                            f"In Question {qnum}, Column {sc} found: {resp}\n"
                            f"assist_allowed Values are: {allowed_vals}"
                        )
# Reached only when no row raised above
print("Passed ASSIST value validation.")

# Results ASSIST

In [None]:
# Per-user ASSIST scores: one dict per participant row, keyed by substance name
results = []
for r in range(3, df.shape[0]):
    username = df.iat[r, user_col_index]
    rec = {"username": username}

    for c, name in zip(base_cols, sub_names):
        # Each question occupies 9 columns, so Q2-Q7 of a substance sit
        # 9, 18, ..., 54 columns to the right of its first column.
        offsets = [9*i for i in range(1,7)]       # Q2–Q7
        if name.startswith("Tabaco"):
            offsets.remove(36)                    # drop Q5
        # Empty cells are skipped rather than counted
        total = sum(int(df.iat[r, c+off])
                    for off in offsets
                    if pd.notna(df.iat[r, c+off]))
        rec[name] = total

    results.append(rec)
assist_scores_df = pd.DataFrame(results)

# Highest score that still counts as "no intervention required".
# Substances not listed here use 3.
low_max = {
  "Tabaco (cigarrillos, tabaco de mascar, puros, etc.)": 3,
  "Alcohol": 10,
}
moderate_max = 26

# The alcohol column is named after the survey option text (for example
# "Bebidas alcohólicas (...)"), which never equals the key "Alcohol"; match it
# by prefix so that alcohol really gets its higher low-risk ceiling.
alcohol_prefixes = ("Alcohol", "Bebidas alcohólicas")

labels = [
  "No requiere intervención",
  "Recibir intervención breve",
  "Tratamiento más intensivo",
]

# Snapshot the score columns first: "_risk" columns are added inside the loop
for col in [c for c in assist_scores_df.columns if c != "username"]:
    if col in low_max:
        lo = low_max[col]
    elif str(col).startswith(alcohol_prefixes):
        lo = low_max["Alcohol"]
    else:
        lo = 3
    sc   = assist_scores_df[col]
    masks = [
      sc <= lo,
      sc.between(lo+1, moderate_max),
      sc >= (moderate_max+1),
    ]
    # "Unknown" is only used when a score matches none of the three bands
    assist_scores_df[f"{col}_risk"] = np.select(masks, labels, default="Unknown")

# Inspect the first rows
assist_scores_df.head(6)

# TOTAL RESULTS

In [None]:
def calculate_test_scores(row, start_idx, end_idx, test_name):
    """Return the integer sum of a row's answers in start_idx..end_idx (inclusive).

    Any failure is re-raised as a ValueError that names the test and the user
    code found at the module-level ``user_col_index``.
    """
    try:
        answers = row[start_idx:end_idx + 1]
        total = answers.astype(float).sum()
        return int(total)
    except Exception as e:
        raise ValueError(f"Error processing {test_name} responses for user '{row[user_col_index]}': {e}")

def process_stai_scores(row, start_idx, end_idx, items_to_reverse, sex_col_index):
    """Return (total, classification) for the STAI answers of one row.

    Answers in start_idx..end_idx (inclusive) are read as numbers; the items
    listed in items_to_reverse (0-based, relative to start_idx) are replaced by
    3 - answer before summing. The classification depends on the raw gender
    code: "3" gives "Binary"; "2" with a total of at least 29, or "1" with a
    total of at least 37, gives "Severe"; everything else is "Low/Normal".

    Any failure is re-raised as a ValueError naming the user code found at the
    module-level ``user_col_index``.
    """
    try:
        scores = row[start_idx:end_idx + 1].astype(float).copy()

        # Flip the reverse-keyed items, leaving unanswered ones untouched
        for item in items_to_reverse:
            if pd.isna(row[start_idx + item]):
                continue
            scores.iloc[item] = 3 - int(scores.iloc[item])

        total = int(scores.sum())
        sex = row[sex_col_index]

        if sex == "3":
            return total, "Binary"

        above_cutoff = (sex == "2" and total >= 29) or (sex == "1" and total >= 37)
        return total, ("Severe" if above_cutoff else "Low/Normal")

    except Exception as e:
        raise ValueError(f"Error processing STAI responses for user '{row[user_col_index]}': {e}")

def process_bfi_scores(row, start_idx, subscales, reverse_items):
    """Return the mean answer per BFI subscale, rounded to two decimals.

    ``subscales`` maps each subscale to absolute column indices. A column is
    reverse-scored (6 - answer) when its 1-based question number, i.e.
    ``col - start_idx + 1``, is listed for that subscale in ``reverse_items``.

    Raises:
        ValueError: For any failure, including non-numeric answers
    """
    try:
        bfi_scores = {}
        for subscale, cols in subscales.items():
            answers = []
            for col in cols:
                try:
                    answer = float(row[col])
                    question_number = col - start_idx + 1
                    if question_number in reverse_items.get(subscale, []):
                        answer = 6 - answer
                    answers.append(answer)
                except ValueError:
                    raise ValueError(f"Invalid BFI value in column {col}: {row[col]}")
            mean_answer = sum(answers) / len(answers)
            bfi_scores[subscale] = round(mean_answer, 2)
        return bfi_scores
    except Exception as e:
        raise ValueError(f"Error processing BFI responses: {e}")

def classify_phq(score):
    """Map a PHQ total to its depression severity label.

    Bands: below 5, below 10, below 15, below 20, and 20 or more.
    """
    bands = (
        (5, "Low"),
        (10, "Mild depression"),
        (15, "Moderate depression"),
        (20, "Moderately severe depression"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "Severe depression"

def classify_bai(score):
    """Map a BAI total to its anxiety label (up to 21, up to 35, above 35)."""
    if score > 35:
        return "Potentially concerning levels of anxiety"
    elif score > 21:
        return "Moderate anxiety"
    elif score <= 21:
        return "Low anxiety"
    # Only reached by values that compare False everywhere (e.g. NaN)
    return "Potentially concerning levels of anxiety"

def classify_oci(score):
    """Return the OCD indication for an OCI total (totals above 21 are flagged)."""
    if score > 21:
        return "Presence of OCD"
    return "No sign of OCD"

# Merge dataframes
# Attach the ASSIST scores and risk labels to the survey rows by user code.
# how="left" keeps every survey row, including the three header rows, which
# simply get empty ASSIST columns.
# NOTE(review): a user code that appears more than once on either side would
# multiply rows here - confirm codes are unique after cleaning.
df = df.merge(
    assist_scores_df,
    left_on=df.columns[user_col_index],
    right_on="username",
    how="left"
)

# Get substance names dynamically
# (score columns only: everything except "username" and the derived
# "_risk"/"_classification" columns)
sub_names = [c for c in assist_scores_df.columns
            if c != "username" and not c.endswith(("_risk", "_classification"))]

print("START")
# Process each user's responses
# Rows 0-2 are header levels, so participants start at row 3.
for idx in range(3, len(df)):
    row = df.iloc[idx]
    username = row[user_col_index]
    try:
        # Calculate test scores
        # The *_start_index/*_end_index names were injected as notebook variables
        # by the index discovery cell.
        phq_total = calculate_test_scores(row, phq_start_index, phq_end_index, "PHQ")
        bai_total = calculate_test_scores(row, bai_start_index, bai_end_index, "BAI")
        oci_total = calculate_test_scores(row, oci_start_index, oci_end_index, "OCI")
        stai_total, stai_classification = process_stai_scores(
            row, stai_start_index, stai_end_index, stai_items_to_reverse, sex_col_index
        )

        # Process BFI scores
        bfi_scores = process_bfi_scores(row, bfi_start_index, bfi_subscales, reverse_scored_items)

        # Classify scores
        # Each entry is (total, label); STAI already carries its own label.
        classifications = {
            "phq": (phq_total, classify_phq(phq_total)),
            "bai": (bai_total, classify_bai(bai_total)),
            "oci": (oci_total, classify_oci(oci_total)),
            "stai": (stai_total, stai_classification)
        }

        # Update dataframe with results
        # Writes <prefix>_total and <prefix>_classification for this row; the
        # columns are created on first use.
        for prefix, (total, classification) in classifications.items():
            df.at[idx, f"{prefix}_total"] = total
            df.at[idx, f"{prefix}_classification"] = classification

        # Add BFI subscale scores
        for subscale, score in bfi_scores.items():
            df.at[idx, f"BFI_{subscale}"] = score

        # Print report
        report = (
            f"{username}: PHQ Total = {phq_total} ({classifications['phq'][1]}), "
            f"OCI Total = {oci_total} ({classifications['oci'][1]}), "
            f"BAI Total = {bai_total} ({classifications['bai'][1]}), "
            f"STAI Total = {stai_total} ({stai_classification}), "
            f"BFI Scores = {bfi_scores}"
        )
        print(report)

    except ValueError as e:
        # A user whose answers cannot be scored is reported and skipped; that
        # row keeps no values in the score columns.
        print(f"Error processing {username}: {str(e)}")
        continue

# Binary classes for BFI

In [None]:
def classify_bfi_scores(df, bfi_subscales, median_cutoffs=None):
    """
    Classify BFI scores as Low/High based on median cutoffs.

    For every subscale the column ``BFI_<subscale>`` is compared with its
    cutoff: scores below the cutoff are "Low", all others "High". Rows without
    a score (header rows, users whose scoring failed) get no label instead of
    being reported as "High".

    Args:
        df: DataFrame containing the BFI_<subscale> score columns; the
            classification columns are added to it in place
        bfi_subscales: Iterable of subscale names (for a dict, its keys)
        median_cutoffs: Optional dictionary of custom median cutoffs. If None, uses default values.

    Returns:
        The same DataFrame, with one BFI_<subscale>_classification column per subscale

    Raises:
        KeyError: If a subscale has no score column or no cutoff
    """
    # Default cutoffs taken from:
    # https://www.researchgate.net/figure/Descriptive-statistics-for-the-variables-included-in-Study-3_tbl4_362005430
    if median_cutoffs is None:
        median_cutoffs = {
            "Extraversion": 3.42,
            "Agreeableness": 3.87,
            "Conscientiousness": 3.40,
            "Neuroticism": 3.13,
            "Openness": 2.98
        }

    for subscale in bfi_subscales:
        col_name = f"BFI_{subscale}"
        cutoff = median_cutoffs[subscale]
        scores = df[col_name]
        labels = scores.apply(lambda x, c=cutoff: "Low" if x < c else "High")
        # NaN < cutoff is False, which would label missing scores "High";
        # blank those labels out again.
        df[f"{col_name}_classification"] = labels.where(scores.notna())

    return df

# Example usage:
# df = classify_bfi_scores(df, bfi_subscales)

# Add the Low/High label columns for every BFI subscale
df = classify_bfi_scores(df, bfi_subscales)

# Optional: print each participant's BFI scores and labels (rows 0-2 are headers)
for idx, row in df.iloc[3:].iterrows():
    username = row[user_col_index]
    bfi_scores = {trait: row[f"BFI_{trait}"] for trait in bfi_subscales}
    bfi_class = {trait: row[f"BFI_{trait}_classification"] for trait in bfi_subscales}
    print(f"{username}: BFI Scores = {bfi_scores}, BFI Class = {bfi_class}")

In [None]:
# Participant rows only (rows 0-2 hold the header levels)
responses_df = df[3:]

# Output file names, written to the current working directory
output_files = dict(
    excel='results_surveyMonkey_Processed.xlsx',
    csv='results_surveyMonkey_Processed.csv',
)

# Start the results table with the identifying columns, then add the totals
# and labels of the four questionnaires
results_cols = {
    'username': responses_df.iloc[:, user_col_index],
    'gender': responses_df.iloc[:, sex_col_index],
}
results_cols.update({
    col: responses_df[col]
    for col in (
        'phq_total', 'phq_classification',
        'oci_total', 'oci_classification',
        'bai_total', 'bai_classification',
        'stai_total', 'stai_classification',
    )
})
# Add any additional assist columns here if needed (example):
# "assist_column": responses_df["assist_column_name"]

results_df = pd.DataFrame(results_cols)

# Add BFI columns (score and Low/High label per subscale)
for sub in bfi_subscales:
    results_df[[f'BFI_{sub}', f'BFI_{sub}_classification']] = responses_df[[f'BFI_{sub}', f'BFI_{sub}_classification']]

# Merge with assist scores and export
results_df = results_df.merge(assist_scores_df, on='username', how='left')
results_df.to_excel(output_files['excel'], index=False)
results_df.to_csv(output_files['csv'], index=False)

# Network location the CSV is meant to be copied to (platform dependent)
dest_path = (
    r'W:\Portrait\SVM\data\results_surveyMonkey_Processed.csv'
    if platform.system() == 'Windows'
    else "/Volumes/mgialou/Portrait/SVM/data/results_surveyMonkey_Processed.csv"
)

# The copy itself is currently disabled; only the destination is reported
#shutil.copy2("results_surveyMonkey_Processed.csv", CSV_PATH)

print(f"Results exported to {output_files['excel']} and {output_files['csv']}")
print(f"CSV file would be copied to: {dest_path}")


# Visualizations

In [None]:
# Select the BFI columns plus "username" with a boolean mask over the column
# names. (Only the last expression of a cell is displayed, so the preview on
# the next-but-one line is computed but not shown.)
mask = responses_df.columns.str.startswith('BFI') | responses_df.columns.str.startswith('username')
responses_df.loc[:, mask].head(50)
# From here on df holds participant rows only; its index labels still start at 3.
df = responses_df
len(df)

In [None]:
# Gender codes of the participants as integers.
# df.loc[3:, ...] is label-based: it selects the rows whose index label is 3 or more.
sex_codes = df.loc[3:, sex_col_index].dropna().astype(int)
gender_map = {1: "Female", 2: "Male", 3: "Non-binary"}
genders = sex_codes.map(gender_map)

# Fixed display order; categories without respondents are shown with a count of 0
order = ["Female", "Male", "Non-binary"]
counts = genders.value_counts().reindex(order, fill_value=0)
print("Counts of gender: ", counts)

# Bar chart of respondents per gender
plt.figure(figsize=(6, 4))
plt.bar(counts.index, counts.values, edgecolor='black')
plt.title("Gender Distribution of Respondents")
plt.ylabel("Number of Users")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Make sure these columns exist (you can do this before the loop)
new_cols = [
  "phq_total","phq_classification",
  "oci_total","oci_classification",
  "bai_total","bai_classification",
  "stai_total","stai_classification"
] + [f"BFI_{sub}" for sub in bfi_subscales]
# plus all ASSIST score and risk columns are already in df from the merge
# Numeric totals

# assemble totals into one DataFrame
totals = df.loc[3:, [f for f in ['phq_total','bai_total','oci_total','stai_total']]].astype(int)
print(totals)
plt.figure(figsize=(10,6))
for i, col in enumerate(totals.columns, 1):
    plt.subplot(2,2,i)
    plt.hist(totals[col], bins=12, edgecolor='black')
    plt.title(col.replace('_',' ').upper())
    plt.xlabel('Score')
    plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# List of your classification columns and plot titles
class_cols = [
    'phq_classification',
    'bai_classification',
    'oci_classification',
    'stai_classification'
]
titles = [
    'PHQ-9 Depression Severity',
    'BAI Anxiety Level',
    'OCI-R OCD Classification',
    'STAI Anxiety Classification'
]

# Make the 2×2 plot
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()

for ax, col, title in zip(axes, class_cols, titles):
    # count each category
    counts = df.loc[3:, col].value_counts().sort_index()
    ax.bar(counts.index, counts.values, edgecolor='black')
    ax.set_title(title)
    ax.set_ylabel('Number of Users')
    ax.tick_params(axis='x', rotation=45, labelsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of the four questionnaire totals side by side.
# Relies on `totals` from the histogram cell.
# NOTE(review): the `tick_labels` keyword needs a recent matplotlib release -
# confirm the minimum version in use.
plt.figure(figsize=(8,5))
plt.boxplot([totals[c] for c in totals.columns], tick_labels=[c.upper() for c in totals.columns])
plt.title("Distribution of Psychometric Assesment Scores")
plt.ylabel("Score")
plt.show()

In [None]:
#sub_names = ['Tabaco (cigarrillos, tabaco de mascar, puros, etc.)', 'Bebidas alcohólicas (cerveza, vinos, licores, etc.)']
# reshape into long form: one row per (username, substance) with its score,
# joined with the matching risk label (the "_risk" suffix is stripped so both
# sides share the same substance name)
assist_long = assist_scores_df.melt(id_vars='username',
    value_vars=sub_names,
    var_name='sustancia', value_name='score'
).merge(
    assist_scores_df.melt(id_vars='username',
        value_vars=[f"{s}_risk" for s in sub_names],
        var_name='tmp', value_name='risk'
    ).assign(sustancia=lambda d: d['tmp'].str.replace('_risk',''))[['username','sustancia','risk']],
    on=['username','sustancia']
)

# Filter out individuals with a score of 0 on the substances
assist_long_filtered = assist_long[assist_long['score'] > 0]

# count users per substance and risk category
#counts = assist_long.groupby(['sustancia','risk']).size().unstack(fill_value=0)
counts = assist_long_filtered.groupby(['sustancia','risk']).size().unstack(fill_value=0)

# copy the table and shorten each label to its first word
counts_short = counts.copy()
counts_short.index = counts_short.index.str.split().str[0]

# first figure: stacked bars using the shortened labels
ax = counts_short.plot(
    kind='bar',
    stacked=True,
    figsize=(10,6)
)
plt.title("Distribución de categorías de riesgo ASSIST por sustancia")
ax.legend(title="Riesgo", bbox_to_anchor=(1,1))


# second figure: the same chart with the full substance names
counts.plot(kind='bar', stacked=True, figsize=(10,6))
plt.ylabel("Número de usuarios")
plt.title("Distribución de categorías de riesgo ASSIST por sustancia")
plt.legend(title="Riesgo", bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()



In [None]:
# assuming you've merged bfi subscale columns (Extraversion, Agreeableness, …) into df
bfi_cols = ["BFI_Extraversion","BFI_Agreeableness","BFI_Conscientiousness",
            "BFI_Neuroticism","BFI_Openness"]

plt.figure(figsize=(8,5))
plt.boxplot(
    [df.loc[3:, trait].astype(float) for trait in bfi_cols],
    tick_labels=bfi_cols,
    notch=True
)
plt.xticks(rotation=15)
plt.ylabel("Mean Score")
plt.title("BFI Subscale Distributions")
plt.tight_layout()
plt.show()

In [None]:
# collect a numeric matrix
num_cols = ["phq_total","bai_total","oci_total","stai_total"] + bfi_cols
mat = df.loc[3:, num_cols].astype(float).corr()

plt.figure(figsize=(6,5))
im = plt.imshow(mat, aspect="auto")
plt.colorbar(im)
plt.xticks(range(len(num_cols)), num_cols, rotation=45, ha="right")
plt.yticks(range(len(num_cols)), num_cols)
plt.title("Correlation Matrix")
# annotate the cells
for i in range(len(num_cols)):
    for j in range(len(num_cols)):
        plt.text(j, i, f"{mat.iat[i,j]:.2f}", ha="center", va="center", fontsize=8, color="white" if abs(mat.iat[i,j])>0.5 else "black")
plt.tight_layout()
plt.show()
