# Set Up

In [1]:
import pandas as pd
import numpy as np
import random

# One seed value shared by both random number generators used in this notebook
SEED = 1234
random.seed(SEED)      # Python's built-in generator
np.random.seed(SEED)   # NumPy's global generator


# Read the raw data set and display it
data = pd.read_csv('data.csv')
data

Unnamed: 0,case_id,age,sex,ethnicity,rel_acquaintance,rel_friend,rel_roommate,rel_boygirlfriend,rel_relative,rel_other,...,tneo_n3_dep,tneo_n4_sel,tneo_n5_imp,tneo_n6_vul,tneo_o1_fan,tneo_o2_aes,tneo_o3_fee,tneo_o4_act,tneo_o5_ide,tneo_o6_val
0,1,27.0,M,2.0,,,,,,,...,51.25,40.18,64.00,55.10,46.64,46.97,66.7,57.07,41.98,58.04
1,2,26.0,M,3.0,,,,,,,...,69.63,60.64,66.27,65.31,54.84,56.44,51.7,51.63,51.90,45.78
2,3,24.0,F,4.0,,,,,,,...,60.44,74.27,54.91,65.31,75.33,56.44,56.7,40.76,51.90,58.04
3,4,33.0,M,3.0,,1.0,,,,,...,67.79,58.36,64.00,52.55,54.84,50.76,36.7,65.22,63.81,58.04
4,5,23.0,F,5.0,,,,,,,...,62.28,67.45,41.27,60.20,50.74,48.86,49.2,46.20,38.02,38.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,466,20.0,F,3.0,,,,,,,...,38.38,40.18,23.09,27.04,56.89,73.48,69.2,57.07,71.75,67.84
466,467,27.0,M,4.0,,,,,,,...,76.99,72.00,68.55,75.51,71.23,75.38,61.7,67.93,67.78,58.04
467,468,21.0,F,3.0,,,,,,,...,54.93,62.91,39.00,62.76,52.79,62.12,69.2,59.78,47.94,62.94
468,469,20.0,F,3.0,,,,,,,...,47.57,37.91,57.18,39.80,63.03,45.08,36.7,59.78,41.98,55.59


In [2]:
# Mapping from source item name to the name of the keyed ("reversed_") column.
# A trailing 'R' on the source name marks an item that has to be reverse coded;
# the item numbers carrying that marker are listed once here.
_REVERSE_KEYED_ITEMS = {
    3, 4, 5, 8, 9, 11, 12, 16, 17, 22, 23, 24, 25, 26, 28,
    29, 30, 31, 36, 37, 42, 44, 45, 47, 48, 49, 50, 51, 55, 58,
}
reverse_coding_map = {
    f"bfi{number}{'R' if number in _REVERSE_KEYED_ITEMS else ''}": f"reversed_bfi{number}"
    for number in range(1, 61)
}

# Build the keyed columns: items flagged with a trailing 'R' are flipped (x -> 6 - x),
# every other item is copied across unchanged.
for source_name, target_name in reverse_coding_map.items():
    is_reverse_keyed = source_name.endswith('R')
    # The 'R' is only a flag in the map; the column in `data` is named without it.
    raw_column = source_name[:-1] if is_reverse_keyed else source_name
    data[target_name] = (6 - data[raw_column]) if is_reverse_keyed else data[raw_column]

# Show the frame, now including the new reversed_* columns
data

Unnamed: 0,case_id,age,sex,ethnicity,rel_acquaintance,rel_friend,rel_roommate,rel_boygirlfriend,rel_relative,rel_other,...,reversed_bfi51,reversed_bfi52,reversed_bfi53,reversed_bfi54,reversed_bfi55,reversed_bfi56,reversed_bfi57,reversed_bfi58,reversed_bfi59,reversed_bfi60
0,1,27.0,M,2.0,,,,,,,...,5,5,4,3,2,5,5,2,1,1
1,2,26.0,M,3.0,,,,,,,...,2,4,4,4,4,4,3,2,4,4
2,3,24.0,F,4.0,,,,,,,...,2,5,3,4,5,2,4,5,1,4
3,4,33.0,M,3.0,,1.0,,,,,...,4,4,2,5,4,4,4,2,2,4
4,5,23.0,F,5.0,,,,,,,...,3,4,4,2,3,3,3,4,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,466,20.0,F,3.0,,,,,,,...,5,5,5,2,5,2,4,3,1,4
466,467,27.0,M,4.0,,,,,,,...,2,4,3,4,5,3,5,2,3,3
467,468,21.0,F,3.0,,,,,,,...,2,5,5,2,4,5,5,5,2,4
468,469,20.0,F,3.0,,,,,,,...,4,5,4,1,4,5,5,4,2,5


# Create functions 

In [3]:
# Write the current frame (including the reversed_* columns) to bfi2.csv without the row index
data.to_csv(path_or_buf='bfi2.csv', index=False)

In [4]:
def domain_stats(df, domain_prefix):
    """
    Summarise every column whose name begins with the given prefix.

    Parameters:
    - df (pd.DataFrame): Frame holding the columns to summarise.
    - domain_prefix (str): A column is selected when its name starts with this string.

    Returns:
    - pd.DataFrame: Two rows labelled "Mean" and "Standard Deviation" (in that order),
      with one column per selected input column. The standard deviation is whatever
      ``DataFrame.std()`` returns with its default arguments.
    """
    matching = [name for name in df.columns if name.startswith(domain_prefix)]
    selected = df[matching]

    # Insertion order fixes the row order of the result
    summary_rows = {
        "Mean": selected.mean(),
        "Standard Deviation": selected.std(),
    }
    return pd.DataFrame(list(summary_rows.values()), index=list(summary_rows.keys()))

In [5]:
def domain_correlation(df, domain_prefix):
    """
    Correlate the first three columns whose name begins with the given prefix.

    Parameters:
    - df (pd.DataFrame): Frame holding the columns to correlate.
    - domain_prefix (str): A column is a candidate when its name starts with this string.

    Returns:
    - pd.DataFrame: Correlation matrix (``DataFrame.corr()`` defaults) of the first
      three matching columns, in the order they appear in ``df``. It is smaller than
      3x3 when fewer than three columns match.
    """
    # Only the leading three matches are used; later matches are ignored
    leading_three = [name for name in df.columns if name.startswith(domain_prefix)][:3]
    pairwise = df[leading_three].corr()

    # At most three columns were selected, so this slice just pins the upper bound
    return pairwise.iloc[:3, :3]

In [6]:
# Domains with their facets defined.
# Every facet holds four keyed item columns whose item numbers are 15 apart, so each
# facet is described here by its first item number only.
_facet_first_items = {
    "Extraversion": (1, 6, 11),
    "Agreeableness": (2, 7, 12),
    "Conscientiousness": (3, 8, 13),
    "Neuroticism": (4, 9, 14),
    "Openness": (10, 5, 15),
}
domains = {
    domain_name: [
        [f"reversed_bfi{first + 15 * step}" for step in range(4)]
        for first in first_items
    ]
    for domain_name, first_items in _facet_first_items.items()
}

# Function to calculate average correlation excluding the diagonal
def average_correlation(items, df=None):
    """
    Mean of the pairwise correlations among a set of columns.

    Parameters:
    - items (list of str): Column names to correlate.
    - df (pd.DataFrame, optional): Frame to read the columns from. When omitted, the
      notebook-level ``data`` frame is used, which is the original behaviour.

    Returns:
    - float: Mean of the strictly-upper-triangle entries of ``df[items].corr()``
      (each pair counted once, self-correlations excluded). NaN when fewer than two
      columns are given, because there is no pair to average.
    """
    # Resolve the frame at call time so the default tracks the current global `data`
    frame = data if df is None else df

    corr_matrix = frame[items].corr()

    # k=1 skips the diagonal; the matrix is symmetric so the upper triangle covers every pair once
    pair_correlations = corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]

    if pair_correlations.size == 0:
        # Same value np.mean would produce for an empty array, without the RuntimeWarning
        return np.nan
    return np.mean(pair_correlations)

# For each domain: average the within-facet mean correlations of its facets
domain_avg_correlations = {
    domain_name: np.mean([average_correlation(facet_items) for facet_items in facet_groups])
    for domain_name, facet_groups in domains.items()
}

domain_avg_correlations

{'Extraversion': 0.4610554748141877,
 'Agreeableness': 0.3821996228465807,
 'Conscientiousness': 0.40613896396510524,
 'Neuroticism': 0.5055967450877641,
 'Openness': 0.4570178582963769}

In [7]:
# Grand mean across the per-domain average correlations
average_domain_avg_correlations = np.fromiter(domain_avg_correlations.values(), dtype=float).mean()
average_domain_avg_correlations

0.4424017330020029

In [8]:
def simulate_item_responses(means, std_devs, corr_matrix, intra_group_corr, n_simulations,
                            num_items_per_group=4):
    """
    Simulate item responses based on group characteristics and correlations.

    One score per group is drawn from a multivariate normal for every simulated
    respondent. Each item is that group score plus independent normal noise, rounded
    to the nearest whole number and clipped to the range [1, 5].

    Parameters:
    - means (array-like, length G): Mean of each group score.
    - std_devs (array-like, length G): Standard deviation of each group score.
    - corr_matrix (array-like, G x G): Correlation matrix between the groups
      (a DataFrame is accepted and converted to an array).
    - intra_group_corr (float): Controls the item noise: the noise standard deviation
      for a group is sqrt((1 - intra_group_corr) * std_dev**2). Must not exceed 1.
    - n_simulations (int): Number of simulated respondents (rows).
    - num_items_per_group (int): Items generated per group. Defaults to 4.

    Returns:
    - np.ndarray: Float array of shape (n_simulations, G * num_items_per_group) holding
      whole numbers in [1, 5]. Columns are grouped: all items of group 0 first, then
      group 1, and so on.

    Raises:
    - ValueError: If the input shapes disagree, if num_items_per_group < 1, or if
      intra_group_corr > 1 (which would make the noise variance negative).

    Notes:
    - Random numbers come from NumPy's global generator (np.random), so results are
      reproducible only after np.random.seed(...).
    - NOTE(review): the shared group score has variance std_dev**2 and the noise adds
      (1 - intra_group_corr) * std_dev**2 on top, so before rounding and clipping two
      items of the same group correlate at 1 / (2 - intra_group_corr) rather than at
      intra_group_corr itself. Confirm this is the intended model.
    """
    means = np.asarray(means, dtype=float)
    std_devs = np.asarray(std_devs, dtype=float)
    corr_matrix = np.asarray(corr_matrix, dtype=float)

    num_groups = len(means)
    if std_devs.shape != (num_groups,):
        raise ValueError("std_devs must have one entry per group")
    if corr_matrix.shape != (num_groups, num_groups):
        raise ValueError("corr_matrix must be square with one row/column per group")
    if num_items_per_group < 1:
        raise ValueError("num_items_per_group must be at least 1")
    if not (intra_group_corr <= 1):
        raise ValueError("intra_group_corr must not exceed 1")

    num_items = num_groups * num_items_per_group

    # Construct the covariance matrix from correlations and standard deviations
    cov_matrix = np.outer(std_devs, std_devs) * corr_matrix

    # Generate group-level scores: one row per simulated respondent
    group_scores = np.random.multivariate_normal(means, cov_matrix, size=n_simulations)

    # Per-group standard deviation of the item-specific noise
    item_std_dev_within_group = np.sqrt((1 - intra_group_corr) * std_devs**2)

    item_scores = np.zeros((n_simulations, num_items))

    # Draw the noise one item at a time, group by group, so the sequence of random
    # draws matches the original implementation under a fixed seed.
    for group_index in range(num_groups):
        start_idx = group_index * num_items_per_group
        for i in range(num_items_per_group):
            item_errors = np.random.normal(0, item_std_dev_within_group[group_index], n_simulations)
            item_scores[:, start_idx + i] = group_scores[:, group_index] + item_errors

    # Round to whole numbers and keep them inside the 1-5 response range (dtype stays float)
    bounded_item_scores = np.clip(np.round(item_scores), 1, 5)

    return bounded_item_scores

# Extraversion 

## Extract parameters for Extraversion

In [9]:
# Means and standard deviations of every column whose name starts with 'bfi2_e'
domain_stat_e = domain_stats(df=data, domain_prefix='bfi2_e')
domain_stat_e

Unnamed: 0,bfi2_e_sociability,bfi2_e_assertiveness,bfi2_e_energy_level,bfi2_e
Mean,3.020213,3.278723,3.528191,3.275851
Standard Deviation,0.962858,0.838807,0.74644,0.71443


In [10]:
# Correlations among the first three columns whose name starts with 'bfi2_e'
correlation_e = domain_correlation(df=data, domain_prefix='bfi2_e')
correlation_e

Unnamed: 0,bfi2_e_sociability,bfi2_e_assertiveness,bfi2_e_energy_level
bfi2_e_sociability,1.0,0.593938,0.625359
bfi2_e_assertiveness,0.593938,1.0,0.437577
bfi2_e_energy_level,0.625359,0.437577,1.0


## Simulate item responses for Extraversion

In [11]:
# Simulation inputs for Extraversion: only the first three columns of the summary are used
means = np.array(domain_stat_e.loc['Mean'].iloc[:3])
std_devs = np.array(domain_stat_e.loc['Standard Deviation'].iloc[:3])
corr_matrix = domain_correlation(data, 'bfi2_e')
intra_group_corr = average_domain_avg_correlations  # ~0.44
n_simulations = 200

simulated_data_e = simulate_item_responses(
    means=means,
    std_devs=std_devs,
    corr_matrix=corr_matrix,
    intra_group_corr=intra_group_corr,
    n_simulations=n_simulations,
)

In [12]:
# Display the simulated Extraversion responses
simulated_data_e

array([[3., 4., 2., ..., 4., 2., 3.],
       [4., 4., 3., ..., 2., 3., 4.],
       [3., 3., 2., ..., 3., 4., 4.],
       ...,
       [2., 3., 1., ..., 3., 5., 4.],
       [3., 4., 2., ..., 4., 4., 5.],
       [4., 4., 4., ..., 3., 3., 4.]])

In [13]:
# Column labels for the 12 simulated Extraversion items: three blocks of four,
# item numbers 15 apart within a block (1/16/31/46, 6/21/36/51, 11/26/41/56)
extraversion_name = [
    f"simulated_bfi{first + 15 * step}"
    for first in (1, 6, 11)
    for step in range(4)
]

# Attach the labels to the simulated values
simulated_data_e = pd.DataFrame(data=simulated_data_e, columns=extraversion_name)

simulated_data_e

Unnamed: 0,simulated_bfi1,simulated_bfi16,simulated_bfi31,simulated_bfi46,simulated_bfi6,simulated_bfi21,simulated_bfi36,simulated_bfi51,simulated_bfi11,simulated_bfi26,simulated_bfi41,simulated_bfi56
0,3.0,4.0,2.0,3.0,2.0,2.0,3.0,1.0,3.0,4.0,2.0,3.0
1,4.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,4.0
2,3.0,3.0,2.0,1.0,1.0,2.0,3.0,2.0,4.0,3.0,4.0,4.0
3,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0
4,3.0,2.0,3.0,3.0,1.0,2.0,2.0,3.0,4.0,2.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,3.0,2.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0
196,4.0,5.0,4.0,4.0,5.0,3.0,5.0,4.0,3.0,2.0,3.0,3.0
197,2.0,3.0,1.0,2.0,5.0,4.0,4.0,4.0,5.0,3.0,5.0,4.0
198,3.0,4.0,2.0,3.0,3.0,3.0,5.0,3.0,4.0,4.0,4.0,5.0


# Agreeableness

## Extract parameters for Agreeableness

In [14]:
# Means and standard deviations of every column whose name starts with 'bfi2_a'
domain_stat_a = domain_stats(df=data, domain_prefix='bfi2_a')
domain_stat_a

Unnamed: 0,bfi2_a_compassion,bfi2_a_respectfulness,bfi2_a_trust,bfi2_a
Mean,3.852128,3.953723,3.339894,3.715128
Standard Deviation,0.747559,0.664643,0.775614,0.600391


In [15]:
# Correlations among the first three columns whose name starts with 'bfi2_a'
correlation_a = domain_correlation(df=data, domain_prefix='bfi2_a')
correlation_a

Unnamed: 0,bfi2_a_compassion,bfi2_a_respectfulness,bfi2_a_trust
bfi2_a_compassion,1.0,0.513497,0.529301
bfi2_a_respectfulness,0.513497,1.0,0.501578
bfi2_a_trust,0.529301,0.501578,1.0


## Simulate item responses for Agreeableness

In [16]:
# Simulation inputs for Agreeableness: only the first three columns of the summary are used
means = np.array(domain_stat_a.loc['Mean'].iloc[:3])
std_devs = np.array(domain_stat_a.loc['Standard Deviation'].iloc[:3])
corr_matrix = domain_correlation(data, 'bfi2_a')
intra_group_corr = average_domain_avg_correlations  # ~0.44
n_simulations = 200

simulated_data_a = simulate_item_responses(
    means=means,
    std_devs=std_devs,
    corr_matrix=corr_matrix,
    intra_group_corr=intra_group_corr,
    n_simulations=n_simulations,
)

In [17]:
# Column labels for the 12 simulated Agreeableness items: three blocks of four,
# item numbers 15 apart within a block (2/17/32/47, 7/22/37/52, 12/27/42/57)
agreeableness_name = [
    f"simulated_bfi{first + 15 * step}"
    for first in (2, 7, 12)
    for step in range(4)
]

# Attach the labels to the simulated values
simulated_data_a = pd.DataFrame(data=simulated_data_a, columns=agreeableness_name)
simulated_data_a

Unnamed: 0,simulated_bfi2,simulated_bfi17,simulated_bfi32,simulated_bfi47,simulated_bfi7,simulated_bfi22,simulated_bfi37,simulated_bfi52,simulated_bfi12,simulated_bfi27,simulated_bfi42,simulated_bfi57
0,4.0,4.0,3.0,4.0,4.0,3.0,5.0,5.0,4.0,3.0,3.0,4.0
1,2.0,2.0,2.0,3.0,4.0,4.0,4.0,4.0,2.0,2.0,3.0,2.0
2,3.0,3.0,3.0,2.0,4.0,3.0,4.0,3.0,2.0,2.0,2.0,2.0
3,4.0,3.0,5.0,3.0,3.0,3.0,4.0,3.0,4.0,2.0,2.0,2.0
4,5.0,5.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
195,4.0,5.0,5.0,5.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0,4.0
196,2.0,2.0,4.0,2.0,3.0,3.0,3.0,3.0,1.0,2.0,1.0,1.0
197,3.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,4.0,2.0,3.0,3.0
198,3.0,3.0,4.0,2.0,4.0,4.0,5.0,4.0,3.0,4.0,4.0,4.0


# Conscientiousness

## Extract parameters for Conscientiousness

In [18]:
# Means and standard deviations of every column whose name starts with 'bfi2_c'
domain_stat_c = domain_stats(df=data, domain_prefix='bfi2_c')
domain_stat_c

Unnamed: 0,bfi2_c_organization,bfi2_c_productiveness,bfi2_c_responsibility,bfi2_c
Mean,3.604255,3.337766,3.479255,3.47383
Standard Deviation,0.88145,0.781335,0.677439,0.647734


In [19]:
# Correlations among the first three columns whose name starts with 'bfi2_c'
correlation_c = domain_correlation(df=data, domain_prefix='bfi2_c')
correlation_c

Unnamed: 0,bfi2_c_organization,bfi2_c_productiveness,bfi2_c_responsibility
bfi2_c_organization,1.0,0.523057,0.501303
bfi2_c_productiveness,0.523057,1.0,0.57798
bfi2_c_responsibility,0.501303,0.57798,1.0


## Simulate item responses for Conscientiousness

In [20]:
# Simulation inputs for Conscientiousness: only the first three columns of the summary are used
means = np.array(domain_stat_c.loc['Mean'].iloc[:3])
std_devs = np.array(domain_stat_c.loc['Standard Deviation'].iloc[:3])
corr_matrix = domain_correlation(data, 'bfi2_c')
intra_group_corr = average_domain_avg_correlations  # ~0.44
n_simulations = 200

# Draw the simulated Conscientiousness item responses
simulated_data_c = simulate_item_responses(
    means=means,
    std_devs=std_devs,
    corr_matrix=corr_matrix,
    intra_group_corr=intra_group_corr,
    n_simulations=n_simulations,
)

In [21]:
# Column labels for the 12 simulated Conscientiousness items: three blocks of four,
# item numbers 15 apart within a block (3/18/33/48, 8/23/38/53, 13/28/43/58)
conscientiousness_name = [
    f"simulated_bfi{first + 15 * step}"
    for first in (3, 8, 13)
    for step in range(4)
]

# Attach the labels to the simulated values
simulated_data_c = pd.DataFrame(data=simulated_data_c, columns=conscientiousness_name)
simulated_data_c

Unnamed: 0,simulated_bfi3,simulated_bfi18,simulated_bfi33,simulated_bfi48,simulated_bfi8,simulated_bfi23,simulated_bfi38,simulated_bfi53,simulated_bfi13,simulated_bfi28,simulated_bfi43,simulated_bfi58
0,1.0,2.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0
1,5.0,5.0,5.0,4.0,4.0,3.0,3.0,3.0,4.0,4.0,4.0,5.0
2,3.0,3.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,4.0
3,3.0,2.0,2.0,2.0,4.0,2.0,4.0,4.0,3.0,3.0,3.0,3.0
4,2.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
195,3.0,3.0,3.0,3.0,2.0,4.0,2.0,3.0,4.0,4.0,4.0,4.0
196,4.0,4.0,4.0,5.0,3.0,3.0,2.0,4.0,3.0,4.0,4.0,4.0
197,2.0,2.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,3.0,3.0
198,5.0,4.0,3.0,5.0,4.0,3.0,4.0,2.0,4.0,5.0,4.0,4.0


# Neuroticism

## Extract parameters for Neuroticism

In [22]:
# Means and standard deviations of every column whose name starts with 'bfi2_n'
domain_stat_n = domain_stats(df=data, domain_prefix='bfi2_n')
domain_stat_n

Unnamed: 0,bfi2_n_anxiety,bfi2_n_depression,bfi2_n_emotional_volatility,bfi2_n
Mean,3.425,2.565957,2.743617,2.911638
Standard Deviation,0.841259,0.92303,0.954292,0.768512


In [23]:
# Correlations among the first three columns whose name starts with 'bfi2_n'
correlation_n = domain_correlation(df=data, domain_prefix='bfi2_n')
correlation_n

Unnamed: 0,bfi2_n_anxiety,bfi2_n_depression,bfi2_n_emotional_volatility
bfi2_n_anxiety,1.0,0.558478,0.575573
bfi2_n_depression,0.558478,1.0,0.598226
bfi2_n_emotional_volatility,0.575573,0.598226,1.0


## Simulate item responses for Neuroticism

In [24]:
# Simulation inputs for Neuroticism: only the first three columns of the summary are used
means = np.array(domain_stat_n.loc['Mean'].iloc[:3])
std_devs = np.array(domain_stat_n.loc['Standard Deviation'].iloc[:3])
corr_matrix = domain_correlation(data, 'bfi2_n')
intra_group_corr = average_domain_avg_correlations  # ~0.44
n_simulations = 200

# Draw the simulated Neuroticism item responses
simulated_data_n = simulate_item_responses(
    means=means,
    std_devs=std_devs,
    corr_matrix=corr_matrix,
    intra_group_corr=intra_group_corr,
    n_simulations=n_simulations,
)

In [25]:
# Column labels for the 12 simulated Neuroticism items: three blocks of four,
# item numbers 15 apart within a block (4/19/34/49, 9/24/39/54, 14/29/44/59)
neuroticism_name = [
    f"simulated_bfi{first + 15 * step}"
    for first in (4, 9, 14)
    for step in range(4)
]

# Attach the labels to the simulated values
simulated_data_n = pd.DataFrame(data=simulated_data_n, columns=neuroticism_name)
simulated_data_n

Unnamed: 0,simulated_bfi4,simulated_bfi19,simulated_bfi34,simulated_bfi49,simulated_bfi9,simulated_bfi24,simulated_bfi39,simulated_bfi54,simulated_bfi14,simulated_bfi29,simulated_bfi44,simulated_bfi59
0,5.0,5.0,5.0,4.0,1.0,2.0,3.0,2.0,5.0,4.0,5.0,4.0
1,2.0,4.0,4.0,3.0,3.0,3.0,3.0,2.0,4.0,5.0,5.0,4.0
2,5.0,5.0,5.0,5.0,2.0,2.0,2.0,2.0,5.0,4.0,5.0,4.0
3,4.0,5.0,4.0,4.0,4.0,5.0,3.0,5.0,2.0,2.0,2.0,3.0
4,4.0,4.0,4.0,5.0,3.0,4.0,2.0,3.0,2.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
195,5.0,3.0,4.0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,3.0,4.0
196,3.0,2.0,1.0,1.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0
197,4.0,3.0,5.0,3.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,2.0
198,2.0,2.0,3.0,4.0,3.0,2.0,3.0,2.0,3.0,2.0,4.0,2.0


# Openness

## Extract parameters for Openness

In [26]:
# Means and standard deviations of every column whose name starts with 'bfi2_o'
domain_stat_o = domain_stats(df=data, domain_prefix='bfi2_o')
domain_stat_o

Unnamed: 0,bfi2_o_intellectual_curiosity,bfi2_o_aesthetic_sensitivity,bfi2_o_creative_imagination,bfi2_o
Mean,3.823404,3.573936,3.521277,3.639532
Standard Deviation,0.722532,0.904995,0.774269,0.638165


In [27]:
# Correlations among the first three columns whose name starts with 'bfi2_o'
correlation_o = domain_correlation(df=data, domain_prefix='bfi2_o')
correlation_o

Unnamed: 0,bfi2_o_intellectual_curiosity,bfi2_o_aesthetic_sensitivity,bfi2_o_creative_imagination
bfi2_o_intellectual_curiosity,1.0,0.437393,0.475049
bfi2_o_aesthetic_sensitivity,0.437393,1.0,0.444297
bfi2_o_creative_imagination,0.475049,0.444297,1.0


## Simulate item responses for Openness

In [28]:
# Simulation inputs for Openness: only the first three columns of the summary are used
means = np.array(domain_stat_o.loc['Mean'].iloc[:3])
std_devs = np.array(domain_stat_o.loc['Standard Deviation'].iloc[:3])
corr_matrix = domain_correlation(data, 'bfi2_o')
intra_group_corr = average_domain_avg_correlations  # ~0.44
n_simulations = 200

# Draw the simulated Openness item responses
simulated_data_o = simulate_item_responses(
    means=means,
    std_devs=std_devs,
    corr_matrix=corr_matrix,
    intra_group_corr=intra_group_corr,
    n_simulations=n_simulations,
)

In [29]:
# Column labels for the 12 simulated Openness items: three blocks of four,
# item numbers 15 apart within a block (10/25/40/55, 5/20/35/50, 15/30/45/60)
openness_name = [
    f"simulated_bfi{first + 15 * step}"
    for first in (10, 5, 15)
    for step in range(4)
]

# Attach the labels to the simulated values
simulated_data_o = pd.DataFrame(data=simulated_data_o, columns=openness_name)
simulated_data_o

Unnamed: 0,simulated_bfi10,simulated_bfi25,simulated_bfi40,simulated_bfi55,simulated_bfi5,simulated_bfi20,simulated_bfi35,simulated_bfi50,simulated_bfi15,simulated_bfi30,simulated_bfi45,simulated_bfi60
0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,5.0,3.0,3.0,4.0,3.0
1,3.0,2.0,2.0,3.0,4.0,4.0,3.0,2.0,2.0,3.0,3.0,4.0
2,4.0,4.0,3.0,3.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0
3,5.0,4.0,5.0,5.0,5.0,5.0,4.0,3.0,4.0,4.0,4.0,3.0
4,5.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
195,5.0,5.0,4.0,5.0,5.0,4.0,4.0,2.0,5.0,4.0,4.0,5.0
196,3.0,4.0,3.0,4.0,4.0,5.0,5.0,4.0,4.0,3.0,5.0,3.0
197,5.0,4.0,5.0,4.0,4.0,3.0,3.0,4.0,5.0,4.0,5.0,5.0
198,4.0,1.0,4.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0


# Combine all the simulated data

In [30]:
# Put the five per-domain frames side by side, in the order E, A, C, N, O
simulated_data = pd.concat(
    objs=[
        simulated_data_e,
        simulated_data_a,
        simulated_data_c,
        simulated_data_n,
        simulated_data_o,
    ],
    axis='columns',
)
simulated_data

Unnamed: 0,simulated_bfi1,simulated_bfi16,simulated_bfi31,simulated_bfi46,simulated_bfi6,simulated_bfi21,simulated_bfi36,simulated_bfi51,simulated_bfi11,simulated_bfi26,...,simulated_bfi40,simulated_bfi55,simulated_bfi5,simulated_bfi20,simulated_bfi35,simulated_bfi50,simulated_bfi15,simulated_bfi30,simulated_bfi45,simulated_bfi60
0,3.0,4.0,2.0,3.0,2.0,2.0,3.0,1.0,3.0,4.0,...,4.0,4.0,5.0,4.0,4.0,5.0,3.0,3.0,4.0,3.0
1,4.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,...,2.0,3.0,4.0,4.0,3.0,2.0,2.0,3.0,3.0,4.0
2,3.0,3.0,2.0,1.0,1.0,2.0,3.0,2.0,4.0,3.0,...,3.0,3.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0
3,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,...,5.0,5.0,5.0,5.0,4.0,3.0,4.0,4.0,4.0,3.0
4,3.0,2.0,3.0,3.0,1.0,2.0,2.0,3.0,4.0,2.0,...,5.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,3.0,2.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,...,4.0,5.0,5.0,4.0,4.0,2.0,5.0,4.0,4.0,5.0
196,4.0,5.0,4.0,4.0,5.0,3.0,5.0,4.0,3.0,2.0,...,3.0,4.0,4.0,5.0,5.0,4.0,4.0,3.0,5.0,3.0
197,2.0,3.0,1.0,2.0,5.0,4.0,4.0,4.0,5.0,3.0,...,5.0,4.0,4.0,3.0,3.0,4.0,5.0,4.0,5.0,5.0
198,3.0,4.0,2.0,3.0,3.0,3.0,5.0,3.0,4.0,4.0,...,4.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0


In [31]:
## calculate facet scores
# Each facet score is the average of four simulated items. The dict order is the
# order in which the score columns are appended to the frame.
facet_items = {
    'bfi_e_sociability': (1, 16, 31, 46),
    'bfi_e_assertiveness': (6, 21, 36, 51),
    'bfi_e_energy_level': (11, 26, 41, 56),
    'bfi_a_compassion': (2, 17, 32, 47),
    'bfi_a_respectfulness': (7, 22, 37, 52),
    'bfi_a_trust': (12, 27, 42, 57),
    'bfi_c_organization': (3, 18, 33, 48),
    'bfi_c_productiveness': (8, 23, 38, 53),
    'bfi_c_responsibility': (13, 28, 43, 58),
    'bfi_n_anxiety': (4, 19, 34, 49),
    'bfi_n_depression': (9, 24, 39, 54),
    'bfi_n_emotional_volatility': (14, 29, 44, 59),
    'bfi_o_intellectual_curiosity': (10, 25, 40, 55),
    'bfi_o_aesthetic_sensitivity': (5, 20, 35, 50),
    'bfi_o_creative_imagination': (15, 30, 45, 60),
}

for score_name, item_numbers in facet_items.items():
    item_columns = [simulated_data[f'simulated_bfi{number}'] for number in item_numbers]
    # Start the sum from the first column so the additions run left to right
    simulated_data[score_name] = sum(item_columns[1:], item_columns[0]) / 4

## Domain Scores

# Each domain score is the average of its three facet scores
domain_facets = {
    'bfi_e': ('bfi_e_sociability', 'bfi_e_assertiveness', 'bfi_e_energy_level'),
    'bfi_a': ('bfi_a_compassion', 'bfi_a_respectfulness', 'bfi_a_trust'),
    'bfi_c': ('bfi_c_organization', 'bfi_c_productiveness', 'bfi_c_responsibility'),
    'bfi_n': ('bfi_n_anxiety', 'bfi_n_depression', 'bfi_n_emotional_volatility'),
    'bfi_o': ('bfi_o_intellectual_curiosity', 'bfi_o_aesthetic_sensitivity', 'bfi_o_creative_imagination'),
}

for score_name, facet_names in domain_facets.items():
    facet_columns = [simulated_data[facet_name] for facet_name in facet_names]
    simulated_data[score_name] = sum(facet_columns[1:], facet_columns[0]) / 3

simulated_data

Unnamed: 0,simulated_bfi1,simulated_bfi16,simulated_bfi31,simulated_bfi46,simulated_bfi6,simulated_bfi21,simulated_bfi36,simulated_bfi51,simulated_bfi11,simulated_bfi26,...,bfi_n_depression,bfi_n_emotional_volatility,bfi_o_intellectual_curiosity,bfi_o_aesthetic_sensitivity,bfi_o_creative_imagination,bfi_e,bfi_a,bfi_c,bfi_n,bfi_o
0,3.0,4.0,2.0,3.0,2.0,2.0,3.0,1.0,3.0,4.0,...,2.00,4.50,4.00,4.50,3.25,2.666667,3.833333,2.333333,3.750000,3.916667
1,4.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,...,2.75,4.50,2.50,3.25,3.00,3.250000,2.833333,4.083333,3.500000,2.916667
2,3.0,3.0,2.0,1.0,1.0,2.0,3.0,2.0,4.0,3.0,...,2.00,4.50,3.50,4.75,4.25,2.666667,2.750000,3.833333,3.833333,4.166667
3,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,...,4.25,2.25,4.75,4.25,3.75,4.583333,3.166667,2.916667,3.583333,4.250000
4,3.0,2.0,3.0,3.0,1.0,2.0,2.0,3.0,4.0,2.0,...,3.00,1.75,4.50,4.00,4.50,2.750000,4.333333,2.500000,3.000000,4.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,3.0,2.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.25,4.00,4.75,3.75,4.50,3.000000,4.666667,3.250000,3.833333,4.333333
196,4.0,5.0,4.0,4.0,5.0,3.0,5.0,4.0,3.0,2.0,...,3.00,2.00,3.50,4.50,3.75,3.750000,2.250000,3.666667,2.250000,3.916667
197,2.0,3.0,1.0,2.0,5.0,4.0,4.0,4.0,5.0,3.0,...,3.50,3.50,4.50,3.50,4.75,3.500000,3.333333,2.583333,3.583333,4.250000
198,3.0,4.0,2.0,3.0,3.0,3.0,5.0,3.0,4.0,4.0,...,2.50,2.75,3.00,2.50,3.00,3.583333,3.666667,3.916667,2.666667,2.833333


In [32]:
# Work on an independent copy so `simulated_data` keeps the keyed item values
reverse_simulated_data = simulated_data.copy(deep=True)

# Mapping from simulated column name to item name; a trailing 'R' on the item name
# flags the columns that are flipped in the next step.
_reverse_keyed_numbers = {
    3, 4, 5, 8, 9, 11, 12, 16, 17, 22, 23, 24, 25, 26, 28,
    29, 30, 31, 36, 37, 42, 44, 45, 47, 48, 49, 50, 51, 55, 58,
}
reverse_coding_map = {
    f"simulated_bfi{number}": f"bfi{number}{'R' if number in _reverse_keyed_numbers else ''}"
    for number in range(1, 61)
}

# Flip (x -> 6 - x) only the columns whose mapped item name carries the 'R' flag;
# all other columns keep their values.
columns_to_flip = [
    column for column, item_name in reverse_coding_map.items() if item_name.endswith('R')
]
reverse_simulated_data[columns_to_flip] = 6 - reverse_simulated_data[columns_to_flip]

reverse_simulated_data

Unnamed: 0,simulated_bfi1,simulated_bfi16,simulated_bfi31,simulated_bfi46,simulated_bfi6,simulated_bfi21,simulated_bfi36,simulated_bfi51,simulated_bfi11,simulated_bfi26,...,bfi_n_depression,bfi_n_emotional_volatility,bfi_o_intellectual_curiosity,bfi_o_aesthetic_sensitivity,bfi_o_creative_imagination,bfi_e,bfi_a,bfi_c,bfi_n,bfi_o
0,3.0,2.0,4.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,...,2.00,4.50,4.00,4.50,3.25,2.666667,3.833333,2.333333,3.750000,3.916667
1,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,...,2.75,4.50,2.50,3.25,3.00,3.250000,2.833333,4.083333,3.500000,2.916667
2,3.0,3.0,4.0,1.0,1.0,2.0,3.0,4.0,2.0,3.0,...,2.00,4.50,3.50,4.75,4.25,2.666667,2.750000,3.833333,3.833333,4.166667
3,5.0,1.0,2.0,5.0,5.0,5.0,1.0,1.0,2.0,2.0,...,4.25,2.25,4.75,4.25,3.75,4.583333,3.166667,2.916667,3.583333,4.250000
4,3.0,4.0,3.0,3.0,1.0,2.0,4.0,3.0,2.0,4.0,...,3.00,1.75,4.50,4.00,4.50,2.750000,4.333333,2.500000,3.000000,4.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.25,4.00,4.75,3.75,4.50,3.000000,4.666667,3.250000,3.833333,4.333333
196,4.0,1.0,2.0,4.0,5.0,3.0,1.0,2.0,3.0,4.0,...,3.00,2.00,3.50,4.50,3.75,3.750000,2.250000,3.666667,2.250000,3.916667
197,2.0,3.0,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,...,3.50,3.50,4.50,3.50,4.75,3.500000,3.333333,2.583333,3.583333,4.250000
198,3.0,2.0,4.0,3.0,3.0,3.0,1.0,3.0,2.0,2.0,...,2.50,2.75,3.00,2.50,3.00,3.583333,3.666667,3.916667,2.666667,2.833333


In [33]:
# Drop the literal text 'simulated_' from every column name (plain string replace, no regex)
reverse_simulated_data.columns = [
    column.replace('simulated_', '') for column in reverse_simulated_data.columns
]

reverse_simulated_data

Unnamed: 0,bfi1,bfi16,bfi31,bfi46,bfi6,bfi21,bfi36,bfi51,bfi11,bfi26,...,bfi_n_depression,bfi_n_emotional_volatility,bfi_o_intellectual_curiosity,bfi_o_aesthetic_sensitivity,bfi_o_creative_imagination,bfi_e,bfi_a,bfi_c,bfi_n,bfi_o
0,3.0,2.0,4.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,...,2.00,4.50,4.00,4.50,3.25,2.666667,3.833333,2.333333,3.750000,3.916667
1,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,...,2.75,4.50,2.50,3.25,3.00,3.250000,2.833333,4.083333,3.500000,2.916667
2,3.0,3.0,4.0,1.0,1.0,2.0,3.0,4.0,2.0,3.0,...,2.00,4.50,3.50,4.75,4.25,2.666667,2.750000,3.833333,3.833333,4.166667
3,5.0,1.0,2.0,5.0,5.0,5.0,1.0,1.0,2.0,2.0,...,4.25,2.25,4.75,4.25,3.75,4.583333,3.166667,2.916667,3.583333,4.250000
4,3.0,4.0,3.0,3.0,1.0,2.0,4.0,3.0,2.0,4.0,...,3.00,1.75,4.50,4.00,4.50,2.750000,4.333333,2.500000,3.000000,4.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.25,4.00,4.75,3.75,4.50,3.000000,4.666667,3.250000,3.833333,4.333333
196,4.0,1.0,2.0,4.0,5.0,3.0,1.0,2.0,3.0,4.0,...,3.00,2.00,3.50,4.50,3.75,3.750000,2.250000,3.666667,2.250000,3.916667
197,2.0,3.0,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,...,3.50,3.50,4.50,3.50,4.75,3.500000,3.333333,2.583333,3.583333,4.250000
198,3.0,2.0,4.0,3.0,3.0,3.0,1.0,3.0,2.0,2.0,...,2.50,2.75,3.00,2.50,3.00,3.583333,3.666667,3.916667,2.666667,2.833333


In [34]:
# Write the simulated items and scores to facet_lvl_simulated_data.csv without the row index
reverse_simulated_data.to_csv(path_or_buf='facet_lvl_simulated_data.csv', index=False)