In [7]:
import pandas as pd

# Build `long_data`: one row per (participant, survey, question) with the raw
# item score and a standardized score.
#
# Names left defined for later cells: cleaned_data, df, cols_to_drop,
# lsas_columns, bdi_columns, cfs_columns, reverse_columns, question_columns,
# cleaned_data_sorted, long_data.

# Step 0: Load the data
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
cleaned_data = pd.read_csv(r'C:\Users\MadiL\Thesis CodeBase\Longitudinal\clean_data.csv')

# Column-name groups, defined once and reused by every step below.
lsas_items = range(1, 25)  # LSAS items are numbered 1..24
avo_columns = [f'LSAS_avo{i}' for i in lsas_items]
anx_columns = [f'LSAS_anx{i}' for i in lsas_items]
lsas_columns = [f'LSAS_{i}' for i in lsas_items]
id_columns = ['Participant_Number', 'StartDate', 'EndDate']
reverse_columns = ['CFS2_R', 'CFS3_R', 'CFS5_R', 'CFS10_R']

# Fail early, before anything is mutated, with one message naming every
# missing input column (instead of a KeyError halfway through the cell).
required_columns = avo_columns + anx_columns + id_columns + reverse_columns
missing_columns = [col for col in required_columns if col not in cleaned_data.columns]
if missing_columns:
    raise KeyError(f"clean_data.csv is missing expected columns: {missing_columns}")

# Step 1: Combine LSAS_avo and LSAS_anx into LSAS_1, LSAS_2, ..., LSAS_24
# The sums are written onto cleaned_data itself because the melt in Step 6
# reads its value columns from cleaned_data.
for avo_col, anx_col, combined_col in zip(avo_columns, anx_columns, lsas_columns):
    cleaned_data[combined_col] = cleaned_data[avo_col] + cleaned_data[anx_col]

# Step 2: Build df, a separate frame without the per-item LSAS_avo*/LSAS_anx* columns
# NOTE(review): df is created before the CFS rescoring in Step 4 and is not
# read again in this cell -- confirm whether later cells expect it to carry
# the rescored CFS values.
cols_to_drop = avo_columns + anx_columns
print("Columns before dropping:", cleaned_data.columns.tolist())
df = cleaned_data.drop(columns=cols_to_drop, errors='raise')  # 'raise' fails loudly if a column is absent
print("Columns after dropping:", df.columns.tolist())

# Step 3: Only the combined LSAS columns are passed on to the melt
print("LSAS columns:", lsas_columns)

# Step 4: Define other question columns (substring match on the column name)
bdi_columns = [col for col in cleaned_data.columns if 'BDI' in col]
cfs_columns = [col for col in cleaned_data.columns if 'CFS' in col]

# Reverse score CFS questions 2, 3, 5, 10 in place as 7 - x
# (presumably these items are on a 1-6 scale -- TODO confirm).
cleaned_data[reverse_columns] = 7 - cleaned_data[reverse_columns]

# Reverse the DIRECTION of all CFS columns (including the four rescored above)
cleaned_data[cfs_columns] *= -1

# Step 5: Combine all question columns into one list
question_columns = lsas_columns + bdi_columns + cfs_columns
print("Question columns for melt:", question_columns)

# Step 6: Reshape the data to long format
# NOTE(review): StartDate is not parsed as a date on load, so this sort is
# only chronological if the CSV stores it in a sortable format -- confirm.
cleaned_data_sorted = cleaned_data.sort_values(by=['Participant_Number', 'StartDate'])
long_data = pd.melt(
    cleaned_data_sorted,
    id_vars=id_columns,
    value_vars=question_columns,
    var_name='Question',
    value_name='Score',
)

# Coerce 'Score' to numeric; anything missing or non-numeric becomes 0.
# The 0-fill is kept so downstream output is unchanged, but it is reported
# instead of happening silently, because imputed zeros enter the mean/std below.
score_numeric = pd.to_numeric(long_data['Score'], errors='coerce')
n_imputed = int(score_numeric.isna().sum())
if n_imputed:
    print(f"Warning: {n_imputed} missing or non-numeric scores were replaced with 0")
long_data['Score'] = score_numeric.fillna(0)

# Add standardized scores while keeping the original 'Score' column.
# NOTE(review): the mean and std are pooled over every row of long_data,
# i.e. across LSAS, BDI and CFS items together -- confirm this is intended
# rather than standardizing per question or per questionnaire.
score_mean = long_data['Score'].mean()
score_std = long_data['Score'].std()
long_data['Standardized_Score'] = (long_data['Score'] - score_mean) / score_std

# Verify the column was added
print(long_data.head())



Columns before dropping: ['Qualtricsname', 'Participant_Number', 'Corona', 'ResponseID', 'Worker_ID', 'StartDate', 'EndDate', 'Durationinseconds', 'Age', 'Gender', 'Education_version', 'Education_years_A', 'Relationship_status', 'Sexual_attraction', 'Children_YN', 'Children_num', 'Nationality', 'Ethnicity', 'LSAS_anx1', 'LSAS_anx2', 'LSAS_anx3', 'LSAS_anx4', 'LSAS_anx5', 'LSAS_anx6', 'LSAS_anx7', 'LSAS_anx8', 'LSAS_anx9', 'LSAS_anx10', 'LSAS_anx11', 'LSAS_anx12', 'LSAS_anx13', 'LSAS_anx14', 'LSAS_anx15', 'LSAS_anx16', 'LSAS_anx17', 'LSAS_anx18', 'LSAS_anx19', 'LSAS_anx20', 'LSAS_anx21', 'LSAS_anx22', 'LSAS_anx23', 'LSAS_anx24', 'LSAS_avo1', 'LSAS_avo2', 'LSAS_avo3', 'LSAS_avo4', 'LSAS_avo5', 'LSAS_avo6', 'LSAS_avo7', 'LSAS_avo8', 'LSAS_avo9', 'LSAS_avo10', 'LSAS_avo11', 'LSAS_avo12', 'LSAS_avo13', 'LSAS_avo14', 'LSAS_avo15', 'LSAS_avo16', 'LSAS_avo17', 'LSAS_avo18', 'LSAS_avo19', 'LSAS_avo20', 'LSAS_avo21', 'LSAS_avo22', 'LSAS_avo23', 'LSAS_avo24', 'BDI1', 'BDI2', 'BDI3', 'BDI4', 'BDI5

In [8]:
# Manual follow-up (done outside this notebook, via Data Wrangler):
#   1. Save this frame, then make a long_data_standardized.csv in which the
#      standardized_score column is renamed to just score.
#   2. Make another copy with the standardized score column deleted.
long_data