## Balance Table

In [1]:
import os
import pandas as pd
import numpy as np
from tableone import TableOne
import scipy.stats.mstats as mstats

In [2]:
# Import survey data
df = pd.read_csv('final_data.csv')
df = df.rename(columns={'gpa_prev': 'gpa_hist'})
survey = df.drop_duplicates(subset='Student ID', keep='first')
survey.head().T
survey['education_parent_college'] = (survey['education_parent'] >=6).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey['education_parent_college'] = (survey['education_parent'] >=6).astype(int)


In [3]:
# Columns to be included in the balance table; adjust as needed
columns = ['education_parent_college', 'n_household_members',   'n_household_children', 'class_enjoyment', 'class_participation_likelihood',  
            'math_hw_completion', 'hw_help', 'private_tutorship', 'visit_training_center','female','n_weekday_study_hours','n_weekend_study_hours','gpa_hist']

# collapse all parents education variables that are below college degree
# survey.loc[survey['education_parent_1'] < 3, 'education_parent_1'] = 2
# survey.loc[survey['education_parent_2'] < 3, 'education_parent_1'] = 2
# education_order = [2,3,4]
# survey['education_parent_1'] = pd.Categorical(survey['education_parent_1'], categories=education_order, ordered=True)
# survey['education_parent_2'] = pd.Categorical(survey['education_parent_2'], categories=education_order, ordered=True)





# Specify the categorical variables; 'Treatment arm' is the grouping variable
categorical = ['education_parent_college','female', 'hw_help','private_tutorship','visit_training_center']



# Create TableOne instances
table1_no_adjust = TableOne(survey, columns=columns, categorical=categorical, 
                            groupby='Treatment arm', pval=True, isnull=False, decimals=2)
table1_adjusted = TableOne(survey, columns=columns, categorical=categorical, 
                           groupby='Treatment arm', pval=True, pval_adjust='fdr_bh', isnull=False)

# Convert TableOne output to DataFrame
df_no_adjust = pd.DataFrame(table1_no_adjust.tableone)
df_adjusted = pd.DataFrame(table1_adjusted.tableone)

# Access the p-value columns using the multi-level column structure
unadjusted_pvals = df_no_adjust[('Grouped by Treatment arm', 'P-Value')]
adjusted_pvals = df_adjusted[('Grouped by Treatment arm', 'P-Value (adjusted)')]

# Combine the p-value columns into a new DataFrame
combined_pvals = pd.DataFrame({
    'P-Value': unadjusted_pvals,
    'P-Value (FDR)': adjusted_pvals
})

df_no_adjust_reset = df_no_adjust.reset_index()
df_no_adjust_reset.columns = ['_'.join(col).strip() for col in df_no_adjust_reset.columns.values]
combined_pvals_reset = combined_pvals.reset_index()


# Merge the DataFrames
final_table = df_no_adjust_reset.join(combined_pvals_reset, lsuffix='_original', rsuffix='_combined')


# Format and Convert to LaTeX
final_table.columns = ['Description', 'Category', 'Overall', 'Augmented', 'Control', 'Vanilla', 'P-Value Adjusted', 'Desc Duplicate', 'Cat Duplicate', 'P-Value', 'P-Value (FDR)']

# Remove redundant columns
final_table = final_table.drop(columns=['P-Value Adjusted','Desc Duplicate', 'Cat Duplicate'])


name_mapping = {
    'n': 'Total Count',
    'education_parent_college, n (%)': 'Both parents with at least college degree',
    'n_household_members, mean (SD)': 'Number of Household Members',
    'n_household_children, mean (SD)': 'Number of Children in the same HH',
    'class_enjoyment, mean (SD)': 'Average Class Enjoyment Rate, Mean (SD)',
    'class_participation_likelihood, mean (SD)': 'Average Participation Likelihood, Mean (SD)',
    'math_hw_completion, mean (SD)': 'Average Math HW Completion, Mean (SD)',
    'hw_help, n (%)': 'Get External Help with Homeworks (e.g., a tutor), (Percent)',
    'private_tutorship, n (%)': 'Private Tutorship, N (Percent)',
    'visit_training_center, n (%)': 'Visits to Training Center, N (Percent)',
    'female, n (%)': 'Female, N (Percent)',
    'n_weekday_study_hours, mean (SD)': 'Average Weekday Study Hours, Mean (SD)',
    'n_weekend_study_hours, mean (SD)': 'Average Weekend Study Hours, Mean (SD)'
}

# Apply the mapping to the 'Description' column
final_table['Description'] = final_table['Description'].map(name_mapping).fillna(final_table['Description'])

# Adding the category to the description for clarity
final_table['Description'] = final_table.apply(lambda x: f"{x['Description']} Level {x['Category']}" if x['Category'] else x['Description'], axis=1)



# Format and print the LaTeX table
latex_table = final_table.to_latex(index=False, header=True, column_format='lllllllll', escape=False)

# Write the LaTeX table to a file
with open('balance_table_full.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to balance_table.tex")
print(latex_table)

  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,


LaTeX table saved to balance_table.tex
\begin{tabular}{lllllllll}
\toprule
Description & Category & Overall & Augmented & Control & Vanilla & P-Value & P-Value (FDR) \\
\midrule
Total Count &  & 943 & 312 & 349 & 282 &  &  \\
Both parents with at least college degree Level 0 & 0 & 206 (21.85) & 63 (20.19) & 72 (20.63) & 71 (25.18) & 0.268 & 0.576 \\
Both parents with at least college degree Level 1 & 1 & 737 (78.15) & 249 (79.81) & 277 (79.37) & 211 (74.82) &  &  \\
Number of Household Members &  & 3.60 (0.99) & 3.54 (0.99) & 3.70 (0.95) & 3.55 (1.02) & 0.076 & 0.490 \\
Number of Children in the same HH &  & 1.42 (1.43) & 1.34 (1.34) & 1.51 (1.51) & 1.39 (1.42) & 0.299 & 0.576 \\
Average Class Enjoyment Rate, Mean (SD) &  & 2.27 (1.07) & 2.24 (1.07) & 2.27 (1.10) & 2.29 (1.05) & 0.848 & 0.918 \\
Average Participation Likelihood, Mean (SD) &  & 2.39 (1.02) & 2.37 (1.03) & 2.36 (1.01) & 2.46 (1.02) & 0.375 & 0.609 \\
Average Math HW Completion, Mean (SD) &  & 3.08 (0.93) & 3.05 (0.89) & 

In [4]:
## Do the same for the main sample
survey = survey[survey['Honors']==0]



# Columns to be included in the balance table; adjust as needed
columns = ['education_parent_college', 'n_household_members',   'n_household_children', 'class_enjoyment', 'class_participation_likelihood',  
            'math_hw_completion', 'hw_help', 'private_tutorship', 'visit_training_center','female','n_weekday_study_hours','n_weekend_study_hours','gpa_hist']

# collapse all parents education variables that are below college degree
# survey.loc[survey['education_parent_1'] < 3, 'education_parent_1'] = 2
# survey.loc[survey['education_parent_2'] < 3, 'education_parent_1'] = 2
# education_order = [2,3,4]
# survey['education_parent_1'] = pd.Categorical(survey['education_parent_1'], categories=education_order, ordered=True)
# survey['education_parent_2'] = pd.Categorical(survey['education_parent_2'], categories=education_order, ordered=True)





# Specify the categorical variables; 'Treatment arm' is the grouping variable
categorical = ['education_parent_college','female', 'hw_help','private_tutorship','visit_training_center']



# Create TableOne instances
table1_no_adjust = TableOne(survey, columns=columns, categorical=categorical, 
                            groupby='Treatment arm', pval=True, isnull=False, decimals=2)
table1_adjusted = TableOne(survey, columns=columns, categorical=categorical, 
                           groupby='Treatment arm', pval=True, pval_adjust='fdr_bh', isnull=False)

# Convert TableOne output to DataFrame
df_no_adjust = pd.DataFrame(table1_no_adjust.tableone)
df_adjusted = pd.DataFrame(table1_adjusted.tableone)

# Access the p-value columns using the multi-level column structure
unadjusted_pvals = df_no_adjust[('Grouped by Treatment arm', 'P-Value')]
adjusted_pvals = df_adjusted[('Grouped by Treatment arm', 'P-Value (adjusted)')]

# Combine the p-value columns into a new DataFrame
combined_pvals = pd.DataFrame({
    'P-Value': unadjusted_pvals,
    'P-Value (FDR)': adjusted_pvals
})

df_no_adjust_reset = df_no_adjust.reset_index()
df_no_adjust_reset.columns = ['_'.join(col).strip() for col in df_no_adjust_reset.columns.values]
combined_pvals_reset = combined_pvals.reset_index()


# Merge the DataFrames
final_table = df_no_adjust_reset.join(combined_pvals_reset, lsuffix='_original', rsuffix='_combined')


# Format and Convert to LaTeX
final_table.columns = ['Description', 'Category', 'Overall', 'Augmented', 'Control', 'Vanilla', 'P-Value Adjusted', 'Desc Duplicate', 'Cat Duplicate', 'P-Value', 'P-Value (FDR)']

# Remove redundant columns
final_table = final_table.drop(columns=['P-Value Adjusted','Desc Duplicate', 'Cat Duplicate'])


name_mapping = {
    'n': 'Total Count',
    'education_parent_college, n (%)': 'Both parents with at least college degree',
    'n_household_members, mean (SD)': 'Number of Household Members',
    'n_household_children, mean (SD)': 'Number of Children in the same HH',
    'class_enjoyment, mean (SD)': 'Average Class Enjoyment Rate, Mean (SD)',
    'class_participation_likelihood, mean (SD)': 'Average Participation Likelihood, Mean (SD)',
    'math_hw_completion, mean (SD)': 'Average Math HW Completion, Mean (SD)',
    'hw_help, n (%)': 'Get External Help with Homeworks (e.g., a tutor), (Percent)',
    'private_tutorship, n (%)': 'Private Tutorship, N (Percent)',
    'visit_training_center, n (%)': 'Visits to Training Center, N (Percent)',
    'female, n (%)': 'Female, N (Percent)',
    'n_weekday_study_hours, mean (SD)': 'Average Weekday Study Hours, Mean (SD)',
    'n_weekend_study_hours, mean (SD)': 'Average Weekend Study Hours, Mean (SD)'
}

# Apply the mapping to the 'Description' column
final_table['Description'] = final_table['Description'].map(name_mapping).fillna(final_table['Description'])

# Adding the category to the description for clarity
final_table['Description'] = final_table.apply(lambda x: f"{x['Description']} Level {x['Category']}" if x['Category'] else x['Description'], axis=1)



# Format and print the LaTeX table
latex_table = final_table.to_latex(index=False, header=True, column_format='lllllllll', escape=False)

# Write the LaTeX table to a file
with open('balance_table_main.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to balance_table.tex")
print(latex_table)

LaTeX table saved to balance_table.tex
\begin{tabular}{lllllllll}
\toprule
Description & Category & Overall & Augmented & Control & Vanilla & P-Value & P-Value (FDR) \\
\midrule
Total Count &  & 839 & 277 & 320 & 242 &  &  \\
Both parents with at least college degree Level 0 & 0 & 196 (23.36) & 63 (22.74) & 68 (21.25) & 65 (26.86) & 0.285 & 0.619 \\
Both parents with at least college degree Level 1 & 1 & 643 (76.64) & 214 (77.26) & 252 (78.75) & 177 (73.14) &  &  \\
Number of Household Members &  & 3.59 (1.00) & 3.53 (0.99) & 3.69 (0.96) & 3.52 (1.07) & 0.074 & 0.424 \\
Number of Children in the same HH &  & 1.47 (1.47) & 1.39 (1.39) & 1.56 (1.51) & 1.45 (1.50) & 0.369 & 0.655 \\
Average Class Enjoyment Rate, Mean (SD) &  & 2.22 (1.08) & 2.19 (1.05) & 2.23 (1.11) & 2.24 (1.08) & 0.857 & 0.929 \\
Average Participation Likelihood, Mean (SD) &  & 2.37 (1.01) & 2.33 (1.02) & 2.34 (1.00) & 2.46 (1.01) & 0.286 & 0.619 \\
Average Math HW Completion, Mean (SD) &  & 3.04 (0.94) & 3.02 (0.89) & 

  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
