In [1]:
#How different random states affect the sample statistics

import pandas as pd
import plotly.graph_objects as go

# Load the data
df = pd.read_csv("../data/enhanced_data2.csv")

# Every statistic in this cell is computed on this one column.
subjects = df['subjects_num']

# Seeds whose resulting samples are compared against the full column
random_states = [42, 123, 555, 777, 999]

# Column headers of the comparison table, in display order.
STAT_COLUMNS = ('Random State', 'Sample Mean', 'Sample Size', 'Min Value', 'Max Value')


def _stat_row(label, values):
    """Return one table row (ordered like STAT_COLUMNS) describing `values`.

    `label` goes into the 'Random State' column; the remaining entries are the
    mean rounded to a whole number, the number of entries, the minimum and the
    maximum of `values`.
    """
    return (label, values.mean().round(), len(values), values.min(), values.max())


# One list per column, filled with one entry per seed.
results = {column: [] for column in STAT_COLUMNS}

for state in random_states:
    # Same sample size every time; only the seed changes between iterations.
    random_sample = subjects.sample(n=1000, random_state=state)
    for column, value in zip(STAT_COLUMNS, _stat_row(state, random_sample)):
        results[column].append(value)

# Create comparison DataFrame
results_df = pd.DataFrame(results)

# Single-row frame with the same statistics for the whole column, used as the
# reference row of the table.
population_stats = pd.DataFrame(
    {column: [value] for column, value in zip(STAT_COLUMNS, _stat_row('Population', subjects))}
)

# Population row first, then one row per seed
final_results = pd.concat([population_stats, results_df], ignore_index=True)

# Display results
print(
    "\nComparison of Different Random States:",
    final_results.to_string(index=False),
    sep="\n",
)

# Optional: bar chart of the means, one bar per table row
bar_labels = list(map(str, final_results['Random State']))
bar_heights = final_results['Sample Mean']

mean_bars = go.Bar(
    x=bar_labels,
    y=bar_heights,
    # The means were already rounded to whole numbers when the table was
    # built, so this rounding leaves the labels unchanged.
    text=bar_heights.round(2),
    textposition='auto',
)

fig = go.Figure(data=[mean_bars])

fig.update_layout(
    title='Sample Means with Different Random States',
    xaxis_title='Random State (Population = full dataset)',
    yaxis_title='Mean Value',
    showlegend=False,
)

fig.show()


Comparison of Different Random States:
Random State  Sample Mean  Sample Size  Min Value  Max Value
  Population      10234.0        63550        5.0   550000.0
          42       9345.0         1000        5.0   550000.0
         123       7068.0         1000        5.0   550000.0
         555      12404.0         1000        5.0   550000.0
         777       9761.0         1000        5.0   550000.0
         999      10319.0         1000        5.0   550000.0


In [2]:
import pandas as pd

# Load the data
df = pd.read_csv("../data/enhanced_data2.csv")

# The column all three approaches are compared on.
subjects = df['subjects_num']

# Draw each sample exactly once. Size and mean are then derived from the very
# same draw instead of from two separate calls that only agree because the
# seed happens to be repeated.
sample_with_n = subjects.sample(n=1000, random_state=42)

# With no `n` given, sample() returns a single entry here (see the size-1 row
# in the printed table), so its "mean" is just that one value.
sample_without_n = subjects.sample(random_state=42)

# Label -> data, in the order the rows should appear in the table.
approaches = {
    'Original Data': subjects,
    'Sample with n=1000': sample_with_n,
    'Sample without n specified': sample_without_n,
}

# Compare different sampling approaches
results = {
    'Sampling Method': list(approaches),
    'Sample Size': [len(values) for values in approaches.values()],
    'Mean': [values.mean().round() for values in approaches.values()],
}

# Create DataFrame for display
results_df = pd.DataFrame(results)

print("\nSampling Comparison:")
print(results_df.to_string(index=False))


Sampling Comparison:
           Sampling Method  Sample Size    Mean
             Original Data        63550 10234.0
        Sample with n=1000         1000  9345.0
Sample without n specified            1     5.0
