In [None]:
import polars as pl

# For a single node opeartion, we can use Polars lazy API to read larger than memory datasets using streaming

# Read the Parquet file using lazy evaluation
df = pl.scan_parquet('large_dataset.parquet')

# Melt the DataFrame to have 'year' and 'weight' columns
df_unpivot = df.unpivot(
    index='patient',
    on=[col for col in df.collect_schema().names() if col != 'patient'],
    variable_name='year',
    value_name='weight'
)

# Compute statistical metrics for each patient
stats_df = df_unpivot.group_by('patient').agg([
    pl.col('weight').mean().alias('mean_weight'),
    pl.col('weight').median().alias('median_weight'),
    pl.col('weight').std().alias('std_weight'),
    pl.col('weight').min().alias('min_weight'),
    pl.col('weight').max().alias('max_weight'),
    pl.col('weight').quantile(0.25).alias('q1_weight'),
    pl.col('weight').quantile(0.75).alias('q3_weight')
])

# Collect the result into a DataFrame
stats_df_result = stats_df.collect()

# Save the result to a CSV file
stats_df_result.write_csv('person_statistics.csv')


In [13]:
import polars as pl

# For a single node opeartion, we can use Polars lazy API to read larger than memory datasets using streaming

# Read the Parquet file using lazy evaluation
df = pl.scan_parquet('large_dataset.parquet')

# Melt the DataFrame to have 'year' and 'weight' columns
df_unpivot = df.unpivot(
    index='patient',
    on=[col for col in df.collect_schema().names() if col != 'patient'],
    variable_name='year',
    value_name='weight'
)

# Compute statistical metrics for each patient
stats_df = df_unpivot.group_by('year').agg([
    pl.col('weight').mean().alias('mean_weight'),
    pl.col('weight').median().alias('median_weight'),
    pl.col('weight').std().alias('std_weight'),
    pl.col('weight').min().alias('min_weight'),
    pl.col('weight').max().alias('max_weight'),
    pl.col('weight').quantile(0.25).alias('q1_weight'),
    pl.col('weight').quantile(0.75).alias('q3_weight')
])

# Collect the result into a DataFrame
stats_df_result = stats_df.collect()

# Save the result to a CSV file
stats_df_result.write_csv('year_statistics.csv')


In [None]:
import polars as pl

# For a single node opeartion, we can use Polars lazy API to read larger than memory datasets using streaming

# Read the Parquet file using lazy evaluation
df = pl.scan_parquet('large_dataset.parquet')

# Melt the DataFrame to have 'year' and 'weight' columns
df_unpivot = df.unpivot(
    index='patient',
    on=[col for col in df.collect_schema().names() if col != 'patient'],
    variable_name='year',
    value_name='weight'
)

# Convert 'year' to integer
df_unpivot = df_unpivot.with_columns(pl.col('year').str.extract(r'(\d+)$').cast(pl.Int32))

# Sort by 'patient' and 'year' to ensure correct order
df_unpivot = df_unpivot.sort(['patient', 'year'])

# Calculate the weight difference per patient between consecutive years
df_diff = df_unpivot.with_columns([
    (pl.col('weight') - pl.col('weight').shift(1)).alias('weight_change'),
    pl.col('patient').eq(pl.col('patient').shift(1)).alias('same_patient')
]).filter(pl.col('same_patient'))

# Group by 'year' and calculate average weight change
weight_trends = df_diff.group_by('year').agg([
    pl.col('weight_change').mean().alias('avg_weight_change')
])

# Collect the result into a DataFrame
weight_trends_result = weight_trends.collect()

# Save the result to a CSV file
weight_trends_result.write_csv('weight_trends.csv')

In [25]:
weight_trends.collect()

year,avg_weight_change
i32,f64
1977,0.90178
1989,-0.22679
1986,-0.53195
1974,-0.84213
1980,-0.91579
…,…
1947,-0.07116
1953,0.13595
1962,0.80743
1956,-0.6312
