#### TEST

In [None]:
import pyspark
from pyspark import pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, \
StringType, IntegerType, TimestampType, DateType, FloatType
import time
import pathlib
from pyspark.sql.functions import col, to_date, sum, avg, max, min, \
stddev, percentile_approx,\
pandas_udf, PandasUDFType, lit, udf, collect_list, sqrt, monotonically_increasing_id, map_from_entries,\
rank, dense_rank, count, when
from pyspark.sql.window import Window

In [None]:
# Set the application name through an explicit SparkConf, then obtain the
# session (an already-running session is reused by getOrCreate()).
conf = pyspark.SparkConf().setAll([
    ('spark.app.name', 'Glucose_Analysis_Spark'),
])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
# Three nullable columns: PatientId (string), Value (float) and
# GlucoseDisplayDate (date).
schema = (
    StructType()
    .add('PatientId', StringType(), True)
    .add('Value', FloatType(), True)
    .add('GlucoseDisplayDate', DateType(), True)
)

# Build a zero-row DataFrame that carries only this schema and print it.
emptyRDD = spark.sparkContext.emptyRDD()
df = spark.createDataFrame(emptyRDD, schema)
df.printSchema()

In [None]:
# Load one Parquet part file and rebind `df` to it (the empty DataFrame
# created in the previous cell is no longer referenced by this name).
df = spark.read.parquet('/cephfs/stepped_glucose_data/step0_load/parquet_0_to_10/part-00000-532ee45d-8e0d-44c4-8f3b-884b22175e0f-c000.snappy.parquet')

In [None]:
# Add a constant column `y_binary` equal to 1 on every row.
# pyspark_summary_statistics aggregates max('y_binary'), so the column must
# exist before that function is called on `df`.
df = df.withColumn('y_binary', lit(1))

In [None]:
#def pyspark_summary_statistics(self, df, spark):
# @pandas_udf(StructType([StructField('Entropy', FloatType())]), PandasUDFType.GROUPED_MAP)
# def entropy_grouped(df):
#     return feat_create_obj.entropy_extraction(df.Value)

def entropy_udf(vals):
    """Return a column expression that applies entropy extraction to *vals*.

    Args:
        vals: Column (or column name) handed to the wrapped UDF.

    Returns:
        A ``FloatType`` Spark column produced by calling
        ``feat_create().entropy_extraction`` on each value of *vals*.
    """
    # NOTE(review): `feat_create` is not defined in the visible part of this
    # file; it must be importable wherever this function is used.
    feat_create_obj = feat_create()
    # udf() has to receive the callable itself. The previous code evaluated
    # entropy_extraction(vals) on the driver and tried to wrap its result.
    entropy_fn = udf(feat_create_obj.entropy_extraction, FloatType())
    return entropy_fn(vals)

def poincare_udf(vals):
    """Return a column expression that applies ``calculate_poincare`` to *vals*.

    Args:
        vals: Column (or column name) handed to the wrapped UDF.

    Returns:
        A struct-typed Spark column with float fields ``First``, ``Second``
        and ``Third``.
    """
    # NOTE(review): the original instantiated feat_create() here without
    # using it. If calculate_poincare is meant to be a method of that class,
    # call it through an instance instead of the bare name.
    poincare_schema = StructType([
        StructField('First', FloatType()),
        StructField('Second', FloatType()),
        StructField('Third', FloatType()),
    ])
    # udf() has to receive the callable itself. The previous code evaluated
    # calculate_poincare(vals) on the driver and tried to wrap its result.
    poincare_fn = udf(calculate_poincare, poincare_schema)
    return poincare_fn(vals)

# def chunk_by_index():
#     return udf(collect_list

def create_partition_date(df, chunk_val):
    """Number each patient's rows in time order and group them into chunks.

    Args:
        df: DataFrame with ``PatientId`` and ``GlucoseDisplayTime`` columns.
        chunk_val: Number of consecutive rows per chunk; must be positive.

    Returns:
        ``df`` with two added columns: ``index`` (1-based position of the row
        within its patient, ordered by ``GlucoseDisplayTime``) and ``Chunk``
        (0-based chunk number, ``chunk_val`` rows each except possibly the
        last one per patient).

    Raises:
        ValueError: If ``chunk_val`` is not positive.
    """
    if chunk_val <= 0:
        raise ValueError(f"chunk_val must be positive, got {chunk_val!r}")

    # Imported locally because the file-level import list does not include it.
    from pyspark.sql.functions import row_number

    window = Window.partitionBy(df['PatientId']).orderBy(df['GlucoseDisplayTime'])
    # row_number() gives every row a distinct consecutive position. rank()
    # repeats a value for tied timestamps and then skips ahead, which makes
    # chunk sizes uneven.
    df = df.select('*', row_number().over(window).alias('index'))
    # index starts at 1, so shift to 0-based before dividing; without the
    # shift the first chunk holds only chunk_val - 1 rows.
    df = df.withColumn(
        "Chunk",
        ((df['index'] - 1) / chunk_val).cast(IntegerType()),
    )

    return df


def pyspark_summary_statistics(df,
                               daily_stats_features_lower,
                               daily_stats_features_upper,
                               chunk_val=12):
    """Attach per-chunk summary statistics of ``Value`` to every row.

    Rows are chunked per patient by ``create_partition_date``; statistics are
    computed per (``PatientId``, ``Chunk``) and joined back onto the rows.

    Args:
        df: DataFrame with ``PatientId``, ``Value`` and ``y_binary`` columns,
            plus whatever ``create_partition_date`` requires.
        daily_stats_features_lower: Values strictly below this are counted in
            ``CountBelow``.
        daily_stats_features_upper: Values strictly above this are counted in
            ``CountAbove``.
        chunk_val: Number of rows per chunk, forwarded to
            ``create_partition_date``.

    Returns:
        The chunked DataFrame with these columns added: ``y_summary_binary``,
        ``Mean``, ``Std Dev``, ``Median``, ``Min``, ``Max``, ``CountBelow``,
        ``CountAbove``, ``PercentageBelow`` and ``PercentageAbove``. The two
        "Percentage" columns are fractions of the chunk's rows (not scaled
        by 100).
    """
    df_added = create_partition_date(df, chunk_val)

    group_cols = ["PatientId", "Chunk"]

    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so these count only the matching rows.
    below = count(when(col("Value") < daily_stats_features_lower, 1))
    above = count(when(col("Value") > daily_stats_features_upper, 1))
    # Divide by the rows actually present in the chunk: a patient's last
    # chunk can hold fewer than chunk_val rows, and dividing by the nominal
    # size would understate its fractions.
    rows_in_chunk = count(lit(1))

    summary_df = df_added.groupby(group_cols).agg(
        max('y_binary').alias('y_summary_binary'),
        avg("Value").alias("Mean"),
        stddev("Value").alias("Std Dev"),
        percentile_approx("Value", .5).alias("Median"),
        min("Value").alias("Min"),
        max("Value").alias("Max"),
        below.alias("CountBelow"),
        above.alias("CountAbove"),
        (below / rows_in_chunk).alias("PercentageBelow"),
        (above / rows_in_chunk).alias("PercentageAbove"),
    )

    # Join on the same keys used for grouping so the two cannot drift apart.
    df_added = df_added.join(summary_df, group_cols)

    return df_added

In [None]:
# Run the summary statistics with lower threshold 70, upper threshold 180 and
# chunk_val 12. The returned DataFrame is not bound to a name.
pyspark_summary_statistics(df, 70, 180, 12)

In [None]:
# NOTE(review): this cell looks like a leftover fragment of a function body.
# The `return` below sits at top level with extra indentation, so the cell
# does not parse as written. `analysis_group` and `transform_features` are
# also not defined in the visible part of this file.
added_daily_features=df.groupby(analysis_group).apply(transform_features)

    return added_daily_features

In [None]:
# Shut down the current Spark session.
# NOTE(review): the next cells call spark.read before a new session is
# created further down; run top to bottom, they would use the stopped session.
spark.stop()

In [None]:
# Load the Parquet dataset under all_val_bool and rebind `df` to it.
df = spark.read.parquet('/cephfs/summary_stats/all_val_bool/')

In [None]:
# Print the first 5 rows of `df`.
df.show(5)

In [None]:
# Derive a three-way `target` label from DiffPrevious:
#   above 9 -> 1, below -9 -> -1, everything else -> 0.
new_df = df.withColumn(
    'target',
    when(df['DiffPrevious'] > 9, 1)
    .when(df['DiffPrevious'] < -9, -1)
    .otherwise(0),
)

In [None]:
# Show the source column next to the derived label for the first 5 rows.
new_df.select('DiffPrevious', 'target').show(5)

In [None]:
# Repartition by NumId and write the labelled data as Parquet.
# No save mode is set here, so Spark's default mode applies to the target path.
new_df.repartition('NumId').write.parquet('/cephfs/summary_stats/all_val_bool_updated')

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named "Glucose".
spark = (
    SparkSession.builder
    .appName("Glucose")
    .getOrCreate()
)

# Load the Poincare and entropy training features and preview each.
training_df_poincare = spark.read.parquet('/cephfs/featuresData/poincare/train')
training_df_poincare.show(5)
training_df_entropy = spark.read.parquet('/cephfs/featuresData/entropy/train')
training_df_entropy.show(5)

# Combine the two feature sets on (NumId, Chunk).
training_df_complex_features = training_df_poincare.join(
    training_df_entropy,
    on=['NumId', 'Chunk'],
)
training_df_complex_features.show()

# Load the encoded summary statistics and preview them.
training_features_summary_stats = spark.read.parquet(
    '/cephfs/summary_stats/encoded/one_hot_train/summary_stats_cohort_bool_encoded.parquet'
)
training_features_summary_stats.show(3)

# Join the summary statistics onto the combined features, using the same keys.
training_df_final = training_df_complex_features.join(
    training_features_summary_stats,
    on=['NumId', 'Chunk'],
)
training_df_final.show(5)

# Persist the final training table, repartitioned by NumId.
training_df_final.repartition('NumId').write.parquet('/cephfs/summary_stats/all_train_bool')