In [None]:
! pip install pyarrow
! pip install pandas
! pip install scikit-learn
! pip install pyspark
! pip install xgboost
! pip install kaleido
! pip install EntropyHub

In [None]:
from Input_Variables.read_vars import train_data_storage, validation_data_storage, test_data_storage, \
                                      inter_train_location, inter_test_location, inter_val_location,\
                                      one_hot_encoding_data, \
                                      analysis_group, \
                                      daily_stats_features_lower, daily_stats_features_upper, \
                                      model_storage_location, random_seed, \
                                      time_series_lag_values_created, \
                                      evaluation_metrics_output_storage, \
                                      feature_importance_storage_location, \
                                      overall_feature_importance_plot_location

from Data_Schema.schema import Pandas_UDF_Data_Schema
from Read_In_Data.read_data import Reading_Data
from Data_Pipeline.imputation_pipeline import Date_And_Value_Imputation


from Feature_Generation.create_binary_labels import Create_Binary_Labels
from Feature_Generation.summary_stats import Summary_Stats_Features
from Feature_Generation.lag_features import Create_Lagged_Features
from Feature_Generation.time_series_feature_creation import TS_Features
from Feature_Generation.difference_features import Difference_Features

from Data_Pipeline.encoding_scaling_pipeline import Feature_Transformations

from Model_Creation.pyspark_xgboost import Create_PySpark_XGBoost

from Model_Predictions.pyspark_model_preds import Model_Predictions

from Model_Evaluation.pyspark_model_eval import Evaluate_Model

from Feature_Importance.model_feature_importance import Feature_Importance

from Model_Plots.xgboost_classification_plots import XGBoost_Classification_Plot

import os

# ---------------------------------------------------------------------------
# Instantiate the project helper objects used by the cells below.
# Each object is created with its default constructor.
# ---------------------------------------------------------------------------

# Schema provider for the pandas-UDF based PySpark steps
pandas_udf_data_schema = Pandas_UDF_Data_Schema()

# Data reader
reading_data = Reading_Data()

# Binary target (y) label creation
create_binary_labels = Create_Binary_Labels()

# Date / value imputation
date_and_value_imputation = Date_And_Value_Imputation()

# Feature generation: daily summary statistics
summary_stats_features = Summary_Stats_Features()

# Feature generation: complex time-series features
ts_features = TS_Features()

# Feature generation: lagged values
create_lag_features = Create_Lagged_Features()

# Feature generation: differences
difference_features = Difference_Features()

# PySpark XGBoost model creation
create_pyspark_xgboost = Create_PySpark_XGBoost()

# Classification model evaluation
evaluate_model = Evaluate_Model()

# Plots for the XGBoost classifier (feature importance)
xgboost_classification_plot = XGBoost_Classification_Plot()

# Encoding / scaling feature transformations
feature_transformations = Feature_Transformations()

# Schema obtained from the schema provider for the custom imputation step
pyspark_custom_imputation_schema = pandas_udf_data_schema.custom_imputation_pyspark_schema()

# Model predictions
model_predictions = Model_Predictions()

# Feature importance
feature_importance = Feature_Importance()

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing session is reused
# by getOrCreate); the application is registered under the name "Glucose".
spark = (
    SparkSession.builder
    .appName("Glucose")
    .getOrCreate()
)

In [None]:
def _load_interpolated_split(split):
    """Return the interpolated data for one split, creating it first if absent.

    Parameters
    ----------
    split : str
        Split name; used both as the argument to ``interpolation_creation``
        and as the sub-directory of ``/cephfs/interpolation``.

    Returns
    -------
    Whatever ``date_and_value_imputation.read_interpolation`` returns for the
    split's directory (the cells below call ``.show`` on it).
    """
    split_dir = '/cephfs/interpolation/' + split

    # Only run the interpolation step when its output directory is missing.
    if not os.path.exists(split_dir):
        date_and_value_imputation.interpolation_creation(split)

    # The reader is given the directory with a trailing slash.
    return date_and_value_imputation.read_interpolation(split_dir + '/')


# Train
training_custom_imputation_pipeline = _load_interpolated_split('train')
training_custom_imputation_pipeline.show(2)

# Test
testing_custom_imputation_pipeline = _load_interpolated_split('test')
testing_custom_imputation_pipeline.show(2)

# Validation
val_custom_imputation_pipeline = _load_interpolated_split('val')
val_custom_imputation_pipeline.show(2)

In [None]:
def _add_differences_and_chunks(imputed_df):
    """Add difference features and then the chunk column to one split.

    Both intermediate frames are previewed with ``show(5)`` and both are
    returned, in the order (differences, chunks).
    """
    with_differences = difference_features.add_difference_features(imputed_df)
    with_differences.show(5)

    # NOTE(review): 288 is presumably the number of readings per chunk
    # (e.g. one day of 5-minute samples) -- confirm against create_chunk_col.
    with_chunks = summary_stats_features.create_chunk_col(with_differences, chunk_val = 288)
    with_chunks.show(5)

    return with_differences, with_chunks


# Train
training_df_differences, training_df_chunks = _add_differences_and_chunks(
    training_custom_imputation_pipeline
)

# Test
testing_df_differences, testing_df_chunks = _add_differences_and_chunks(
    testing_custom_imputation_pipeline
)

# Validation
val_df_differences, val_df_chunks = _add_differences_and_chunks(
    val_custom_imputation_pipeline
)

In [None]:
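# One-off cache writes, kept commented out: they persist the test/val chunk
# frames to the parquet directories that a later cell reads back with
# spark.read.parquet. Uncomment and run once if those directories are missing.
# testing_df_chunks.repartition('NumId').write.parquet('/cephfs/featuresData/chunks/test')
# val_df_chunks.repartition('NumId').write.parquet('/cephfs/featuresData/chunks/val')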

In [None]:
# Parquet cache locations for the per-(NumId, Chunk) training features.
training_poincare_path = '/cephfs/featuresData/poincare/train'
training_entropy_path = '/cephfs/featuresData/entropy/train'

# Poincare features: compute and persist only when the cache is missing, then
# always load from the cache. Previously the features were computed on every
# run and immediately replaced by the cached copy, and the read failed when the
# cache had never been written. Delete the directory to force a recompute.
if not os.path.exists(training_poincare_path):
    (training_df_chunks
        .groupby(['NumId', 'Chunk'])
        .apply(ts_features.poincare)
        .repartition('NumId')
        .write.parquet(training_poincare_path))
training_df_poincare = spark.read.parquet(training_poincare_path)
training_df_poincare.show(5)

# Entropy features: same cache-or-compute handling.
if not os.path.exists(training_entropy_path):
    (training_df_chunks
        .groupby(['NumId', 'Chunk'])
        .apply(ts_features.entropy)
        .repartition('NumId')
        .write.parquet(training_entropy_path))
training_df_entropy = spark.read.parquet(training_entropy_path)
training_df_entropy.show(5)

# Combine both feature sets on the (NumId, Chunk) key.
training_df_complex_features = training_df_poincare.join(training_df_entropy, ['NumId', 'Chunk'])
training_df_complex_features.show()

In [None]:
# Parquet cache locations for the test / validation chunk frames.
test_chunks_path = '/cephfs/featuresData/chunks/test'
val_chunks_path = '/cephfs/featuresData/chunks/val'

# Persist the in-memory chunk frames when the cache is missing, then continue
# from the parquet copy. Without the guarded write, this cell fails on a fresh
# environment because nothing else in the notebook writes these directories.
if not os.path.exists(test_chunks_path):
    testing_df_chunks.repartition('NumId').write.parquet(test_chunks_path)
testing_df_chunks = spark.read.parquet(test_chunks_path)

if not os.path.exists(val_chunks_path):
    val_df_chunks.repartition('NumId').write.parquet(val_chunks_path)
val_df_chunks = spark.read.parquet(val_chunks_path)

In [None]:
# Parquet cache locations for the per-(NumId, Chunk) test features.
testing_poincare_path = '/cephfs/featuresData/poincare/test'
testing_entropy_path = '/cephfs/featuresData/entropy/test'

# Poincare features: compute and persist only when the cache is missing, then
# load from the cache so the grouped computation is not repeated on every run.
# Delete the directory to force a recompute.
if not os.path.exists(testing_poincare_path):
    (testing_df_chunks
        .groupby(['NumId', 'Chunk'])
        .apply(ts_features.poincare)
        .repartition('NumId')
        .write.parquet(testing_poincare_path))
testing_df_poincare = spark.read.parquet(testing_poincare_path)
testing_df_poincare.show(5)

# Entropy features: same cache-or-compute handling.
if not os.path.exists(testing_entropy_path):
    (testing_df_chunks
        .groupby(['NumId', 'Chunk'])
        .apply(ts_features.entropy)
        .repartition('NumId')
        .write.parquet(testing_entropy_path))
testing_df_entropy = spark.read.parquet(testing_entropy_path)
testing_df_entropy.show(5)

In [None]:
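# One-off cache writes for the test features, kept commented out: the next
# cell reads these parquet directories back with spark.read.parquet.
# testing_df_poincare.repartition('NumId').write.parquet('/cephfs/featuresData/poincare/test')
# testing_df_entropy.repartition('NumId').write.parquet('/cephfs/featuresData/entropy/test')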

In [None]:
# Load the cached test-split feature sets and preview each one.
testing_poincare_path = '/cephfs/featuresData/poincare/test'
testing_df_poincare = spark.read.parquet(testing_poincare_path)
testing_df_poincare.show(n=5)

testing_entropy_path = '/cephfs/featuresData/entropy/test'
testing_df_entropy = spark.read.parquet(testing_entropy_path)
testing_df_entropy.show(n=5)

# Join the two feature sets on the shared (NumId, Chunk) key.
testing_complex_join_keys = ['NumId', 'Chunk']
testing_df_complex_features = testing_df_poincare.join(
    testing_df_entropy,
    on=testing_complex_join_keys,
)
testing_df_complex_features.show()

In [None]:
# Parquet cache location for the validation Poincare features.
val_poincare_path = '/cephfs/featuresData/poincare/val'

# Compute and persist only when the cache is missing, then load from the cache
# so the grouped computation is not repeated on every run.
# Delete the directory to force a recompute.
if not os.path.exists(val_poincare_path):
    (val_df_chunks
        .groupby(['NumId', 'Chunk'])
        .apply(ts_features.poincare)
        .repartition('NumId')
        .write.parquet(val_poincare_path))
val_df_poincare = spark.read.parquet(val_poincare_path)
val_df_poincare.show(5)

In [None]:
# Parquet cache location for the validation entropy features.
val_entropy_path = '/cephfs/featuresData/entropy/val'

# Compute and persist only when the cache is missing, then load from the cache
# so the grouped computation is not repeated on every run.
# Delete the directory to force a recompute.
if not os.path.exists(val_entropy_path):
    (val_df_chunks
        .groupby(['NumId', 'Chunk'])
        .apply(ts_features.entropy)
        .repartition('NumId')
        .write.parquet(val_entropy_path))
val_df_entropy = spark.read.parquet(val_entropy_path)
val_df_entropy.show(5)

In [None]:
# val_df_entropy.repartition('NumId').write.parquet('/cephfs/featuresData/entropy/val')

# Load the cached validation-split feature sets and preview each one.
val_poincare_cache = '/cephfs/featuresData/poincare/val'
val_df_poincare = spark.read.parquet(val_poincare_cache)
val_df_poincare.show(n=5)

val_entropy_cache = '/cephfs/featuresData/entropy/val'
val_df_entropy = spark.read.parquet(val_entropy_cache)
val_df_entropy.show(n=5)

# Join the two feature sets on the shared (NumId, Chunk) key.
val_complex_join_keys = ['NumId', 'Chunk']
val_df_complex_features = val_df_poincare.join(
    val_df_entropy,
    on=val_complex_join_keys,
)
val_df_complex_features.show()

In [None]:
# For each split: if the encoded summary-statistics parquet already exists it is
# read through the project reader, otherwise the statistics are computed from
# that split's chunk frame.

# ----- Train -----
train_summary_stats_path = '/cephfs/summary_stats/encoded/one_hot_train/summary_stats_cohort_bool_encoded.parquet'
summary_stats_complete = os.path.exists(train_summary_stats_path)

if summary_stats_complete:
    training_features_summary_stats = reading_data.read_in_pyspark_data_for_summary_stats(train_summary_stats_path)
else:
    training_features_summary_stats = summary_stats_features.pyspark_summary_statistics(df=training_df_chunks)

# NOTE(review): the training result above is unconditionally replaced by a plain
# parquet read of the same path; the test/val splits below have no such
# override. Confirm whether this line is intentional.
training_features_summary_stats = spark.read.parquet(train_summary_stats_path)
training_features_summary_stats.show(3)

# ----- Test -----
test_summary_stats_path = '/cephfs/summary_stats/encoded/one_hot_test/summary_stats_cohort_bool_encoded.parquet'
summary_stats_complete = os.path.exists(test_summary_stats_path)

if summary_stats_complete:
    testing_features_summary_stats = reading_data.read_in_pyspark_data_for_summary_stats(test_summary_stats_path)
else:
    testing_features_summary_stats = summary_stats_features.pyspark_summary_statistics(df=testing_df_chunks)

testing_features_summary_stats.show(3)

# ----- Validation -----
val_summary_stats_path = '/cephfs/summary_stats/encoded/one_hot_val/summary_stats_cohort_bool_encoded.parquet'
summary_stats_complete = os.path.exists(val_summary_stats_path)

if summary_stats_complete:
    val_features_summary_stats = reading_data.read_in_pyspark_data_for_summary_stats(val_summary_stats_path)
else:
    val_features_summary_stats = summary_stats_features.pyspark_summary_statistics(df=val_df_chunks)

val_features_summary_stats.show(3)

In [None]:
# Build the final per-split feature table by joining the complex features with
# the summary statistics on the shared (NumId, Chunk) key, previewing each.
final_join_keys = ['NumId', 'Chunk']

training_df_final = training_df_complex_features.join(
    training_features_summary_stats,
    on=final_join_keys,
)
training_df_final.show(n=5)

testing_df_final = testing_df_complex_features.join(
    testing_features_summary_stats,
    on=final_join_keys,
)
testing_df_final.show(n=5)

val_df_final = val_df_complex_features.join(
    val_features_summary_stats,
    on=final_join_keys,
)
val_df_final.show(n=5)

In [None]:
# Persist the final per-split feature tables as parquet, repartitioned by NumId.
# mode('overwrite') keeps this cell re-runnable: with Spark's default save mode
# the write raises when the target path already exists, so a second run of the
# notebook would fail here after all upstream work has completed.
training_df_final.repartition('NumId').write.mode('overwrite').parquet('/cephfs/summary_stats/all_train_bool')
testing_df_final.repartition('NumId').write.mode('overwrite').parquet('/cephfs/summary_stats/all_test_bool')
val_df_final.repartition('NumId').write.mode('overwrite').parquet('/cephfs/summary_stats/all_val_bool')