# Data Exploration
- This notebook performs exploratory data analysis on the dataset.
- To expand on the analysis, attach this notebook to a cluster with runtime version **15.4.x-cpu-ml-scala2.12**, and rerun it.
- Explore completed trials in the [MLflow experiment](#mlflow/experiments/4326011134824737).

In [0]:
import os
import uuid
import pandas as pd
import shutil
import databricks.automl_runtime
import pyspark.pandas as ps

import mlflow

ps.options.plotting.backend = "matplotlib"

# Stage the input data locally: the MLflow artifact is downloaded into a freshly
# created scratch directory and then loaded into a pyspark.pandas DataFrame.
exp_temp_dir = os.path.join(
    os.environ["SPARK_LOCAL_DIRS"],
    "tmp",
    str(uuid.uuid4())[:8],  # first 8 characters of a random UUID as the directory name
)
os.makedirs(exp_temp_dir)

# Fetch the run's "data" artifact and build a file:// URI to the training data in it
exp_data_path = mlflow.artifacts.download_artifacts(
    run_id="e4d9b882124846dc9b947f73116c189a",
    artifact_path="data",
    dst_path=exp_temp_dir,
)
exp_file_path = "file://" + os.path.join(exp_data_path, "training_data")

# Read the parquet data with pandas, convert it to pyspark.pandas, and cache it
df = ps.from_pandas(pd.read_parquet(exp_file_path)).spark.cache()

# Column roles referenced by the cells below
target_col = "UHII"
time_col = "timestamp"
id_cols = ["District"]

### Aggregate data

In [0]:
# Average the target over every (time, id) combination so that each
# group key carries a single target value.
group_cols = [time_col, *id_cols]

df_aggregated = (
    df.groupby(group_cols)
    .agg(UHII=(target_col, "avg"))
    .reset_index()
)

## Time column Analysis

Show the time range (earliest and latest timestamp) covered by each time series.

In [0]:
# Earliest and latest value of the time column for each identity
time_bounds = {"min": (time_col, "min"), "max": (time_col, "max")}
df_time_range = df_aggregated.groupby(id_cols).agg(**time_bounds)
display(df_time_range.reset_index())

## Target Value Analysis

Summary statistics of the target value for each time series.

In [0]:
# Descriptive statistics of the target column, computed per identity
selected_cols = [*id_cols, target_col]
target_by_id = df_aggregated[selected_cols].groupby(id_cols)
target_stats_df = target_by_id.describe()
display(target_stats_df.reset_index())

Check the number of missing values in the target column.

In [0]:
def num_nulls(x):
  """Count the null entries in each column of ``x``.

  Args:
    x: Object exposing ``isnull().sum()``; in this notebook it is the frame
      handed to the function by ``groupby(...).apply``.

  Returns:
    A ``pd.Series`` wrapping the result of ``x.isnull().sum()``.

  NOTE(review): no type hints are added here. pandas-on-Spark reads a return
  annotation on an applied function as a schema declaration, which would
  change the shape of the ``apply`` result - confirm before annotating.
  """
  return pd.Series(x.isnull().sum())

# Number of missing target values for each identity
nulls_by_id = df_aggregated[selected_cols].groupby(id_cols).apply(num_nulls)
null_stats_df = nulls_by_id[target_col]
display(null_stats_df.to_frame().reset_index())

## Visualize the Data

In [0]:
# Select one identity (one combination of id column values) to plot.
# The row is picked by position: drop_duplicates() leaves gaps in the index
# labels, so a label-based lookup fails for any label that was dropped.
id_position = 0  # change this position to see other identities
idx = df_aggregated[id_cols].to_pandas().astype(str).drop_duplicates()
idx_list = idx.iloc[id_position].values.tolist()

# Match on every id column instead of a hard-coded column name.
# NOTE(review): the selected values were cast to str above, so this comparison
# assumes the id columns in `df` hold strings - confirm for other datasets.
id_mask = df[id_cols[0]] == idx_list[0]
for id_col, id_value in zip(id_cols[1:], idx_list[1:]):
  id_mask = id_mask & (df[id_col] == id_value)

# Keep only the time and target columns, indexed by time, with a float target
df_sub = df.loc[id_mask].filter(items=[time_col, target_col]).set_index(time_col)
df_sub[target_col] = df_sub[target_col].astype("float")

In [0]:
import matplotlib.pyplot as plt

# Plot the target of the selected identity over time. Axis labels and a title
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(df_sub, label=target_col)
ax.set_xlabel(time_col)
ax.set_ylabel(target_col)
ax.set_title(f"{target_col} over time")
ax.legend()
plt.show()

In [0]:
# Delete the temp data. The existence check lets this cell be re-run without
# raising once the directory has already been removed.
if os.path.isdir(exp_temp_dir):
  shutil.rmtree(exp_temp_dir)