# EDA with AML Datasets

In [None]:
## Check core SDK version number
import azureml.core
import mlflow
import os

from azureml.core import (Datastore, Dataset, Environment, Experiment, ScriptRunConfig,
                          Workspace)
from azureml.core.authentication import InteractiveLoginAuthentication
from IPython.display import display



# Report which azureml-core release this kernel is running.
print("[INFO] SDK version:", azureml.core.VERSION)

## Needed when the workspace lives in a different tenant (typically the customer's tenant):
# interactive_auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")

# Obtain the workspace handle via Workspace.from_config() and echo its identifying properties.
ws = Workspace.from_config()
print("[SUCCESS] LOGGED IN: ",ws.name, ws.resource_group, ws.location, ws.subscription_id, sep=' @ ')

## Use the workspace's MLflow tracking URI as the MLflow backend
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# Prints True when the active tracking URI contains the substring "experiments.azureml.net".
print("[INFO] MLFlow wired to AML:", "experiments.azureml.net" in mlflow.get_tracking_uri())

## Config

In [None]:
# --- Azure ML resources referenced by this notebook ---
aml_compute = "aml-cluster"        # compute target name
aml_experiment = "mlflow-azureml"  # experiment name

# Datastore name (alternative used previously: "mmaadlsgen2_test")
aml_ds = "aml_data"

# Registered dataset to explore; other options tried before:
#   'noa_weather', "oj_sample_data"
aml_dset = "diabetes_multiple"

# --- Local paths ---
loc_data = "data/demo_data"

In [None]:
## Look up the configured datastore (aml_ds) among the workspace's datastores
## and report its name and type.
ds = ws.datastores[aml_ds]
print(f"[INFO] Datastore: {ds.name}, type: {ds.datastore_type}")

##  EDA

In [None]:
# Fetch the dataset named by aml_dset from the workspace and convert it
# to a pandas DataFrame for the analysis below.
wtds = Dataset.get_by_name(ws, name=aml_dset)
pdf = wtds.to_pandas_dataframe()
# Summary of the loaded frame (DataFrame.info()).
pdf.info()

Reference: [Detailed exploratory data analysis with Python (Kaggle)](https://www.kaggle.com/ekami66/detailed-exploratory-data-analysis-with-python)

In [None]:
# !pip install seaborn

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't work on your side
%matplotlib inline

# Apply the 'bmh' matplotlib style sheet to all following plots.
plt.style.use('bmh')

In [None]:
# Work on a copy so the frame loaded from the dataset (pdf) is left untouched.
df = pdf.copy()

In [None]:
## Alternative target kept for reference
# eda_target = "Revenue"

## Weather dataset settings (disabled)
# eda_target = "temperature"
# categorical_features_list = ['usaf', 'wban','stationName', 'countryOrRegion', 'p_k',
#        'year', 'day', 'version', 'month', eda_target]
# quantitative_features_list = ['latitude', 'longitude', 'elevation',
#        'windAngle', 'windSpeed', eda_target]

## Diabetes dataset settings (active)
# Column that the analysis tries to explain.
eda_target = 'Y'

# Both feature lists keep the target as their final entry.
quantitative_features_list = [
    'AGE', 'BMI', 'BP',
    'S1', 'S2', 'S3', 'S4', 'S5', 'S6',
    eda_target,
]
categorical_features_list = ['SEX', eda_target]


In [None]:
# Columns to remove from the dataframe before the analysis; leave the list empty to drop nothing.
cols_to_drop = []
# Columns to move to the end of the dataframe (here: the target column).
cols_at_end = [eda_target]

Let's drop the columns listed in `cols_to_drop` and any feature whose share of non-NaN values is below 30%.

In [None]:
# Keep only the columns in which at least 30% of the rows hold a value
# (Series.count() does not include NaN values).
df2 = df[[column for column in df if df[column].count() / len(df) >= 0.3]]

# Remove the explicitly unwanted columns. drop() returns a new frame instead of
# mutating a selection of df, and errors="ignore" tolerates names that are not
# present (e.g. already removed by the NaN filter above) instead of raising KeyError.
df2 = df2.drop(columns=cols_to_drop, errors="ignore")

# Report every column of the original frame that did not survive.
dropped_columns = [c for c in df.columns if c not in df2.columns]
print("List of dropped columns:", end=" ")
for c in dropped_columns:
    print(c, end=", ")
print('\n')

# Reorder: all remaining columns first, then the cols_at_end entries that
# still exist, in the order given by cols_at_end.
leading_columns = [c for c in df2 if c not in cols_at_end]
trailing_columns = [c for c in cols_at_end if c in df2]
df2 = df2[leading_columns + trailing_columns]

# From here on the notebook works with the filtered, reordered frame.
df = df2

In [None]:
# Display the columns remaining after the filtering/reordering step.
df.columns

In [None]:
# Summary statistics of the target column, followed by its histogram
# (100 bins) with a kernel density estimate overlaid.
print(df[eda_target].describe())
plt.figure(figsize=(12, 6))
# The trailing ';' keeps the notebook from echoing the returned plot object.
sns.histplot(df[eda_target], color='g', bins=100, kde=True);

#### Numerical data distribution

In [None]:
# Distinct column dtypes present in df (order is not meaningful, it comes from a set).
list(set(df.dtypes.tolist()))

In [None]:
# Numeric view of the data: only float64 and int64 columns are selected;
# columns of any other dtype are left out of df_num.
df_num = df.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
# One histogram (50 bins) per numeric column.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8); # trailing ';' avoids matplotlib's verbose text output in the notebook

#### Correlation

In [None]:
# Correlation of every numeric feature with the target. The target's own entry
# (always 1.0) is removed by label, so the result stays correct even if the
# target is not the last numeric column.
df_num_corr = df_num.corr()[eda_target].drop(eda_target)
# Keep the features whose absolute correlation with the target exceeds 0.5,
# sorted from the most positive to the most negative correlation.
golden_features_list = df_num_corr[abs(df_num_corr) > 0.5].sort_values(ascending=False)
print("There is {} strongly correlated values with Target {}:\n{}".format(len(golden_features_list), eda_target,golden_features_list))

In [None]:
# Scatter every numeric column against the target, five columns per figure row.
numeric_columns = df_num.columns
chunk_size = 5
for start in range(0, len(numeric_columns), chunk_size):
    stop = start + chunk_size
    sns.pairplot(
        data=df_num,
        x_vars=numeric_columns[start:stop],
        y_vars=[eda_target],
    )

In [None]:
# Display the features whose absolute correlation with the target exceeds 0.5.
golden_features_list

In [None]:
# Feature-to-feature correlations only: the target's correlations were examined separately.
corr = df_num.drop(columns=eda_target).corr()

# Blank out (NaN) every cell that is neither >= 0.5 nor <= -0.4 so that only the
# stronger relationships are drawn and annotated.
is_strong = (corr >= 0.5) | (corr <= -0.4)
strong_corr = corr.where(is_strong)

plt.figure(figsize=(12, 10))
sns.heatmap(
    strong_corr,
    cmap='viridis',
    vmax=1.0,
    vmin=-1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

#### Q -> Q (Quantitative to Quantitative relationship)

In [None]:

# Restrict the frame to the configured quantitative features (target included).
df_quantitative_values = df[quantitative_features_list]
df_quantitative_values.head()

In [None]:
# Quantitative features that are also strongly correlated with the target
# (membership is checked against the index labels of golden_features_list),
# followed by the target itself as the final entry.
features_to_analyse = [
    name
    for name in quantitative_features_list
    if name in golden_features_list.index
] + [eda_target]
features_to_analyse

In [None]:
# One regression panel (feature vs. target) per entry of features_to_analyse,
# except the last entry: that is the target itself and gets no panel.
n_panels = len(features_to_analyse) - 1
# Three panels per row, rounded up. At least one row is always requested because
# plt.subplots() raises ValueError for a grid with zero rows, which would happen
# when no feature besides the target is in the list.
n_rows = max(1, (n_panels + 2) // 3)
fig, ax = plt.subplots(n_rows, 3, figsize = (18, 12))

# fig.axes is the flat list of all panels; surplus panels are left empty.
for i, ax in enumerate(fig.axes):
    if i < n_panels:
        sns.regplot(x=features_to_analyse[i],y=eda_target, data=df[features_to_analyse], ax=ax)

#### C -> Q (Categorical to Quantitative relationship)

In [None]:
# Everything in df that is not a quantitative predictor is treated as categorical.
# The target is removed from the quantitative set first so that it stays in the
# categorical frame (it is needed as the y-axis of the plots below).
quantitative_predictors = set(quantitative_features_list) - {eda_target}
# Only names that actually exist in df are selected; a configured quantitative
# feature that is missing from df (e.g. dropped earlier) must not end up here,
# otherwise the column selection below raises KeyError.
categorical_features = [c for c in df.columns if c not in quantitative_predictors]
df_categ = df[categorical_features]
df_categ.head()

In [None]:
# Columns of the categorical frame that have object dtype ('O').
df_not_num = df_categ.select_dtypes(include = ['O'])
print('There is {} non numerical features including:\n{}'.format(len(df_not_num.columns), df_not_num.columns.tolist()))

In [None]:
# Switch the plotting list to the configured categorical features.
# NOTE: this binds the same list object as categorical_features_list (no copy),
# so re-enabling the append below would also modify categorical_features_list.
features_to_analyse = categorical_features_list
# features_to_analyse.append(eda_target)

In [None]:
# One box plot (target distribution per category) for every entry of
# features_to_analyse except the last one, which is the target itself.
n_panels = len(features_to_analyse) - 1
# Three panels per row, rounded up. At least one row is always requested because
# plt.subplots() raises ValueError for a grid with zero rows, which would happen
# when the list holds nothing but the target.
n_rows = max(1, (n_panels + 2) // 3)
fig, ax = plt.subplots(n_rows, 3, figsize = (18, 12))

# fig.axes is the flat list of all panels; surplus panels are left empty.
for i, ax in enumerate(fig.axes):
    if i < n_panels:
        sns.boxplot(x=features_to_analyse[i], y=eda_target, data=df_categ[features_to_analyse], ax=ax)

In [None]:
# Count plot for every non-numerical (object dtype) column, three per row.
n_plots = len(df_not_num.columns)

if n_plots == 0:
    # Nothing to draw; plt.subplots() would raise ValueError for a grid with zero rows.
    print("[INFO] No non-numerical features to plot.")
else:
    # Round up so that every column gets a panel (e.g. 4 columns need 2 rows).
    n_rows = (n_plots + 2) // 3
    fig, axes = plt.subplots(n_rows, 3, figsize=(12, 30))

    # zip() stops after the last column; surplus panels are left empty.
    for column, ax in zip(df_not_num.columns, fig.axes):
        sns.countplot(x=column, alpha=0.7, data=df_not_num, ax=ax)
        # Rotate the category labels after plotting so the rotation applies to
        # the tick labels that countplot has just created.
        ax.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()

## Done

In [None]:
# Marker printed when the notebook has run to the end.
print("Done.")
