In [2]:
! pip install sweetviz

Collecting sweetviz
  Using cached sweetviz-2.1.3-py3-none-any.whl (15.1 MB)
Collecting importlib-resources>=1.2.0
  Using cached importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Installing collected packages: importlib-resources, sweetviz
Successfully installed importlib-resources-5.4.0 sweetviz-2.1.3


In [3]:
import sweetviz
import logging
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import os
import sys

In [28]:


# Use logging commands instead of print
logging.basicConfig(level=logging.INFO)

# Create the global variables
df = []

# Used when removing duplicate rows
patient_id_col = ['PatientId']

# We will fill null cells with mean
numerical_cols = ['PatientAge',
                  'bmdtest_height',
                  'bmdtest_weight',
                  'bmdtest_tscore_fn']

# We will fill null cells with mode
nominal_cols = [
    'PatientGender',
    'parentbreak',
    'ptunsteady',
    'alcohol',
    'wasfractdue2fall',
    'ptfall',
    'oralster',
    'smoke'
]

# We will fill null cells with 0
special_nominal_cols = ['arthritis',
                        'cancer',
                        'diabetes',
                        'heartdisease',
                        'respdisease',
                        'howbreak',
                        'hip',
                        'ankle',
                        'clavicle',
                        'elbow',
                        'femur',
                        'spine',
                        'wrist',
                        'shoulder',
                        'tibfib'
                        ]


def set_directory(temp_path):
    # detect the current working directory and add the sub directory
    main_path = os.getcwd()
    absolute_path = main_path + temp_path
    try:
        os.mkdir(absolute_path)
    except OSError:
        logging.error("Creation of the directory %s failed. Folder already exists." % absolute_path)
    else:
        logging.info("Successfully created the directory %s " % absolute_path)


# Remove the duplicates using the Patient ID and Baseline ID. ID's are unique, meaning we shouldn't have duplicates
def remove_duplicates_with_id():
    try:
        df.drop_duplicates(subset=['PatientId'], inplace=True)
        df.reset_index(drop=True, inplace=True)

    except ValueError as er:
        logging.error(str(er))


# Weight values are in kilograms
def lbs_to_kg(weight_value):
    if weight_value is not None:
        weightKg = weight_value * 0.45359237
        return weightKg


# Height and Weight values are in kilograms
def calculate_bmi(idx, height_value, weight_value):
    try:
        return weight_value / ((height_value / 100) ** 2)
    except ValueError as er:
        logging.error(str(er))
        logging.error(f'Unable to calculate BMI for patient id = {idx}')


# Converting Values into Metric
def data_to_metric(idx, height_value, weight_value):
    try:

        heightCm = 0
        weightKg = 0
        isMetric = True  # Flag is used when checking if the weight is metric

        # Lets convert the height
        if 1 < height_value < 2.2:
            heightCm = height_value * 100  # Convert METERS to CM

        elif 50 < height_value < 84:
            # Height value is too high to be METERS or FEET and too low to be CM.
            # Assume the height is INCHES and convert to CM
            heightCm = height_value * 2.54
            isMetric = False

        elif height_value > 125:
            # The height is probably in CM
            heightCm = height_value

        # Lets convert the weight
        if isMetric:
            weightKg = weight_value
        else:
            weightKg = lbs_to_kg(weight_value)

        return heightCm, weightKg

    except ValueError as er:
        logging.error(str(er))
        logging.error(f'Unable to convert height and weight to metric for patient id = {idx}')


def fill_bmi_with_mean():
    try:
        df['bmi'].replace(0, df['bmi'].mean(), inplace=True)
        df['bmi'].fillna((df['bmi'].mean()), inplace=True)
    except ValueError as er:
        logging.error(str(er))


def fill_numerical_with_mean():
    for column in numerical_cols:
        try:
            mean = df[column].mean()
            df[column].fillna(mean, inplace=True)
            df[column].replace(0, mean, inplace=True)
        except ValueError as er:
            logging.error(str(er))


def fill_nominal_with_mode():
    for column in nominal_cols:
        try:
            mode = df[column].mode()[0]
            df[column].fillna(mode, inplace=True)
        except ValueError as er:
            logging.error(str(er))


def fill_special_nominal_with_zero():
    for column in special_nominal_cols:
        try:
            df[column].fillna(0, inplace=True)
        except ValueError as er:
            logging.error(str(er))


def create_html_report(data, save_path):
    try:
        logging.info(f'Creating sweetviz graph for {save_path}')
        temp_analysis = sweetviz.analyze(data)
        temp_analysis.show_html(save_path, open_browser=False)

    except ValueError as er:
        logging.error(str(er))



In [39]:
# Loading the data
try:
    logging.info(f'Loading File\n')
    file_name = 'Raw.csv'
    df = pd.read_csv(file_name)

except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info(f'Setting Directories\n')
    set_directory('Output')
    set_directory('Output/pre_cleaning_results')
    set_directory('Output/analysis_results')
except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info("Selecting features from Data\n")
    all_columns = np.concatenate((patient_id_col, numerical_cols, nominal_cols, special_nominal_cols))
    df = df[all_columns]

except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info(f'Performing analysis on the unclean data\n')
    create_html_report(df, 'Output/pre_cleaning_results/pre_analysis.html')

except ValueError as e:
    logging.error(str(e))

try:
    logging.info('Performing analysis on the unclean female and male data\n')
    female_data = df[df['PatientGender'] == 1]
    create_html_report(female_data, 'Output/pre_cleaning_results/pre_analysis_female.html')

    male_data = df[df['PatientGender'] == 2]
    create_html_report(male_data, 'Output/pre_cleaning_results/pre_analysis_male.html')

except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info("Removing duplicates\n")
    remove_duplicates_with_id()
except ValueError as e:
    logging.error(str(e))
    quit()

try:

    logging.info('Converting Height and Weight to Metric\n')

    converted_data_tuple = [
        data_to_metric(df.loc[idx, 'PatientId'],
                       df.loc[idx, 'bmdtest_height'],
                       df.loc[idx, 'bmdtest_weight'])
        for idx
        in range(len(df))]

    # Get heights from tuple
    df['bmdtest_height'] = [x[0] for x in converted_data_tuple]

    # Get weights from tuple
    df['bmdtest_weight'] = [x[1] for x in converted_data_tuple]

except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info("Imputing Data into numerical Columns\n")
    fill_numerical_with_mean()
except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info("Imputing Data into nominal Columns\n")
    fill_nominal_with_mode()
except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info("Imputing Data into special nominal Columns\n")
    fill_special_nominal_with_zero()
except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info('Creating BMI column\n')

    df['bmi'] = 0
    df['bmi'] = [
        calculate_bmi(df.loc[idx, 'PatientId'],
                      df.loc[idx, 'bmdtest_height'],
                      df.loc[idx, 'bmdtest_weight'])
        for idx
        in range(len(df))]

    fill_bmi_with_mean()
except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info('Saving Data to CSV file\n')

    path = Path("Clean_Data_Main.csv")
    df.replace(r'\s+', np.nan, regex=True)
    df.to_csv(path, index=False)

    logging.info(f'Data saved to {path}\n')

except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info('Performing Analysis on all the Data\n')
    create_html_report(df, 'Output/analysis_results/analysis.html')
except ValueError as e:
    logging.error(str(e))
    quit()

try:
    logging.info('Performing Analysis on the female and male data\n')
    female_data = df[df['PatientGender'] == 1]
    create_html_report(female_data, 'Output/analysis_results/analysis_female.html')

    male_data = df[df['PatientGender'] == 2]
    create_html_report(male_data, 'Output/analysis_results/analysis_male.html')

except ValueError as e:
    logging.error(str(e))
    quit()

INFO:root:Loading File

INFO:root:Setting Directories

ERROR:root:Creation of the directory C:\Users\alexc\Pictures\Main_Osteoporosis\Osteoporosis\1-Data_CleaningOutput failed. Folder already exists.
ERROR:root:Creation of the directory C:\Users\alexc\Pictures\Main_Osteoporosis\Osteoporosis\1-Data_CleaningOutput/pre_cleaning_results failed. Folder already exists.
ERROR:root:Creation of the directory C:\Users\alexc\Pictures\Main_Osteoporosis\Osteoporosis\1-Data_CleaningOutput/analysis_results failed. Folder already exists.
INFO:root:Selecting features from Data

INFO:root:Performing analysis on the unclean data

INFO:root:Creating sweetviz graph for Output/pre_cleaning_results/pre_analysis.html


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=29.0), HTML(value='')), layout=Layout(dis…

INFO:root:Performing analysis on the unclean female and male data

INFO:root:Creating sweetviz graph for Output/pre_cleaning_results/pre_analysis_female.html



Report Output/pre_cleaning_results/pre_analysis.html was generated.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=29.0), HTML(value='')), layout=Layout(dis…

INFO:root:Creating sweetviz graph for Output/pre_cleaning_results/pre_analysis_male.html



Report Output/pre_cleaning_results/pre_analysis_female.html was generated.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=29.0), HTML(value='')), layout=Layout(dis…

INFO:root:Removing duplicates

INFO:root:Converting Height and Weight to Metric

INFO:root:Imputing Data into numerical Columns

INFO:root:Imputing Data into nominal Columns

INFO:root:Imputing Data into special nominal Columns

INFO:root:Creating BMI column

INFO:root:Saving Data to CSV file

INFO:root:Data saved to Clean_Data_Main.csv

INFO:root:Performing Analysis on all the Data

INFO:root:Creating sweetviz graph for Output/analysis_results/analysis.html



Report Output/pre_cleaning_results/pre_analysis_male.html was generated.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=30.0), HTML(value='')), layout=Layout(dis…

INFO:root:Performing Analysis on the female and male data

INFO:root:Creating sweetviz graph for Output/analysis_results/analysis_female.html



Report Output/analysis_results/analysis.html was generated.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=30.0), HTML(value='')), layout=Layout(dis…

INFO:root:Creating sweetviz graph for Output/analysis_results/analysis_male.html



Report Output/analysis_results/analysis_female.html was generated.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=30.0), HTML(value='')), layout=Layout(dis…


Report Output/analysis_results/analysis_male.html was generated.
