In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Function to rename columns

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Lower-case every column name, then replace selected raw names with short
    snake_case names. Columns that are not listed below are only lower-cased.

    The frame is modified in place (its column labels are reassigned) and the
    same object is returned, so both `rename_columns(df)` and
    `df = rename_columns(df)` work.

    Inputs:
    df: Pandas DataFrame whose column labels are strings
    Outputs:
    The same Pandas DataFrame, with lower-cased and renamed columns
    '''

    # Keys are the lower-cased raw names and must match exactly, including
    # their stray leading/trailing spaces and original spelling.
    new_names = {
        'pcos (y/n)': 'has_pcos',
        ' age (yrs)': 'age',
        'blood group': 'blood_group',
        'pulse rate(bpm) ': 'pulse_rate',
        'rr (breaths/min)': 'respiratory_rate',
        'hb(g/dl)': 'hb',
        'cycle(r/i)': 'cycle_type',
        'cycle length(days)': 'menstrual_phase_days',
        'pregnant(y/n)': 'is_pregnant',
        'no. of aborptions': 'n_of_abortions',
        'beta-hcg(miu/ml)_first': 'beta_hcg_first',
        'beta-hcg(miu/ml)_second': 'beta_hcg_second',
        'fsh/lh': 'fsh_lh_ratio',
        'waist:hip ratio': 'waist_hip_ratio',
        'tsh (miu/l)': 'tsh',
        'amh(ng/ml)': 'amh',
        'prl(ng/ml)': 'prl',
        'vit d3 (ng/ml)': 'vit_d3',
        'prg(ng/ml)': 'prg',
        'rbs(mg/dl)': 'rbs',
        'weight gain(y/n)': 'has_weight_gain',
        'hair growth(y/n)': 'has_hair_growth',
        'skin darkening (y/n)': 'has_skin_darkening',
        'hair loss(y/n)': 'has_hair_loss',
        'pimples(y/n)': 'has_pimples',
        'fast food (y/n)': 'eats_fast_food',
        'reg.exercise(y/n)': 'exercises_reg',
        'bp _systolic (mmhg)': 'blood_pressure_systolic',
        'bp _diastolic (mmhg)': 'blood_pressure_diastolic',
        'follicle no. (l)': 'n_of_follicles_left',
        'follicle no. (r)': 'n_of_follicles_right',
        'avg. f size (l) (mm)': 'avg_follicle_size_left',
        'avg. f size (r) (mm)': 'avg_follicle_size_right',
        'endometrium (mm)': 'endometrium_size',
    }

    lowered = [colname.lower() for colname in df.columns]

    # Assign the labels directly instead of `df = df.rename(..., inplace=True)`:
    # rename() returns None when inplace=True, which made this function return None.
    df.columns = [new_names.get(name, name) for name in lowered]

    return df

In [None]:
# Function to change separator in values

def change_separator(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Replace commas with dots in the string values of every object-dtype column.

    Only values that are Python strings are changed; any other value stored in
    an object column (for example a number) is kept as it is instead of being
    turned into NaN. The frame is modified in place and the same object is
    returned.

    Inputs:
    df: Pandas DataFrame
    Outputs:
    The same Pandas DataFrame, with dots instead of commas in its string values
    '''

    for col_name in df.select_dtypes(include='object').columns:
        # Element-wise replacement guarded by isinstance: the .str accessor
        # would blank out non-string entries of a mixed column.
        df[col_name] = df[col_name].map(
            lambda value: value.replace(',', '.') if isinstance(value, str) else value
        )

    return df

In [None]:
# Function to split dataframes into continuous and discrete

def split_dataframes(df: pd.DataFrame):
    '''
    Split one dataframe holding all the data into two dataframes by dtype.

    Returns a tuple (continuous_df, discrete_df): the first holds the float64
    columns of df, the second holds its int64 columns.
    '''

    def columns_of(dtype_name):
        # Labels of the columns that select_dtypes matches for this dtype
        return df.select_dtypes(include=[dtype_name]).columns

    return df[columns_of('float64')], df[columns_of('int64')]

In [None]:
# Function to create histograms for continuous variables

def create_histograms(df: pd.DataFrame):
    '''
    Draw one 50-bin histogram per column of a dataframe that only has
    continuous variables.

    The panels are laid out three per row, the figure height grows with the
    number of rows, and panels left over in the last row are hidden. The
    figure is shown with plt.show(); nothing is returned.

    Inputs:
    df: Pandas DataFrame whose columns are all continuous variables
    '''

    panels_per_row = 3
    num_cols = df.shape[1]
    if num_cols == 0:
        # Nothing to plot, and a grid with zero rows cannot be created
        return

    # Ceiling division: just enough rows to hold every column
    num_rows = -(-num_cols // panels_per_row)

    # squeeze=False keeps `axes` two-dimensional even when there is one row
    fig, axes = plt.subplots(num_rows, panels_per_row,
                             figsize=(12, 4 * num_rows), squeeze=False)
    axes = axes.flatten()

    for ax, column in zip(axes, df.columns):
        sns.histplot(df, x=column, bins=50, ax=ax)

    # Hide the panels of the last row that received no column
    for ax in axes[num_cols:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Function to create barplots for discrete variables

def create_barplots(df: pd.DataFrame):
    '''
    Draw one bar chart of value counts per column of a dataframe that only has
    discrete variables.

    The panels are laid out two per row, the figure height grows with the
    number of rows, and a panel left over in the last row is hidden. The
    figure is shown with plt.show(); nothing is returned.

    Inputs:
    df: Pandas DataFrame whose columns are all discrete (integer) variables
    '''

    panels_per_row = 2
    num_cols = df.shape[1]
    if num_cols == 0:
        # Nothing to plot, and a grid with zero rows cannot be created
        return

    # Ceiling division: just enough rows to hold every column
    num_rows = -(-num_cols // panels_per_row)

    # squeeze=False keeps `axes` two-dimensional even when there is one row
    fig, axes = plt.subplots(num_rows, panels_per_row,
                             figsize=(10, 5 * num_rows), squeeze=False)
    axes = axes.flatten()

    for ax, column in zip(axes, df.columns):
        # discrete=True uses unit-wide bins centred on the data values, so
        # each distinct integer value gets its own bar
        sns.histplot(x=column, data=df, discrete=True, ax=ax)

    # Hide the panel of the last row that received no column
    for ax in axes[num_cols:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()