In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read csv

pcos_data = pd.read_csv('data/raw_data/pcos_data_full.csv')

In [3]:
# Review first 5 rows of the dataset

pcos_data.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,446,152,193,15,78,22,...,0,1.0,0,110,80,3,3,18,18,85
1,2,2,0,36,65,1615,2492116286,15,74,20,...,0,0.0,0,120,70,3,5,15,14,37
2,3,3,1,33,688,165,2527089073,11,72,18,...,1,1.0,0,120,80,13,15,18,20,10
3,4,4,0,37,65,148,2967494522,13,72,20,...,0,0.0,0,120,70,2,2,15,14,75
4,5,5,0,25,52,161,2006095444,11,72,18,...,0,0.0,0,120,80,3,4,16,14,7


In [4]:
# Let's review the shape of the dataset

pcos_data.shape

# This dataset contains 541 rows and 44 columns

(541, 44)

In [5]:
# After reviewing the columns available, we will drop some of them that we won't use in our analysis. Those columns are the following ones:

# Sl. No: this column works as a patient ID and we will drop it as it's not relevant for our analysis.
# Patient File No.: this column works as a patient ID as well and we will drop it as it's not relevant for our analysis.
# Marraige Status (Yrs): we presume this column is here in order to perform a deeper analysis about fertility. We won't consider it in our analysis though.
# Hip(inch): we already have the Waist:Hip ratio calculation. Therefore, we will drop this one.
# Waist(inch): we already have the Waist:Hip ratio calculation. Therefore, we will drop this one.
# FSH(mIU/mL): we already have the FSH/LH ratio calculation. We will drop this one.
# LH(mIU/mL): we already have the FSH/LH ratio calculation. We will drop this one.
# Height: this column is in the dataset in order to calculate the BMI. We will just keep the BMI column in order to follow the same logic used with the
# ratios columns.
# Weight: this column is in the dataset in order to calculate the BMI. We will just keep the BMI column in order to follow the same logic used with the
# ratios columns.


# Identifier columns, the marriage column, and the raw measurements already summarised by the BMI and ratio columns
columns_to_drop = [
    'Sl. No',
    'Patient File No.',
    'Marraige Status (Yrs)',
    'Hip(inch)',
    'Waist(inch)',
    'FSH(mIU/mL)',
    'LH(mIU/mL)',
    'Weight (Kg)',
    'Height(Cm) ',  # the raw column name carries a trailing blank
]
pcos_data.drop(columns=columns_to_drop, inplace=True)
pcos_data

# We now have 35 columns instead of the original 44

Unnamed: 0,PCOS (Y/N),Age (yrs),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),Hb(g/dl),Cycle(R/I),Cycle length(days),Pregnant(Y/N),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,0,28,193,15,78,22,1048,2,5,0,...,0,1.0,0,110,80,3,3,18,18,85
1,0,36,2492116286,15,74,20,117,2,5,1,...,0,0.0,0,120,70,3,5,15,14,37
2,1,33,2527089073,11,72,18,118,2,5,1,...,1,1.0,0,120,80,13,15,18,20,10
3,0,37,2967494522,13,72,20,12,2,5,0,...,0,0.0,0,120,70,2,2,15,14,75
4,0,25,2006095444,11,72,18,10,2,5,1,...,0,0.0,0,120,80,3,4,16,14,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,0,35,185,17,72,16,11,2,5,0,...,0,0.0,0,110,70,1,0,175,10,67
537,0,30,253,15,72,18,108,2,5,1,...,0,0.0,0,110,70,9,7,19,18,82
538,0,36,234,13,74,20,108,2,6,0,...,0,0.0,0,110,80,1,0,18,9,73
539,0,27,222,15,74,20,12,4,2,0,...,1,0.0,0,110,70,7,6,18,16,115


In [6]:
# Before renaming the columns, let's review if any of them have blank spaces

print(pcos_data.columns)

# Only " Age (yrs)" and "Pulse rate(bpm) " still have blank spaces before/after the column name ("Height(Cm) " had one too, but we already dropped it). We should consider them in our function

Index(['PCOS (Y/N)', ' Age (yrs)', 'BMI', 'Blood Group', 'Pulse rate(bpm) ',
       'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)',
       'Pregnant(Y/N)', 'No. of aborptions', 'beta-HCG(mIU/mL)_first',
       'beta-HCG(mIU/mL)_second', 'FSH/LH', 'Waist:Hip Ratio', 'TSH (mIU/L)',
       'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)', 'PRG(ng/mL)',
       'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)',
       'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)',
       'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)'],
      dtype='object')


In [7]:
# Now that we have dropped the columns that we are not going to use in our analysis, we will rename the remaining columns in lower case and with underscores
# in order to make it easier to work with them. We will create a rename_columns() function for this.

# If you want to review the metric units of each columns, please refer to the READ.ME file with the description of all of them.

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Lower-case every column name of a Pandas DataFrame and give the known
    columns short snake_case names.

    The DataFrame is modified in place, so callers that ignore the return
    value still see the renamed columns, and the same DataFrame is returned.

    Inputs:
    df: Pandas DataFrame
    Outputs:
    The same Pandas DataFrame, with lower-cased and renamed columns
    '''

    # Lower-case first so the mapping below needs a single spelling per column.
    df.columns = [colname.lower() for colname in df.columns]

    # rename(..., inplace=True) returns None, so its result must not be
    # assigned back to df. Some keys keep the leading/trailing blanks that
    # are present in the raw column names.
    df.rename(columns={'pcos (y/n)':'has_pcos',
                       ' age (yrs)':'age',
                       'blood group':'blood_group',
                       'pulse rate(bpm) ':'pulse_rate',
                       'rr (breaths/min)':'respiratory_rate',
                       'hb(g/dl)':'hb',
                       'cycle(r/i)':'cycle_type',
                       'cycle length(days)':'menstrual_phase_days',
                       'pregnant(y/n)':'is_pregnant',
                       'no. of aborptions':'n_of_abortions',
                       'beta-hcg(miu/ml)_first':'beta_hcg_first',
                       'beta-hcg(miu/ml)_second':'beta_hcg_second',
                       'fsh/lh':'fsh_lh_ratio',
                       'waist:hip ratio':'waist_hip_ratio',
                       'tsh (miu/l)':'tsh',
                       'amh(ng/ml)':'amh',
                       'prl(ng/ml)':'prl',
                       'vit d3 (ng/ml)':'vit_d3',
                       'prg(ng/ml)':'prg',
                       'rbs(mg/dl)':'rbs',
                       'weight gain(y/n)':'has_weight_gain',
                       'hair growth(y/n)':'has_hair_growth',
                       'skin darkening (y/n)':'has_skin_darkening',
                       'hair loss(y/n)':'has_hair_loss',
                       'pimples(y/n)':'has_pimples',
                       'fast food (y/n)':'eats_fast_food',
                       'reg.exercise(y/n)':'exercises_reg',
                       'bp _systolic (mmhg)':'blood_pressure_systolic',
                       'bp _diastolic (mmhg)':'blood_pressure_diastolic',
                       'follicle no. (l)':'n_of_follicles_left',
                       'follicle no. (r)':'n_of_follicles_right',
                       'avg. f size (l) (mm)':'avg_follicle_size_left',
                       'avg. f size (r) (mm)':'avg_follicle_size_right',
                       'endometrium (mm)':'endometrium_size',
                       }, inplace=True)

    return df

In [8]:
# Execute the function and review column names

rename_columns(pcos_data)
print(pcos_data.columns)

Index(['has_pcos', 'age', 'bmi', 'blood_group', 'pulse_rate',
       'respiratory_rate', 'hb', 'cycle_type', 'menstrual_phase_days',
       'is_pregnant', 'n_of_abortions', 'beta_hcg_first', 'beta_hcg_second',
       'fsh_lh_ratio', 'waist_hip_ratio', 'tsh', 'amh', 'prl', 'vit_d3', 'prg',
       'rbs', 'has_weight_gain', 'has_hair_growth', 'has_skin_darkening',
       'has_hair_loss', 'has_pimples', 'eats_fast_food', 'exercises_reg',
       'blood_pressure_systolic', 'blood_pressure_diastolic',
       'n_of_follicles_left', 'n_of_follicles_right', 'avg_follicle_size_left',
       'avg_follicle_size_right', 'endometrium_size'],
      dtype='object')


In [9]:
# Review data types in order to understand if any conversion would be needed

pcos_data.dtypes

# We can see that many of the columns that are numbers are listed as "object". We will need to change their type to either int64 or float64 in order
# to be able to work properly with them

has_pcos                      int64
age                           int64
bmi                          object
blood_group                   int64
pulse_rate                    int64
respiratory_rate              int64
hb                           object
cycle_type                    int64
menstrual_phase_days          int64
is_pregnant                   int64
n_of_abortions                int64
beta_hcg_first               object
beta_hcg_second              object
fsh_lh_ratio                 object
waist_hip_ratio              object
tsh                          object
amh                          object
prl                          object
vit_d3                       object
prg                          object
rbs                          object
has_weight_gain               int64
has_hair_growth               int64
has_skin_darkening            int64
has_hair_loss                 int64
has_pimples                   int64
eats_fast_food              float64
exercises_reg               

In [10]:
# Even if all variables are listed as numbers by looking at the dataset (not at the .dtypes), not all of them are numerical.
# The following variables are originally categorical: has_pcos, blood_group, cycle_type, is_pregnant, has_weight_gain,
# has_hair_growth, has_skin_darkening, has_hair_loss, has_pimples, eats_fast_food, exercises_reg

In [11]:
# Our dataset has "," instead of "." in our numerical columns. We will need to change this in order to be able to transform the columns to float64/int64.
# Let's create a function for this

def change_separator(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Replace commas with dots in the text values of a DataFrame's object columns.

    Only columns of dtype object are touched and, within them, only values
    that are strings; any other value (numbers, missing values) is kept as it
    is. The DataFrame is modified in place and the same DataFrame is returned.

    Inputs:
    df: Pandas DataFrame
    Outputs:
    The same Pandas DataFrame, with dots instead of commas in its text values
    '''

    def _comma_to_dot(value):
        # Leave non-string entries alone so they are not turned into NaN.
        return value.replace(',', '.') if isinstance(value, str) else value

    for col_name in df.select_dtypes(include='object').columns:
        df[col_name] = df[col_name].map(_comma_to_dot)

    return df

In [12]:
# Execute function

change_separator(pcos_data)

Unnamed: 0,has_pcos,age,bmi,blood_group,pulse_rate,respiratory_rate,hb,cycle_type,menstrual_phase_days,is_pregnant,...,has_pimples,eats_fast_food,exercises_reg,blood_pressure_systolic,blood_pressure_diastolic,n_of_follicles_left,n_of_follicles_right,avg_follicle_size_left,avg_follicle_size_right,endometrium_size
0,0,28,19.3,15,78,22,10.48,2,5,0,...,0,1.0,0,110,80,3,3,18,18,8.5
1,0,36,24.92116286,15,74,20,11.7,2,5,1,...,0,0.0,0,120,70,3,5,15,14,3.7
2,1,33,25.27089073,11,72,18,11.8,2,5,1,...,1,1.0,0,120,80,13,15,18,20,10
3,0,37,29.67494522,13,72,20,12,2,5,0,...,0,0.0,0,120,70,2,2,15,14,7.5
4,0,25,20.06095444,11,72,18,10,2,5,1,...,0,0.0,0,120,80,3,4,16,14,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,0,35,18.5,17,72,16,11,2,5,0,...,0,0.0,0,110,70,1,0,17.5,10,6.7
537,0,30,25.3,15,72,18,10.8,2,5,1,...,0,0.0,0,110,70,9,7,19,18,8.2
538,0,36,23.4,13,74,20,10.8,2,6,0,...,0,0.0,0,110,80,1,0,18,9,7.3
539,0,27,22.2,15,74,20,12,4,2,0,...,1,0.0,0,110,70,7,6,18,16,11.5


In [13]:
# Fix one value in column beta_hcg_second that was entered with an additional dot (it was preventing us from converting the variable to float64)

# The cell holds a string at this point, so the corrected value is written as a string as well
pcos_data.loc[123, 'beta_hcg_second'] = "1.99"

pcos_data.loc[123, 'beta_hcg_second'] # Review value after change

'1.99'

In [14]:
# There's another similar issue, this time with a value from the column amh (we have the letter "a" instead of a number)

# First, let's transform this value to a NaN (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
pcos_data = pcos_data.replace("a", np.nan)

pcos_data.loc[305, 'amh'] # Review the new value

nan

In [15]:
# Transform object columns in float64

columns_to_convert = pcos_data.select_dtypes(include='object').columns.tolist()
pcos_data[columns_to_convert] = pcos_data[columns_to_convert].astype('float64')

In [16]:
# Let's review if all our columns are now numerical

pcos_data.dtypes

# We can confirm that all of our variables are numerical now

has_pcos                      int64
age                           int64
bmi                         float64
blood_group                   int64
pulse_rate                    int64
respiratory_rate              int64
hb                          float64
cycle_type                    int64
menstrual_phase_days          int64
is_pregnant                   int64
n_of_abortions                int64
beta_hcg_first              float64
beta_hcg_second             float64
fsh_lh_ratio                float64
waist_hip_ratio             float64
tsh                         float64
amh                         float64
prl                         float64
vit_d3                      float64
prg                         float64
rbs                         float64
has_weight_gain               int64
has_hair_growth               int64
has_skin_darkening            int64
has_hair_loss                 int64
has_pimples                   int64
eats_fast_food              float64
exercises_reg               

In [17]:
# We still need to fix the nan value from the column 'amh'. In this case, we will replace it for the median. We are doing this, instead of replacing it for
# the mean, because this second measure is highly affected by extreme values. As we can see in the dataset, there's one extreme value of 66 (some studies show
# some measurements of the AMH of 48 ng/mL though).
# As a normal range for this hormone is between 1.5 and 5.0 (median = 3.5) and this patient doesn't have PCOS (the higher this value, the bigger the probability of
# getting PCOS), we will replace this value for the median.

#pcos_data['amh'].median() # 3.7

median_value = pcos_data['amh'].median()
pcos_data['amh'] = pcos_data['amh'].fillna(median_value)

pcos_data['amh'][305] # Review value

3.7

In [18]:
# Let's check if we still have some nan values

pcos_data.isna().sum()

# We can see here that there's still one missing value under the column "eats_fast_food". Let's review it.

has_pcos                    0
age                         0
bmi                         0
blood_group                 0
pulse_rate                  0
respiratory_rate            0
hb                          0
cycle_type                  0
menstrual_phase_days        0
is_pregnant                 0
n_of_abortions              0
beta_hcg_first              0
beta_hcg_second             0
fsh_lh_ratio                0
waist_hip_ratio             0
tsh                         0
amh                         0
prl                         0
vit_d3                      0
prg                         0
rbs                         0
has_weight_gain             0
has_hair_growth             0
has_skin_darkening          0
has_hair_loss               0
has_pimples                 0
eats_fast_food              1
exercises_reg               0
blood_pressure_systolic     0
blood_pressure_diastolic    0
n_of_follicles_left         0
n_of_follicles_right        0
avg_follicle_size_left      0
avg_follic

In [19]:
# Let's check the frequency counts for this variable

pcos_data['eats_fast_food'].value_counts()

# There's not an important level of imbalance data in this case. As we only have two values for this column (no = '0.0' and yes = '1.0'), we will replace this 
# value for "0.0" in order to balance the data a little bit more.

eats_fast_food
1.0    278
0.0    262
Name: count, dtype: int64

In [20]:
# Replace nan value in column "eats_fast_food"

pcos_data['eats_fast_food'] = pcos_data['eats_fast_food'].fillna(0)

In [21]:
# Review the amount of values for 0.0

pcos_data['eats_fast_food'].value_counts()

# As we can see, we have 263 instead of 262 values with 0.0. The replacement was performed properly.


eats_fast_food
1.0    278
0.0    263
Name: count, dtype: int64

In [22]:
# We shouldn't have any nan values at the moment but let's review it again just in case

pcos_data.isna().sum()

has_pcos                    0
age                         0
bmi                         0
blood_group                 0
pulse_rate                  0
respiratory_rate            0
hb                          0
cycle_type                  0
menstrual_phase_days        0
is_pregnant                 0
n_of_abortions              0
beta_hcg_first              0
beta_hcg_second             0
fsh_lh_ratio                0
waist_hip_ratio             0
tsh                         0
amh                         0
prl                         0
vit_d3                      0
prg                         0
rbs                         0
has_weight_gain             0
has_hair_growth             0
has_skin_darkening          0
has_hair_loss               0
has_pimples                 0
eats_fast_food              0
exercises_reg               0
blood_pressure_systolic     0
blood_pressure_diastolic    0
n_of_follicles_left         0
n_of_follicles_right        0
avg_follicle_size_left      0
avg_follic

In [23]:
# A minor change that we will make is again on the column eats_fast_food. As we can see here, this column has two possible values (no = '0.0' or yes = '1.0') but
# the variable is listed as float64 instead of int64 (other binary variables are int64). Therefore, we will change its type for consistency.

pcos_data['eats_fast_food'] = pcos_data['eats_fast_food'].astype('int64')

In [24]:
# Let's review the value_counts()

pcos_data['eats_fast_food'].value_counts() # The variable is now a int64 one

eats_fast_food
1    278
0    263
Name: count, dtype: int64

In [25]:
# We will make comments about some specific columns in our dataframe:

pcos_data.nunique()

# All columns that start with "has_", plus is_pregnant, eats_fast_food and exercises_reg have 2 possible values (no = '0' or yes = '1'). There's no more data 
# cleaning to do on them

has_pcos                      2
age                          29
bmi                         355
blood_group                   8
pulse_rate                   11
respiratory_rate              8
hb                           46
cycle_type                    3
menstrual_phase_days         12
is_pregnant                   2
n_of_abortions                6
beta_hcg_first              307
beta_hcg_second             202
fsh_lh_ratio                512
waist_hip_ratio              96
tsh                         308
amh                         300
prl                         481
vit_d3                      331
prg                          89
rbs                          55
has_weight_gain               2
has_hair_growth               2
has_skin_darkening            2
has_hair_loss                 2
has_pimples                   2
eats_fast_food                2
exercises_reg                 2
blood_pressure_systolic       6
blood_pressure_diastolic      5
n_of_follicles_left          21
n_of_fol

In [26]:
# age:
# Even if in some situations the variable 'age' is considered as continuous, we will keep it as a discrete variable (int64).

In [27]:
# bmi:

pcos_data['bmi'].describe()

# From using describe(), we can see that the BMI's mean is 24.3. When talking about BMI ranges, values of 25.0 or higher are considered overweight and values of
# 30.0 or higher obese (we can see that the max value is 38.9 in this case). As this dataset is from a city in India and the average BMI in this country is
# around 22, we can say that the mean's value is plausible.

count    541.000000
mean      24.311285
std        4.056399
min       12.417882
25%       21.641274
50%       24.238227
75%       26.634958
max       38.900000
Name: bmi, dtype: float64

In [28]:
# blood_group, pulse_rate, respiratory_rate, hb:

#pcos_data['blood_group'].value_counts() # This variable is numerical discrete and each value is associate to one blood type (more information on the READ.ME file)
#pcos_data['pulse_rate'].describe() # This variable's mean has an accurate value, as does the max. There are two low outliers that, even if they are not affecting
# the mean, we will replace with the median because it's impossible to have a pulse rate of 13 or 18 bpm.

# Both implausible readings are overwritten with the same value
for outlier_row in (223, 296):
    pcos_data.loc[outlier_row, 'pulse_rate'] = 72.0

#pcos_data['respiratory_rate'].describe() The same as before is applied for this variable: there are just a small number of outliers but overall values are accurate
#pcos_data['hb'].describe() # The average value of this hormone is between 12 to 16 g/dl. The mean is a bit under this range but it's because of some isolated
# min values. Overall, most of the data falls into the range value specified before.

# As our dataset is not that big, we are trying to avoid dropping rows or replacing values. Nevertheless, we think it's important to do an analysis and explanation
# of these variables in order to make it easier to understand the steps and thoughts behind the data cleaning process.


In [29]:
# cycle_type:

pcos_data['cycle_type'].value_counts()

# As we can see here, this numeric/discrete variable is accepting 3 possible values being '2' (regular) and '4' (irregular) the most common ones. There's also one 
# case that's equal to 5. Unfortunately, there's no information that could explain why that value is there (we presume this could be a human error). In this case,
# we decided to manipulate this value because in general women have a regular or irregular period as options. As '4' is the less representative value and this
# patient has PCOS (which is more common in women with irregular periods), we will replace this isolated case with '4'.


cycle_type
2    390
4    150
5      1
Name: count, dtype: int64

In [30]:
# Replace value in column 'cycle_type'

pcos_data['cycle_type'] = pcos_data['cycle_type'].replace(5,4)
pcos_data['cycle_type'].value_counts()


cycle_type
2    390
4    151
Name: count, dtype: int64

In [31]:
# menstrual_phase_days, n_of_abortions:

# pcos_data['menstrual_phase_days'].describe() # We have some extreme values in this column but it's possible that, at the time of this analysis, some women suffer
# from amenorrhea (lack of menstrual period) or some have a long one. Nevertheless, the mean is showing an accurate value (around 5 days).
pcos_data['n_of_abortions'].value_counts() # There are some outliers in this column but these are cases that can happen among women.

n_of_abortions
0    437
1     69
2     22
3     10
4      2
5      1
Name: count, dtype: int64

In [32]:
# beta_hcg_first, beta_hcg_second

#pcos_data['beta_hcg_first'].describe() # Even if these numbers seem very high, the value of this hormone during pregnancy can go up to 280.000 mlU/ml
pcos_data['beta_hcg_second'].describe() # This is a second measurement but the comments are the same as above


count      541.000000
mean       238.232993
std       1603.825706
min          0.990000
25%          1.990000
50%          1.990000
75%         97.630000
max      25000.000000
Name: beta_hcg_second, dtype: float64

In [33]:
# fsh_lh_ratio, waist_hip_ratio

pcos_data['fsh_lh_ratio'].describe() # There's one extreme value of 1372.83 that's completely out of range, so we decided that it needs to be replaced
# with another value. We will use the median rather than the mean: the mean is heavily affected by extreme values and will probably change
# drastically after this modification, whereas the median keeps our modification true to the overall data that we have.

count     541.000000
mean        6.904831
std        60.691822
min         0.002146
25%         1.416244
50%         2.169231
75%         3.959184
max      1372.826087
Name: fsh_lh_ratio, dtype: float64

In [34]:
# Let's replace this value

median_value_fsh_lh_ratio = pcos_data['fsh_lh_ratio'].median()
pcos_data['fsh_lh_ratio'] = pcos_data['fsh_lh_ratio'].replace(1372.826087,median_value_fsh_lh_ratio)
pcos_data['fsh_lh_ratio'].describe() # Now we can see that we have a new max value (it went from 1372 to 327 and the mean has changed from 6.9 to 4.3! - this is a
# good example of how the mean is affected by extreme values). This new value is also not accurate, let's replace it as well.

# No index was specified because we double checked that there was only one occurrence for each value


count    541.000000
mean       4.371269
std       14.900748
min        0.002146
25%        1.416244
50%        2.169231
75%        3.939394
max      327.000000
Name: fsh_lh_ratio, dtype: float64

In [35]:
# Replace the new max values

# Both remaining extreme values are swapped for the median in a single call
fsh_lh_outliers = [327.0, 61.875]
pcos_data['fsh_lh_ratio'] = pcos_data['fsh_lh_ratio'].replace(fsh_lh_outliers, median_value_fsh_lh_ratio)
pcos_data['fsh_lh_ratio'].describe()

# As we can see here, we replaced two more values with the median. We still have some extreme values that are not accurate but, in this specific situation,
# we will stop with the replacements: we don't want to modify the data that much and we've already replaced three values. Having said that, we consider that
# this was a good decision because the mean has decreased by about 50% and it's now closer to a more accurate range (2-3/4)

# No index was specified because we double checked that there was only one occurrence of each value


count    541.000000
mean       3.660481
std        4.760532
min        0.002146
25%        1.416244
50%        2.169231
75%        3.871287
max       50.000000
Name: fsh_lh_ratio, dtype: float64

In [36]:
# Let's see if we have a similar situation with the waist_hip_ratio column

pcos_data['waist_hip_ratio'].describe() # In this case, we can see that ratios are among the normal values. No modification is needed.

count    541.000000
mean       0.891895
std        0.046326
min        0.755556
25%        0.857143
50%        0.894737
75%        0.928571
max        0.979167
Name: waist_hip_ratio, dtype: float64

In [37]:
# tsh, amh, prl, vit_d3, prg, rbs

# pcos_data['tsh'].describe() # The extreme value at the moment is 65, which is not accurate. Nevertheless, our mean is inside the acceptable range of 0.5 - 5 mIU/L.
# Therefore, no modification will be done in this case.
pcos_data['amh'].describe() # In this case, the extreme value of 66.0 is increasing the mean a little and it's taking it to an inaccurate level. Let's replace
# this value with the median.

count    541.000000
mean       5.620634
std        5.876742
min        0.100000
25%        2.010000
50%        3.700000
75%        6.900000
max       66.000000
Name: amh, dtype: float64

In [38]:
# Replace the AMH extreme values

median_amh = pcos_data['amh'].median()
amh_outliers = [66.0, 32.0, 28.60]
pcos_data['amh'] = pcos_data['amh'].replace(amh_outliers, median_amh)
pcos_data['amh'].describe() # We can see that the mean has changed after the three replacements, but not by much. From this information, we can see
# that the median AMH level is 3.7, which is inside of the normal range (a value bigger than 4.0 is high and could mean PCOS). Having said this,
# data is overall consistent and we will stop with the replacements.

# No index was specified because we double checked that there was only one occurrence of each value

count    541.000000
mean       5.407140
std        5.049076
min        0.100000
25%        2.010000
50%        3.700000
75%        6.740000
max       26.800000
Name: amh, dtype: float64

In [39]:
# prl, vit_d3, prg, rbs

# pcos_data['prl'].describe() # In non-pregnant women a common value is less than 25 ng/mL. For pregnant women, this value can go up to 400 ng/mL. Therefore, we can
# say that data is accurate and no modification will be needed.
pcos_data['vit_d3'].describe() # The maximum value (6014.66) is too high and inaccurate. Therefore, we will replace it with the median.

count     541.000000
mean       49.915874
std       346.206599
min         0.000000
25%        20.800000
50%        25.900000
75%        34.500000
max      6014.660000
Name: vit_d3, dtype: float64

In [40]:
# vit_d3 replacement

median_vit_d3 = pcos_data['vit_d3'].median()
vit_d3_outliers = [6014.66, 5418.60, 90.0]
pcos_data['vit_d3'] = pcos_data['vit_d3'].replace(vit_d3_outliers, median_vit_d3)
pcos_data['vit_d3'].describe() # After modifying three values, we can see how much the mean has dropped; it's now at an accurate value.

# No index was specified because we double checked that there was only one occurrence of each value

count    541.000000
mean      28.759571
std       12.239953
min        0.000000
25%       20.800000
50%       25.900000
75%       34.200000
max       87.200000
Name: vit_d3, dtype: float64

In [41]:
#pcos_data['prg'].describe() # Max levels of progesterone are reached in the third trimester of pregnancy and they can go up to 214 ng/mL. No modification will be
# performed here.
pcos_data['rbs'].describe() # Values are accurate and they can be higher than 200 mg/dL for people with diabetes.

count    541.000000
mean      99.835860
std       18.559298
min       60.000000
25%       92.000000
50%      100.000000
75%      107.000000
max      350.000000
Name: rbs, dtype: float64

In [42]:
#pcos_data['blood_pressure_systolic'].describe() # Overall values are accurate but there's an outlier that doesn't make
# sense, as its value is 12 (a normal systolic blood pressure should be less than 120 mmHg, but 12.0 is too low - this measure was probably a human mistake).
# We will replace its value with the median (110.0)
pcos_data.loc[161, 'blood_pressure_systolic'] = 110.0

#pcos_data['blood_pressure_diastolic'].describe() # Overall values are accurate but there's an outlier that doesn't make
# sense, as its value is 8 (a normal diastolic blood pressure should be less than 80 mmHg, but 8.0 is too low - this measure was probably a human mistake).
# We will replace its value with the median (80.0)
pcos_data.loc[200, 'blood_pressure_diastolic'] = 80.0

#pcos_data['n_of_follicles_left'].describe() # Normal values are between 6 - 10 and higher than 12 could mean a high follicle reserve.

#pcos_data['n_of_follicles_right'].describe() # Normal values are between 6 - 10 and higher than 12 could mean a high follicle reserve.

pcos_data['avg_follicle_size_left'].describe() # Overall we have accurate values in this column. The min of 0.00 might be pushing down the mean a little
# but we don't want to modify this data as there's a chance that a woman has POF (premature ovarian failure): a disease in which the follicles in the ovaries
# rapidly decrease, leaving few or no residual follicles, in women younger than 40 years (which is exactly the case here). Also, the patients that show this value
# are the ones that show n_of_follicles_left = 0.00. Therefore, data for them is accurate.
# Nevertheless, we have three cases with inaccurate data: these patients have n_of_follicles_left = 0.00 but an avg_follicle_size_left > 0.00,
# which cannot happen. We will modify those cases.

count    541.000000
mean      15.018115
std        3.566839
min        0.000000
25%       13.000000
50%       15.000000
75%       18.000000
max       24.000000
Name: avg_follicle_size_left, dtype: float64

In [43]:
# Before continuing with other variables, we convert blood_pressure_systolic and blood_pressure_diastolic
# from int64 to float64, because they are continuous variables rather than discrete ones.

for bp_column in ('blood_pressure_systolic', 'blood_pressure_diastolic'):
    pcos_data[bp_column] = pcos_data[bp_column].astype('float64')

In [44]:
# Confirm the variable types after the conversion

# pcos_data['blood_pressure_systolic'].dtype # float64
pcos_data['blood_pressure_diastolic'].dtype # float64

dtype('float64')

In [45]:
# Zero out the inaccurate avg_follicle_size_left values in one assignment.
# The listed row labels are the three cases identified during the review of this column.

left_size_rows_to_zero = [128, 235, 251]
pcos_data.loc[left_size_rows_to_zero, 'avg_follicle_size_left'] = 0.0 # Inaccurate values were modified
pcos_data['avg_follicle_size_left'].describe()

count    541.000000
mean      14.927542
std        3.735317
min        0.000000
25%       13.000000
50%       15.000000
75%       18.000000
max       24.000000
Name: avg_follicle_size_left, dtype: float64

In [46]:
# avg_follicle_size_right: overall values are accurate, but we have the same issue as with the left ovary:
# patients with no follicles in a specific ovary still have a follicle-size measurement.
# Either the average size or the number of follicles could be the wrong measure, but we treat the number of
# follicles as the reference, so the average size is set to 0.0 for the affected rows.
# This means modifying 10 values. That looks like a lot (even for the size of this dataset), but it only
# affects about 1.8% of the 541 rows.

# Row labels of the 10 affected patients, corrected in a single assignment.
right_size_rows_to_zero = [127, 261, 288, 298, 299, 409, 528, 531, 536, 538]
pcos_data.loc[right_size_rows_to_zero, 'avg_follicle_size_right'] = 0.0 # Values modified

# Kept as the last expression of the cell so the summary is actually displayed
# (and reflects the corrected values).
pcos_data['avg_follicle_size_right'].describe()

In [47]:
# endometrium_size

pcos_data['endometrium_size'].describe() # Overall values are accurate, with an extreme max value of 18 and two extreme min values of 0.
# We won't modify these values because there are only a few of them. The min of 0.00 is also kept because several different situations
# can explain it, one of them being the inability to measure the endometrium because of a distortion in the uterine cavity.


count    541.000000
mean       8.475915
std        2.165381
min        0.000000
25%        7.000000
50%        8.500000
75%        9.800000
max       18.000000
Name: endometrium_size, dtype: float64

In [48]:
# As a final step in the data cleaning process, we round some of the columns.
# Each entry maps a column name to the number of decimal places it keeps.

rounding_decimals = {
    'bmi': 1,
    'hb': 1,
    'fsh_lh_ratio': 2,
    'waist_hip_ratio': 2,
    'vit_d3': 2,
}
for column_name, n_decimals in rounding_decimals.items():
    pcos_data[column_name] = pcos_data[column_name].round(n_decimals)

In [49]:
# Last review of data and comments
# display() renders the cleaned dataframe (pandas shows a truncated preview of rows and columns) for a final visual check.

display(pcos_data)

Unnamed: 0,has_pcos,age,bmi,blood_group,pulse_rate,respiratory_rate,hb,cycle_type,menstrual_phase_days,is_pregnant,...,has_pimples,eats_fast_food,exercises_reg,blood_pressure_systolic,blood_pressure_diastolic,n_of_follicles_left,n_of_follicles_right,avg_follicle_size_left,avg_follicle_size_right,endometrium_size
0,0,28,19.3,15,78,22,10.5,2,5,0,...,0,1,0,110.0,80.0,3,3,18.0,18.0,8.5
1,0,36,24.9,15,74,20,11.7,2,5,1,...,0,0,0,120.0,70.0,3,5,15.0,14.0,3.7
2,1,33,25.3,11,72,18,11.8,2,5,1,...,1,1,0,120.0,80.0,13,15,18.0,20.0,10.0
3,0,37,29.7,13,72,20,12.0,2,5,0,...,0,0,0,120.0,70.0,2,2,15.0,14.0,7.5
4,0,25,20.1,11,72,18,10.0,2,5,1,...,0,0,0,120.0,80.0,3,4,16.0,14.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,0,35,18.5,17,72,16,11.0,2,5,0,...,0,0,0,110.0,70.0,1,0,17.5,0.0,6.7
537,0,30,25.3,15,72,18,10.8,2,5,1,...,0,0,0,110.0,70.0,9,7,19.0,18.0,8.2
538,0,36,23.4,13,74,20,10.8,2,6,0,...,0,0,0,110.0,80.0,1,0,18.0,0.0,7.3
539,0,27,22.2,15,74,20,12.0,4,2,0,...,1,0,0,110.0,70.0,7,6,18.0,16.0,11.5


In [50]:
# Now that the data cleaning process is finished, we save the dataframe to a CSV file
# so that we can use it to create our machine learning model.
# The file is semicolon-separated and the row index is not written.

cleaned_csv_path = 'pcos_data_cleaned.csv'
pcos_data.to_csv(
    cleaned_csv_path,
    sep=";",
    index=False,
)

In [51]:
# End of data cleaning process