In [1]:
import pandas as pd
#reading the data
data = pd.read_csv('raw_data.csv', encoding='latin1') 
df=pd.DataFrame(data)
df.head()

Unnamed: 0,ID,Cycle length,Phase,Mood,Workout,Age,Sleep
0,1,26,Menstrual,3.0,57 hours,,
1,2,30 days,Menstrual,2.0,Less than 2 hours,,
2,3,28,Menstrual,5.0,0 hours (I dont work out),,
3,4,28 days,Menstrual,5.0,Less than 2 hours,,
4,5,25-28 days,Menstrual,6.0,0 hours (I dont work out),,


In [2]:
#checking for null values
df.isna().sum()

ID                0
Cycle length      5
Phase             0
Mood              5
Workout           5
Age             135
Sleep           145
dtype: int64

## Cleaning the data

In [13]:
#renaming the cycle length column
df.rename(columns={'Cycle length': 'Cycle_length'}, inplace=True)
#check for the change
df.columns

Index(['ID', 'Cycle_length', 'Phase', 'Mood', 'Workout', 'Age', 'Sleep'], dtype='object')

In [14]:
#cleaning cycle length
import re
import numpy as np

def clean_days(day_string: str) -> float:
    if pd.isna(day_string):
        return day_string  # Return NaN as is
    else:
        days=re.findall(r'\d+',day_string)   #find all the specified dates e.g., from 28 to 30 = [28,30]
        days=list(map(float, days))
        avg_days=np.average(days)   #calculate the average of the days
        return avg_days

df.Cycle_length=df.Cycle_length.apply(clean_days)
#replacing null values
df.Cycle_length.fillna(df.Cycle_length.mean(), inplace=True)
#checking the changes
df.head()

TypeError: expected string or bytes-like object, got 'float'

In [5]:
#clean for question misunderstanding
def remove_outliers(days):
    if days < 20:
        return 28
    else:
        return days
    
df.Cycle_length=df.Cycle_length.apply(remove_outliers)
#checking the changes
df.head()

Unnamed: 0,ID,Cycle_length,Phase,Mood,Workout,Age,Sleep
0,1,26.0,Menstrual,3.0,57 hours,,
1,2,30.0,Menstrual,2.0,Less than 2 hours,,
2,3,28.0,Menstrual,5.0,0 hours (I dont work out),,
3,4,28.0,Menstrual,5.0,Less than 2 hours,,
4,5,26.5,Menstrual,6.0,0 hours (I dont work out),,


In [6]:
# Cleaning all other columns
df.Age.fillna(25, inplace=True)
df.Sleep.fillna('6-8 hours', inplace=True)
df.Mood.fillna(df.Mood.mean(), inplace=True)
df.Workout.fillna(df.Workout.mode().iloc[0], inplace=True)

print("\nChecking for remaining null values:")
print(df.isna().sum())


Checking for remaining null values:
ID              0
Cycle_length    0
Phase           0
Mood            0
Workout         0
Age             0
Sleep           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Age.fillna(25, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Sleep.fillna('6-8 hours', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as 

In [7]:
#drop the ID column for analysis
df2=df.drop(columns=['ID'])
df2.columns

Index(['Cycle_length', 'Phase', 'Mood', 'Workout', 'Age', 'Sleep'], dtype='object')

In [8]:
# Get dummy variables for categorical columns
df_final=pd.get_dummies(df2, columns=['Sleep', 'Workout'], drop_first=True)

In [9]:
df_final.head()

Unnamed: 0,Cycle_length,Phase,Mood,Age,Sleep_4-5 hours,Sleep_6-7 hours,Sleep_6-8 hours,Sleep_8-9 hours,Sleep_Less than 4 hours,Workout_24 hours,Workout_57 hours,Workout_810 hours,Workout_Less than 2 hours,Workout_More than 10 hours
0,26.0,Menstrual,3.0,25,False,False,True,False,False,False,True,False,False,False
1,30.0,Menstrual,2.0,25,False,False,True,False,False,False,False,False,True,False
2,28.0,Menstrual,5.0,25,False,False,True,False,False,False,False,False,False,False
3,28.0,Menstrual,5.0,25,False,False,True,False,False,False,False,False,True,False
4,26.5,Menstrual,6.0,25,False,False,True,False,False,False,False,False,False,False


In [10]:
#bootstrap sampling
df_final=df_final.sample(n=1500, replace=True, random_state=42)


In [11]:
#export the clean data for analysis
clean_data=df_final.to_csv('clean_data.csv', index=False)

In [12]:
df_final.describe()

Unnamed: 0,Cycle_length,Mood
count,1500.0,1500.0
mean,29.263148,5.526013
std,6.24007,2.613084
min,21.0,1.0
25%,28.0,4.0
50%,28.0,5.0
75%,29.5,8.0
max,80.0,10.0
