In [1]:
import re
from datetime import datetime
from datetime import timedelta
from statistics import mean

import numpy as np
import pandas as pd
import seaborn as sns

### Remove null rows ###

In [2]:
def remove_null_rows(
        data_frame: pd.DataFrame,
        email_col: str) -> pd.DataFrame:
    """Drop every row whose ``email_col`` entry is missing.

    Args:
        data_frame: Dataframe to filter.
        email_col: Name of the column that must be non-null for a row to be kept.

    Returns:
        A new dataframe holding only the rows with a value in ``email_col``,
        re-indexed from 0. ``data_frame`` itself is left untouched.
    """
    rows_with_email = data_frame.dropna(subset=email_col)
    return rows_with_email.reset_index(drop=True)

### Remove test rows ### 

In [3]:
def remove_test_rows(
        data_frame: pd.DataFrame,
        email_col: str) -> pd.DataFrame:
    """Remove test participants from the dataframe.

    A row is kept only when the first four characters of its email (or the
    whole value, when it is shorter than four characters) consist solely of
    digits. Every other row is treated as a test row and removed.

    Args:
        data_frame: Dataframe to filter.
        email_col: Name of the column holding the email strings.

    Returns:
        A new dataframe with the test rows removed, re-indexed from 0.
        Rows whose email is missing or is not a string are removed as well,
        since they cannot start with digits.
    """
    email_prefix = data_frame[email_col].str[:4]

    # .str.isdigit() yields a missing value (not False) for missing or
    # non-string emails, and a mask containing missing values cannot be used
    # for boolean indexing. Comparing against True and filling what is left
    # turns every such entry into a plain False.
    starts_with_digits = (
        email_prefix.str.isdigit()
        .eq(True)
        .fillna(False)
        .astype(bool)
    )

    return data_frame[starts_with_digits].reset_index(drop=True)

### Separate data by timepoint 

In [4]:
def separate_time_points(
        data_frame: pd.DataFrame,
        email_col: str,
        time_point: str) -> pd.DataFrame:
    """Return the rows of ``data_frame`` that belong to one time point.

    The time point is read from the email column: rows whose email contains
    "6mo" are 6-month rows, rows whose email contains "12mo" are 12-month
    rows, and rows whose email contains neither marker are baseline rows.

    Args:
        data_frame: Dataframe to filter.
        email_col: Name of the column holding the email strings.
        time_point: One of "baseline", "6 month" or "12 month".

    Returns:
        A new dataframe with the matching rows, re-indexed from 0. Rows whose
        email is missing are never returned, because they cannot be assigned
        to any time point.

    Raises:
        ValueError: If ``time_point`` is not one of the accepted values.
    """
    follow_up_markers = {"6 month": "6mo", "12 month": "12mo"}
    valid_time_points = ["baseline", *follow_up_markers]
    if time_point not in valid_time_points:
        raise ValueError("time_point must be one of %r." % valid_time_points)

    emails = data_frame[email_col]
    if time_point == "baseline":
        # na=True counts a missing email as "carries a marker", so the
        # negation below excludes it instead of failing on a missing value.
        any_marker = "|".join(follow_up_markers.values())
        keep = ~emails.str.contains(any_marker, na=True)
    else:
        # na=False: a missing email cannot carry the follow-up marker.
        keep = emails.str.contains(follow_up_markers[time_point], na=False)

    return data_frame[keep].reset_index(drop=True)

### Remove participants

In [5]:
# IDs of the dropped participants; used when the caller supplies no list.
_DROPPED_PARTICIPANT_IDS = (
    "6089", "6093", "6096", "6097", "6099", "6100", "6102", "6104", "6105",
    "6110", "6111", "6112", "6113", "6114", "6115", "6116", "6117", "6118",
    "6120", "6127", "6128", "6139", "6154", "6166", "6172", "6176", "6178",
)


def remove_participants(
        data_frame: pd.DataFrame,
        id_col: str,
        dropped_ids=None) -> pd.DataFrame:
    """Remove dropped participants.

    A row is removed when the value in ``id_col`` contains any of the dropped
    IDs as a substring, so an ID embedded in a longer string still matches.

    Args:
        data_frame: Dataframe to filter.
        id_col: Name of the column holding the participant identifier strings.
        dropped_ids: Optional collection of IDs to remove (a single string is
            treated as one ID). The IDs are matched literally, not as regular
            expressions. Defaults to the built-in list of dropped participants.

    Returns:
        A new dataframe without the dropped participants, re-indexed from 0.
        Rows whose ID is missing are removed as well.
    """
    if dropped_ids is None:
        dropped_ids = _DROPPED_PARTICIPANT_IDS
    elif isinstance(dropped_ids, str):
        # A bare string would otherwise be iterated character by character.
        dropped_ids = (dropped_ids,)

    # Empty IDs are skipped: an empty alternative would match every row.
    literals = [re.escape(str(pid)) for pid in dropped_ids if str(pid)]

    ids = data_frame[id_col]
    if literals:
        # na=True flags rows with a missing ID for removal too.
        is_dropped = ids.str.contains("|".join(literals), regex=True, na=True)
    else:
        # Nothing to match against: only rows without an ID are removed.
        is_dropped = ids.isna()

    return data_frame[~is_dropped].reset_index(drop=True)

### Main cleaning

In [6]:
def main_cleaning(
        data_frame: pd.DataFrame,
        time_point: str,
        id_col: str,
        email_col: str) -> pd.DataFrame:
    """Run the full cleaning pipeline on a dataframe.

    The steps are applied in this order, each one feeding the next:
    rows with a null email are removed, test rows are removed, the rows of
    the requested time point are selected, and dropped participants are
    removed.

    Args:
        data_frame: Dataframe to clean.
        time_point: Time point to keep; forwarded to ``separate_time_points``.
        id_col: Name of the participant ID column; forwarded to
            ``remove_participants``.
        email_col: Name of the email column used by the first three steps.

    Returns:
        The dataframe produced by the last step of the pipeline.
    """
    return (
        data_frame
        .pipe(remove_null_rows, email_col=email_col)
        .pipe(remove_test_rows, email_col=email_col)
        .pipe(separate_time_points, email_col=email_col, time_point=time_point)
        .pipe(remove_participants, id_col=id_col)
    )