In [1]:
# Import packages 

import pandas as pd 
import numpy as np
import statistics
import seaborn as sns

In [2]:
# Import data and create copies

# Single place to repoint the notebook at the survey exports.
# NOTE(review): this is a machine-specific absolute path; consider making it
# relative to the project before sharing the notebook.
DATA_DIR = "C:/Users/denni/OneDrive/Desktop/irp_data"

new_use = pd.read_csv(f"{DATA_DIR}/new_use_feb_3_original.csv", low_memory=False)
follow_up = pd.read_csv(f"{DATA_DIR}/follow_up_feb_3.csv", low_memory=False)

# Work on copies so the frames exactly as read from disk stay available.
nu = new_use.copy()
fu = follow_up.copy()

In [3]:
# Columns kept from the new-use frame.
# The "&nbsp;" fragments are reproduced verbatim because these strings are
# used as column labels when subsetting the frame.

nu_col = [
    # Respondent and survey identification
    "Email",
    "Survey Name",
    # Survey timing
    "Survey Started Date",
    "Survey Started Time",
    "Survey Submitted Date",
    "Survey Submitted Time",
    "Trigger Date",
    "Trigger Time",
    # Planned use
    "Do you PLAN to use cannabis today?",
    "Do you PLAN to drink alcohol today?",
    # Current state
    "How high are you right now?",
    "How intoxicated are you&nbsp;right now?",
    # Use in the past 2 hours: location, substance, company
    "Where have you used cannabis&nbsp;in the past 2 hours?&nbsp;_Home",
    "Where have you used alcohol&nbsp;in the past 2 hours?_Home",
    "Which of the following have you used within the past 2 hours?_Cannabis",
    "Which of the following have you used within the past 2 hours?_Alcohol",
    "Who have you been with when using alcohol&nbsp;in the past 2 hours?&nbsp;_Alone",
    "Who have you been with when using cannabis&nbsp;in the past 2 hours?_Alone",
]

In [4]:
# Creating identical columns

# Add these question columns to the follow-up frame so it exposes the same
# labels as the new-use frame (presumably they are absent from the follow-up
# export -- any existing column with the same label is overwritten).
# They are filled with NaN: `pd.Series(int)` is a one-element Series holding
# the `int` type object, so assigning it would put `<class 'int'>` in the row
# labelled 0 instead of a missing value.
for placeholder_col in [
    'Do you PLAN to use cannabis today?',
    'Do you PLAN to drink alcohol today?',
    'How high are you right now?',
    'How intoxicated are you&nbsp;right now?',
    'Where have you used cannabis&nbsp;in the past 2 hours?&nbsp;_Home',
    'Which of the following have you used within the past 2 hours?_Cannabis',
    'Where have you used alcohol&nbsp;in the past 2 hours?_Home',
    'Which of the following have you used within the past 2 hours?_Alcohol',
    'Who have you been with when using alcohol&nbsp;in the past 2 hours?&nbsp;_Alone',
    'Who have you been with when using cannabis&nbsp;in the past 2 hours?_Alone',
]:
    fu[placeholder_col] = np.nan

In [5]:
# Selecting columns of interest

# The follow-up frame is subset to exactly the same labels, in the same order,
# as the new-use frame. Deriving the list from `nu_col` (as an independent
# copy) keeps the two selections from drifting apart.
fu_col = list(nu_col)

In [6]:
# Creating subsets of the data

# Keep only the selected columns, in the order given by each column list.
nu = nu[nu_col]
fu = fu[fu_col]

In [7]:
# Preprocessing the dataset

## Stack the new-use and follow-up rows into one frame, drop rows that have no
## email, and renumber the remaining rows 0..n-1

data = (
    pd.concat([nu, fu], ignore_index=True)
    .dropna(subset=["Email"])
    .reset_index(drop=True)
)


## Derive the `id` column from the first four characters of the email

data = data.assign(id=data["Email"].str[:4])

In [8]:
# Functions

def time_point_sep(data, time_point):
    """
    Select the rows of ``data`` that belong to one time point.

    Rows are assigned to a time point from the text of the "Email" column:
    "6_month" keeps emails containing "6mo", "12_month" keeps emails
    containing "12mo", and "baseline" keeps emails containing neither.
    A boolean ``test_case`` column is then added (True when the first four
    characters of the email are all numeric) and only rows where it is True
    are kept. All three time points apply the same filter and return a frame
    with a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an "Email" column of strings. Missing emails are
        treated as matching no time-point marker and are dropped by the
        ``test_case`` filter.
    time_point : str
        One of "baseline", "6_month" or "12_month".

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with the added ``test_case``
        column.

    Raises
    ------
    ValueError
        If ``time_point`` is not one of the accepted values.
    """
    valid = {"baseline", "6_month", "12_month"}
    if time_point not in valid:
        raise ValueError("time_point must be one of %r." % valid)

    emails = data["Email"]
    # na=False keeps the mask strictly boolean even if an email is missing.
    if time_point == "baseline":
        in_time_point = ~emails.str.contains("6mo|12mo", na=False)
    elif time_point == "6_month":
        in_time_point = emails.str.contains("6mo", na=False)
    else:
        in_time_point = emails.str.contains("12mo", na=False)

    selected = data[in_time_point]
    selected = selected.assign(
        test_case=selected["Email"].str[:4].str.isnumeric()
    )
    # `== True` (rather than truthiness) also rejects missing values.
    return selected[selected["test_case"] == True].reset_index(drop=True)
    
def date_clean(data, survey_start, survey_submit, trigger):
    """
    Build datetime columns from the date/time text columns and sort the rows.

    Each "<X> Date" / "<X> Time" pair is joined with a space and parsed with
    the format "%d/%m/%Y %H:%M:%S" (day first).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "id" plus the "Survey Started", "Survey Submitted" and
        "Trigger" Date/Time text columns. The three datetime columns are
        added to this frame in place.
    survey_start : str
        Name of the column to create from "Survey Started Date"/"Time".
    survey_submit : str
        Name of the column to create from "Survey Submitted Date"/"Time";
        also used as the secondary sort key.
    trigger : str
        Name of the column to create from "Trigger Date"/"Time".

    Returns
    -------
    pandas.DataFrame
        New frame sorted by "id" and then by the ``survey_submit`` column,
        with a fresh 0..n-1 index.
    """
    stamp_format = "%d/%m/%Y %H:%M:%S"
    column_sources = [
        (survey_start, "Survey Started Date", "Survey Started Time"),
        (survey_submit, "Survey Submitted Date", "Survey Submitted Time"),
        (trigger, "Trigger Date", "Trigger Time"),
    ]
    for target, date_col, time_col in column_sources:
        data[target] = pd.to_datetime(data[date_col] + " " + data[time_col],
                                      format=stamp_format)

    # Sort on the column named by the caller, not on a fixed "survey_submit"
    # label, so any choice of output column name works.
    return data.sort_values(by=["id", survey_submit]).reset_index(drop=True)


def add_session(data, var_name = "session"):
    """
    Number the sessions within each block of consecutive rows sharing an id.

    Walking the rows in their current order, the counter restarts whenever
    the value in "id" changes from one row to the next, and increases by one
    on every row whose "Survey Name" is "New Use Survey". Every row receives
    the counter value after that update, so a New Use Survey and the rows
    that follow it (until the next New Use Survey or id change) share a
    number. Rows that precede the first New Use Survey of their id get 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "id" and "Survey Name". The session column is added to
        this frame in place. Rows are expected to be grouped by id already.
    var_name : str, default "session"
        Name of the column that receives the session number.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the integer column ``var_name`` added.
    """
    opens_session = data["Survey Name"].eq("New Use Survey")

    # Label each run of consecutive identical ids; a new run starts wherever
    # the id differs from the row above. Using arrays keeps this positional,
    # so it does not depend on the frame having a 0..n-1 index.
    ids = data["id"]
    run_label = ids.ne(ids.shift()).cumsum().to_numpy()

    # Within a run, the session number is the count of New Use Surveys seen
    # so far, including the current row.
    session_number = opens_session.astype(int).groupby(run_label).cumsum()
    data[var_name] = session_number.to_numpy()

    return data

def compliance_1fu(data, var_name = "comp_1fu", window_seconds = 10800):
    """
    Flag each New Use Survey that has at least one timely follow-up.

    Rows are read in their current order. Every row whose "Survey Name" is
    "New Use Survey" starts at 0 and becomes the reference for the rows that
    follow it. A later non-New-Use row counts towards that reference only if
    it has the same "session" value (and the same "id", when the frame has
    an "id" column) and was submitted less than ``window_seconds`` after the
    reference row's "survey_submit". One such row is enough to set the
    reference row's flag to 1.

    Rows are expected to be ordered so that follow-ups come after the New Use
    Survey they belong to (e.g. sorted by id and submission time).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Survey Name", "session" and a datetime "survey_submit"
        column. The result column is added to this frame in place.
    var_name : str, default "comp_1fu"
        Name of the column that receives the flag.
    window_seconds : int or float, default 10800
        Maximum time, in seconds, between the New Use Survey submission and
        the follow-up submission (10800 s = 3 hours).
        NOTE(review): an earlier description of this function said "within
        two hours" while the code used 10800 seconds; the 10800 default is
        kept -- confirm which window is intended.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``var_name`` added: 1.0 or 0.0 on New Use Survey
        rows, NaN on every other row.
    """
    n_rows = len(data)
    is_new_use = (data["Survey Name"] == "New Use Survey").tolist()
    sessions = data["session"].tolist()
    submitted = data["survey_submit"].tolist()
    # Session numbers restart for every id, so the id is needed to tell apart
    # adjacent participants that share a session number.
    ids = data["id"].tolist() if "id" in data.columns else [None] * n_rows

    flags = [float("nan")] * n_rows
    anchor = None  # position of the most recent New Use Survey row

    for pos in range(n_rows):
        if is_new_use[pos]:
            anchor = pos
            flags[pos] = 0.0
            continue

        # Skip follow-ups that have no New Use Survey to attach to.
        if anchor is None:
            continue
        if ids[pos] != ids[anchor] or sessions[pos] != sessions[anchor]:
            continue

        elapsed = (submitted[pos] - submitted[anchor]).total_seconds()
        if elapsed < window_seconds:
            flags[anchor] = 1.0

    # Assigning a plain list is positional, so any index is supported.
    data[var_name] = flags
    return data

def column_gwc(data, column):
    """
    Add a grand-mean centered copy of a column.

    The mean of ``data[column]`` over all rows is subtracted from every value
    and the result is stored in a new column named ``column + "_gwc"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; the centered column is added in place.
    column : str
        Name of the numeric column to center.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the ``<column>_gwc`` column added. Missing
        values stay missing.
    """
    centered_name = column + "_gwc"

    # One vectorized subtraction: works for any index and keeps a numeric
    # dtype instead of filling an object column row by row.
    data[centered_name] = data[column] - data[column].mean()

    return data

def column_wth_var(data, var, between_var):
    """
    Add the within-person deviation of a column.

    Subtracts ``data[between_var]`` from ``data[var]`` row by row and stores
    the difference in a column named "wth_var".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns; it is not modified.
    var : str
        Name of the column to take the deviation of.
    between_var : str
        Name of the between-person column that is subtracted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the "wth_var" column added.
    """
    deviation = data[var] - data[between_var]
    return data.assign(wth_var=deviation)