In [43]:
import pandas as pd # Pandas is an open source library providing high-performance, easy-to-use data structures and data analysis tools for Python
import json # JSON encoder and decoder for Python
import re
from functools import reduce
import numpy as np
from math import isnan
from anonymizeip import anonymize_ip

Note: different Response IDs may share the same IP address.


In [50]:
# Load the raw Qualtrics survey export and the Google Analytics page-visit export.
qualtrics_data = pd.read_csv("C:/Users/huixin/Dropbox/research/menoplan/beta_phase/qualtrics_data/numeric_20oct.csv")
ga_data = pd.read_csv("C:/Users/huixin/Dropbox/research/menoplan/beta_phase/ga_data/20oct_page_visited.csv")
#click_event_data = pd.read_csv("C:/Users/huixin/Dropbox/research/menoplan/beta_phase/ga_data/20oct_events.csv")

# Skip the first two data rows (presumably Qualtrics' extra header rows - TODO confirm).
# .copy() yields an independent frame, so later column assignments do not act on a
# slice of the raw read.
qualtrics_data = qualtrics_data.iloc[2:, :].copy()

# The click-event export is not loaded above, so its clean-up stays disabled as well;
# running it would raise NameError on a fresh kernel.
#click_event_data = click_event_data.drop(columns=['Event Value', 'Avg. Value'])

In [51]:
# Derive a calendar-date column from the survey end timestamp.
# assign() returns a new frame, so the column is not set in place on a frame that
# was produced by slicing the raw export.
qualtrics_data = qualtrics_data.assign(
    Date=pd.to_datetime(qualtrics_data['EndDate']).dt.date
)

In [52]:
# Normalise the GA export: give the IP column the name used as the merge key,
# label the site root as 'homepage', and trim slashes from every other page path.
ga_data = ga_data.rename(columns={'IP Address': 'IPAddress'})
is_site_root = ga_data['Page'] == '/'
# The root is relabelled before stripping; otherwise '/' would become an empty string.
ga_data['Page'] = ga_data['Page'].mask(is_site_root, 'homepage').str.strip("/")
ga_data = ga_data.drop(columns="Page Value")

In [56]:
# Keep only responses dated strictly after 23 Sept 2021.
cutoff_date = pd.to_datetime("2021-09-23").date()
mask = qualtrics_data['Date'] > cutoff_date
qualtrics_data = qualtrics_data[mask]

There are 431 unique Response IDs but only 305 unique IP addresses.
That means people are starting new Qualtrics surveys from the same device.

In [57]:
# Inner-join page visits to survey responses on the shared IP column: each GA row
# is paired with every survey response that carries the same IP address.
merged_df = ga_data.merge(qualtrics_data, how='inner', on='IPAddress')
merged_df.columns


Index(['Page', 'IPAddress', 'Pageviews', 'Unique Pageviews',
       'Avg. Time on Page', 'Entrances', 'Bounce Rate', '% Exit', 'StartDate',
       'EndDate', 'Status', 'Progress', 'Duration (in seconds)', 'Finished',
       'RecordedDate', 'ResponseId', 'RecipientLastName', 'RecipientFirstName',
       'RecipientEmail', 'ExternalReference', 'LocationLatitude',
       'LocationLongitude', 'DistributionChannel', 'UserLanguage',
       'Q_RecaptchaScore', 'Q2', 'Q30_First Click', 'Q30_Last Click',
       'Q30_Page Submit', 'Q30_Click Count', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8',
       'Q9', 'Q10', 'Q11', 'Q12', 'Q13_1', 'Q13_2', 'Q13_3', 'Q13_4', 'Q13_5',
       'Q13_6', 'Q14_1', 'Q14_2', 'Q14_3', 'Q14_4', 'Q14_5', 'Q14_6', 'Q15',
       'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25',
       'Q27', 'Q28', 'Date'],
      dtype='object')

There are 640 unique Response IDs but 415 unique IP addresses. What could lead to this discrepancy?
In merged_df there are multiple rows for each unique IP because each row corresponds to a page visited.

In [74]:
# Distinct survey responses vs. distinct IP addresses in the joined frame.
n_unique_responses = merged_df['ResponseId'].nunique()
n_unique_ips = merged_df['IPAddress'].nunique()
print(n_unique_responses, n_unique_ips)

640 415


In [59]:
# Round-trip the joined frame through a CSV in the working directory.
# index=False keeps the row index out of the file; with the default, read_csv
# would bring it back as a spurious 'Unnamed: 0' column.
# After the reload every column has whatever dtype read_csv infers from the text.
merged_df.to_csv("merged_ga_qualtrics.csv", index=False)
merged_df = pd.read_csv("merged_ga_qualtrics.csv")

In [60]:
# Total page views for each (response, page) pair, keeping the first StartDate seen.
page_aggregations = {'Pageviews': 'sum', 'StartDate': 'first'}
viewsPerPageById = (
    merged_df
    .groupby(['ResponseId', 'Page'], as_index=False)
    .agg(page_aggregations)
)

# NOTE(review): the two steps below act on the column labels, not on the values in
# 'Page'. With the labels produced by the aggregation above they appear to change
# nothing; they look like leftovers from a layout with one column per page -
# confirm whether a pivot step is missing.
stripped_labels = viewsPerPageById.columns.str.strip('/?=')
viewsPerPageById.columns = stripped_labels
viewsPerPageById = viewsPerPageById.rename(columns={"": "homepage"})

In [61]:
# Find column labels that start with digits (optionally with one of '.', ',' or '/'
# before the final digit run) and drop those columns; the author treats such
# labels as dates.
ls = list(viewsPerPageById.columns)
# Raw string: '\d' in a plain literal is an invalid escape sequence and triggers a
# warning on recent Python versions. The compiled pattern itself is unchanged.
r = re.compile(r'\d*([.,\/]?\d+)')
# r.match anchors at the start of the label only, so a digit prefix is enough.
newlist = [label for label in ls if r.match(label)]
viewsPerPageById = viewsPerPageById.drop(columns=newlist)  # we do not want the dates

In [62]:
def _result_page_columns(page_prefix):
    """Return the column labels that contain ``page_prefix`` and are longer than it.

    The length test excludes a label that is exactly the bare page path, leaving
    only labels with extra text around it.
    """
    min_length = len(page_prefix)
    return [
        label
        for label in viewsPerPageById.columns
        if page_prefix in label and len(label) > min_length
    ]


# The prefix lengths are 28, 23 and 26 characters respectively.
insomnia_cols = _result_page_columns('test/insomnia-severity-index')
anxiety_cols = _result_page_columns('test/anxiety-assessment')
depression_cols = _result_page_columns('test/depression-assessment')

In [63]:
# For each assessment, add one column holding the row-wise sum of its source columns.
summed_result_columns = (
    ('insomnia-severity-index-results', insomnia_cols),
    ('anxiety-assessment-results', anxiety_cols),
    ('depression-assessment-results', depression_cols),
)
for result_label, source_cols in summed_result_columns:
    viewsPerPageById[result_label] = viewsPerPageById[source_cols].sum(axis=1)

In [64]:
# Remove the per-assessment source columns, one group at a time.
for source_cols in (insomnia_cols, anxiety_cols, depression_cols):
    viewsPerPageById = viewsPerPageById.drop(columns=source_cols)

In [65]:
# Attach to every row the comma-separated list of all pages recorded for its
# (ResponseId, EndDate) pair.
merged_df['AllViewedPages'] = (
    merged_df
    .groupby(['ResponseId', 'EndDate'])['Page']
    .transform(lambda pages: ','.join(pages))
)

# No rows are dropped here: the per-response page-view totals computed from this
# frame need one row per visited page.
merged_df = merged_df.reset_index(drop=True)

In [66]:
# Per response (together with its page list), total the raw and unique page views.
view_totals = {'Pageviews': 'sum', 'Unique Pageviews': "sum"}
viewedPagesSummary = (
    merged_df
    .groupby(['ResponseId', 'AllViewedPages'], as_index=False)
    .agg(view_totals)
)

viewedPagesSummary has ResponseId, all viewed pages, page views and unique page views.

In [67]:
# Frames to combine, in join order; each one carries a 'ResponseId' column.
data_frames = [viewedPagesSummary, viewsPerPageById, qualtrics_data]

# Fold the list left to right with inner joins on ResponseId.
# Column names shared by two frames get pandas' default _x / _y suffixes.
df_merged = data_frames[0]
for next_frame in data_frames[1:]:
    df_merged = pd.merge(df_merged, next_frame, on=['ResponseId'], how='inner')

In [69]:
# Count how many rows share each IP address, store that count on every row,
# then order the frame by that count and by IP address.
rows_per_ip = df_merged['IPAddress'].value_counts()
df_merged['IPFreq'] = df_merged['IPAddress'].map(rows_per_ip)
df_merged = df_merged.sort_values(by=['IPFreq', 'IPAddress'])

In [70]:
# df_saved is another name for the same frame (no copy is made).
df_saved = df_merged

# Write a version of the data with the IPAddress column removed.
# NOTE(review): only IPAddress is removed here; check whether other identifying
# columns should also be left out of this export.
df_saved_no_ip = df_saved.drop(columns=['IPAddress'])
df_saved_no_ip.to_csv(
    "C:/Users/huixin/Dropbox/research/menoplan/beta_phase/user_data_20oct_no_ip.csv"
)

In [71]:
# Write the full frame, IPAddress column included, next to the no-IP export.
df_saved.to_csv(
    "C:/Users/huixin/Dropbox/research/menoplan/beta_phase/user_data_20oct.csv"
)

dfByUser is a df with a single user per row. AllViewedPages is a list of all pages a user has visited. This df also contains unique page views and total page views. However, this df is not useful because it is merged with Qualtrics data that has NOT been preprocessed by Leslie.


In [72]:
# Reload the full export (the one that still has the IPAddress column) from disk.
dfByUser = pd.read_csv(
    "C:/Users/huixin/Dropbox/research/menoplan/beta_phase/user_data_20oct.csv"
)

In [73]:
# Keep only the columns that hold at least one non-missing value.
# Author's note: the columns that were entirely NA were 'RecipientLastName',
# 'RecipientFirstName', 'RecipientEmail' and 'ExternalReference'.
# To list them again: dfByUser.columns[dfByUser.isnull().all(0)]
df_dropNaCols = dfByUser.dropna(axis='columns', how='all')