In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

sns.set()

In [6]:
df = pd.read_csv('output.csv')
df["Correctness_Value"] = df["Correctness"]
df["Correctness"] = df["Correctness"].map({0: "Incorrect", 1:"Correct"})
df['Correctness'] = pd.Categorical(df['Correctness'],
                                   categories=["Correct", "Incorrect"],
                                   ordered=True)
df['Order Difference'] = pd.Categorical(df['Order Difference'],
                                   categories=[-1, 0, 1],
                                   ordered=True)
df['Question ID'] = pd.Categorical(df['Question ID'],
                                   categories=sorted(df['Question ID'].value_counts().keys()),
                                   ordered=True)
df["Given-Expected Answer Pair"] = df.apply(lambda row:(row["Answer"], row["Correct Answer"]), axis=1)
df["Given-Expected Answer Pair"] = pd.Categorical(df["Given-Expected Answer Pair"],
                                                 categories=sorted(df["Given-Expected Answer Pair"].value_counts().keys()),
                                                 ordered=True)
print(len(df["Participant ID"].value_counts().to_dict().keys()))
print(set(df["Participant ID"].value_counts().to_dict().values()))

126
{40, 41}


In [3]:
print("collection and sharing question correctness rate:", df[df["Question Type"] == 1]["Correctness_Value"].mean())
concept_groups = df["Concept Group"].value_counts().to_dict().keys()
for concept_group in concept_groups:
    print(f"{concept_group} correctness rate:", df[df["Concept Group"] == concept_group]["Correctness_Value"].mean())

collection and sharing question correctness rate: 0.3852520692249812
basic correctness rate: 0.7005291005291006
Webviews correctness rate: 0.33073929961089493
Service Providers correctness rate: 0.37209302325581395
Libraries/SDKs correctness rate: 0.6031746031746031
On-device processing and transfers correctness rate: 0.392
User Consent correctness rate: 0.11290322580645161
Share Basic correctness rate: 0.5757575757575758
Legal Request correctness rate: 0.10606060606060606
Ephemeral Processing correctness rate: 0.4
Anonymization correctness rate: 0.15873015873015872
Pseudynoymization correctness rate: 0.6666666666666666
Optional Data Collection correctness rate: 0.2857142857142857
Collect Basic correctness rate: 0.7258064516129032
Off-device Server Transfers correctness rate: 0.48333333333333334
End to End Encryption correctness rate: 0.4166666666666667


In [13]:
#calculating precision
questions = df['Question Name'].value_counts().to_dict().keys()
questionDf = df[df["Question Type"] == 1]
questionDf = questionDf.replace({"Both": "Collected, Shared"})
#number of correct collection answers
collectionDf = questionDf[questionDf['Correct Answer'].str.contains("Collected")]
correctCollection = collectionDf[collectionDf['Answer'].str.contains("Collected")]
totalCollectionCorrect = len(correctCollection.iloc[:, 0])

#number of correct shared answers
sharingDf = questionDf[questionDf['Correct Answer'].str.contains("Shared")]
correctShared = sharingDf[sharingDf['Answer'].str.contains("Shared")]
totalSharedCorrect = len(correctShared.iloc[:, 0])

allCollection = questionDf[questionDf["Answer"].str.contains("Collected")]
totalCollectionAnswered = len(allCollection.iloc[:, 0])
allShared = questionDf[questionDf["Answer"].str.contains("Shared")]
totalSharedAnswered = len(allShared.iloc[:, 0])
precisionCollection = totalCollectionCorrect / totalCollectionAnswered
precisionShared = totalSharedCorrect /totalSharedAnswered
print("precision for collection = " + str(precisionCollection))
print("precision for shared = " + str(precisionShared))

precision for collection = 0.8233034571062741
precision for shared = 0.3286264441591784


In [14]:
#calculating recall
questions = df['Question Name'].value_counts().to_dict().keys()
questionDf = df[df["Question Type"] == 1]
#number of correct collection answers
collectionDf = questionDf[questionDf['Correct Answer'].str.contains("Collected")]
correctCollection = collectionDf[collectionDf['Answer'].str.contains("Collected")]
totalCollectionCorrect = len(correctCollection.iloc[:, 0])
totalCollection = len(collectionDf.iloc[:, 0])

#number of correct shared answers
sharingDf = questionDf[questionDf['Correct Answer'].str.contains("Shared")]
correctShared = sharingDf[sharingDf['Answer'].str.contains("Shared")]
totalSharedCorrect = len(correctShared.iloc[:, 0])
totalShared = len(sharingDf.iloc[:, 0])
recallCollection = totalCollectionCorrect / totalCollection
recallShared = totalSharedCorrect / totalShared
print("recall for collection = " + str(recallCollection))
print("recall for shared = " + str(recallShared))



recall for collection = 0.3981723237597911
recall for shared = 0.3333333333333333


In [None]:
questions = df['Question Name'].value_counts().to_dict().keys()
cr_per_concept_group = {}
all_collection_sharing_cr_list = []
for question in questions:
    cr = df[df['Question Name'] == question]["Correctness_Value"].mean()
    keys = list(df[df['Question Name'] == question]["Concept Group"].value_counts().to_dict().keys())
    assert len(keys) == 1
    concept = keys[0]
    if concept == "basic":
        continue
    if not concept in cr_per_concept_group:
        cr_per_concept_group[concept] = []
    cr_per_concept_group[concept].append(cr)
    all_collection_sharing_cr_list.append(cr)

# print(cr_per_concept_group)
for concept, cr_list in cr_per_concept_group.items():
    print(f"{concept} correctness rate:", np.mean(cr_list))
print("collection and sharing question correctness rate:", np.mean(all_collection_sharing_cr_list))

In [None]:
from pymer4.models import Lmer

df_lmer = df[["Participant ID", "Prompt Condition", "Question Type", "Correctness_Value"]]
df_lmer = df_lmer.rename(columns={"Participant ID": "Participant_ID", "Prompt Condition": "Prompt_Condition", "Question Type": "Question_Type"})
df_lmer["Prompt_Condition"] = df_lmer["Prompt_Condition"].map({1:"Blank", 2:"Brief", 3:"Full"})
df_lmer["Question_Type"] = df_lmer["Question_Type"].map({1:"Collect/Share", 2:"Data Type", 3:"Data Purpose"})
# print(df_lmer)
model = Lmer("Correctness_Value ~ Prompt_Condition * Question_Type + (Question_Type|Participant_ID)",
             data=df_lmer, family = 'binomial')

print(model.fit())

In [None]:
sns.set(rc = {'figure.figsize':(6,5)})
ax = sns.pointplot(x="Question_Type", y="Correctness_Value", hue="Prompt_Condition", data=df_lmer)

In [None]:
sns.set(rc = {'figure.figsize':(6,5)})
ax = sns.pointplot(x="Question_Type", y="Correctness_Value", hue="Prompt_Condition", data=df_lmer[df_lmer["Prompt_Condition"].isin(["Blank", "Brief"]) & df_lmer["Question_Type"].isin(["Collect/Share", "Data Purpose"])])
plt.savefig("brief_purpose_interaction_plot.pdf", bbox_inches='tight')
plt.savefig("brief_purpose_interaction_plot.png", dpi=300, bbox_inches='tight')

In [None]:
sns.set(rc = {'figure.figsize':(6,5)})
ax = sns.pointplot(x="Question_Type", y="Correctness_Value", hue="Prompt_Condition", data=df_lmer[df_lmer["Prompt_Condition"].isin(["Blank", "Full"]) & df_lmer["Question_Type"].isin(["Collect/Share", "Data Type"])])
plt.savefig("full_data_type_interaction_plot.pdf", bbox_inches='tight')
plt.savefig("full_data_type_interaction_plot.png", dpi=300, bbox_inches='tight')

In [None]:
sns.set(rc = {'figure.figsize':(5,5)})
g = sns.barplot(x= "Question Type", y= "Correctness_Value", data=df)
plt.ylabel("Correct rate")
plt.savefig("question_type_correct_rate.pdf", bbox_inches='tight')
plt.savefig("question_type_correct_rate.png", dpi=300, bbox_inches='tight')

In [None]:
sns.set(rc = {'figure.figsize':(5,5)})
g = sns.barplot(x= "Prompt Condition", y= "Correctness_Value", data=df)
plt.ylabel("Correct rate")
plt.savefig("prompt_condition_correct_rate.pdf", bbox_inches='tight')
plt.savefig("prompt_condition_correct_rate.png", dpi=300, bbox_inches='tight')

In [None]:
type1 = df[df["Question Type"] == 1]
types = type1.groupby("Prompt Condition")["Correctness"].value_counts(normalize=True)
types = types.drop(labels=['Incorrect'], level = 1)
df1 = pd.DataFrame(types)
df1 = df1.rename(columns={"Correctness" : "Rate"})
g = sns.barplot(x= "Prompt Condition", y= "Rate", data=df1.reset_index())


In [None]:
type2 = df[df["Question Type"] == 2]
types = type2.groupby("Prompt Condition")["Correctness"].value_counts(normalize=True)
types = types.drop(labels=['Incorrect'], level = 1)
df1 = pd.DataFrame(types)
df1 = df1.rename(columns={"Correctness" : "Rate"})
g = sns.barplot(x= "Prompt Condition", y= "Rate", data=df1.reset_index())


In [None]:
type3 = df[df["Question Type"] == 3]
types = type3.groupby("Prompt Condition")["Correctness"].value_counts(normalize=True)
types = types.drop(labels=['Incorrect'], level = 1)
df1 = pd.DataFrame(types)
df1 = df1.rename(columns={"Correctness" : "Rate"})
g = sns.barplot(x= "Prompt Condition", y= "Rate", data=df1.reset_index())


In [None]:
df_incorrect = df[df["Order Difference"] != 0]
print(df_incorrect[df_incorrect["Question Type"] == 2]["Order Difference"].value_counts(normalize=True))
print(df_incorrect[df_incorrect["Question Type"] == 3]["Order Difference"].value_counts(normalize=True))
g = sns.FacetGrid(df_incorrect, col = "Question Type")
graph = g.map_dataframe(sns.histplot, x = "Order Difference", stat="probability")
graph.set_xlabels("Order Difference")
graph.set_ylabels("Rate")

In [None]:
df1 = pd.read_csv('output.csv')
df1["Prompt Condition"] = df1["Prompt Condition"].map({1:"Blank", 2:"Brief", 3:"Full"})

In [None]:
def plot_correct_rate_per_condition(df1, target_question_category):
    type1 = df1[df1["Question Type"] == 1]
    type1 = type1[type1["Question Category"] == target_question_category]
    print(type1.apply(lambda row:row["Question Name"] + row["Prompt Condition"], axis=1).value_counts())
    type1["Question Name"] = type1.apply(lambda row: row["Question Name"].split("Please select")[0], axis=1)
    sns.set(rc = {'figure.figsize':(5,5)})
    g = sns.barplot(y= "Question Name", x= "Correctness", hue = "Prompt Condition", data=type1, orient="h")
    prefix = target_question_category.replace(" ", "_")
    plt.savefig(f"{prefix}_question_correct_rate_per_condition.png", dpi=300, bbox_inches='tight')

In [None]:
def print_count_per_question_condition(df1, target_question_type):
    type1 = df1[df1["Question Type"] == target_question_type]
    type1_dict = type1.apply(lambda row:row["Question Name"] + row["Prompt Condition"], axis=1).value_counts().to_dict()
    print(len(type1_dict.keys()))
    print(type1_dict)

In [None]:
print_count_per_question_condition(df1, 1)

In [None]:
print_count_per_question_condition(df1, 2)

In [None]:
print_count_per_question_condition(df1, 3)

In [None]:
plot_correct_rate_per_condition(df1, "collect basic")

In [None]:
plot_correct_rate_per_condition(df1, "collect special")

In [None]:
plot_correct_rate_per_condition(df1, "collect exempt")

In [None]:
plot_correct_rate_per_condition(df1, "share basic")

In [None]:
plot_correct_rate_per_condition(df1, "share special")

In [None]:
plot_correct_rate_per_condition(df1, "share exempt")

In [None]:
type1 = df1[df1["Question Type"] == 1]
type1 = type1.rename(columns={"Correctness Value" : "Correct Rate", "Question Category": "C/S Question Category"})
sns.set(rc = {'figure.figsize':(15,8)})
g = sns.barplot(y= "C/S Question Category", x= "Correctness", hue = "Prompt Condition", data=type1, orient="h", order=["collect basic", "share basic", "collect special", "share special", "collect exempt", "share exempt"])
plt.savefig("collect_share_question_category_correct_rate.pdf", bbox_inches='tight')
plt.savefig("collect_share_question_category_correct_rate.png", dpi=300, bbox_inches='tight')

In [None]:
type2 = df1[df1["Question Type"] == 2]
sns.set(rc = {'figure.figsize':(20,30)})
g = sns.barplot(y= "Question Name", x= "Correctness", hue = "Prompt Condition", data=type2, orient='h')

In [None]:
type3 = df1[df1["Question Type"] == 3]
sns.set(rc = {'figure.figsize':(20,10)})
g = sns.barplot(y= "Question Name", x= "Correctness", hue = "Prompt Condition", data=type3, orient="h")

In [None]:
df = pd.read_csv('output.csv')
df["Correctness"] = df["Correctness"].map({0: "Incorrect", 1:"Correct"})
df['Correctness'] = pd.Categorical(df['Correctness'],
                                   categories=["Correct", "Incorrect"],
                                   ordered=True)
df['Order Difference'] = pd.Categorical(df['Order Difference'],
                                   categories=[-1, 0, 1],
                                   ordered=True)
df["Given-Expected Answer Pair"] = df.apply(lambda row:(row["Answer"], row["Correct Answer"]), axis=1)


In [None]:
g = sns.FacetGrid(df[df["Correct Answer"] == "Shared"], col = "Given-Expected Answer Pair", col_wrap=1, aspect=9.5,
                 col_order=sorted(df[df["Correct Answer"] == "Shared"]["Given-Expected Answer Pair"].value_counts().keys()))
graph = g.map_dataframe(sns.histplot, x = "Question ID", stat="count")
graph.set_xlabels("Answer")
graph.set_ylabels("Rate")
g.set_titles(col_template="{col_name}", row_template="{row_name}")

# Add a title for the entire plot
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Given-Expected Answer Pair")

In [None]:
t1 = pd.read_csv('output.csv', usecols = [4, 5, 6])
t1 = t1[t1["Question Type"] == 1]
t1 = t1.drop("Question Type", 1)
correctTotals = dict()
correctAnswers = t1.loc[:, "Correct Answer"]
actualAnswers = t1.loc[:, "Answer"]
answerPairs = dict()
for (i,j) in zip(correctAnswers, actualAnswers):
    correctTotals[i]= correctTotals.get(i, 0) + 1
    answerPairs[(i, j)] = answerPairs.get((i, j), 0) + 1
#create ratios for actual/correct answers
for (correct, actual) in answerPairs:
    answerPairs[(correct, actual)] = answerPairs[(correct, actual)] / correctTotals[correct]
rowNames = ["Collected", "Shared", "Both", "Neither"]
type1 = pd.DataFrame(columns=rowNames, index=rowNames)
for i in range(len(rowNames)):
    for j in range(len(rowNames)):
        if (rowNames[i], rowNames[j]) in answerPairs:
            type1.at[rowNames[i], rowNames[j]] = answerPairs[(rowNames[i], rowNames[j])]
        else:
            type1.at[rowNames[i], rowNames[j]] = 0
type1 = type1.astype(float)
print(type1)
sns.set(rc = {'figure.figsize':(7,5)})
g = sns.heatmap(type1, square=True)
g.set(xlabel= "Answer", ylabel = "Correct Answer")
plt.savefig("collect_sharing_heatmap.pdf", bbox_inches='tight')
plt.savefig("collect_sharing_heatmap.png", dpi=300, bbox_inches='tight')

In [None]:
t2 = pd.read_csv('output.csv', usecols = [4, 5, 6])
t2 = t2[t2["Question Type"] == 2]
t2 = t2.drop("Question Type", 1)
correctTotals = dict()
correctAnswers = t2.loc[:, "Correct Answer"]
actualAnswers = t2.loc[:, "Answer"]
answerPairs = dict()
for (i,j) in zip(correctAnswers, actualAnswers):
    correctTotals[i]= correctTotals.get(i, 0) + 1
    answerPairs[(i, j)] = answerPairs.get((i, j), 0) + 1
#create ratios for actual/correct answers
for (correct, actual) in answerPairs:
    answerPairs[(correct, actual)] = answerPairs[(correct, actual)] / correctTotals[correct]
rowNames = ["Location", "Personal Information", "Financial Information", "Health and Fitness", "Messages", "Photos or Videos", "Audio Files", "Files and docs", "Calendar", "Contacts", "App Activity", "Web Browsing", "App Information and Performance", "Device or Other Identifiers"]
type2 = pd.DataFrame(columns=rowNames, index=rowNames)
for i in range(len(rowNames)):
    for j in range(len(rowNames)):
        if (rowNames[i], rowNames[j]) in answerPairs:
            type2.at[rowNames[i], rowNames[j]] = answerPairs[(rowNames[i], rowNames[j])]
        else:
            type2.at[rowNames[i], rowNames[j]] = 0
type2 = type2.astype(float)
sns.set(rc = {'figure.figsize':(7,5)})
g = sns.heatmap(type2, square=True)
g.set(xlabel= "Answer", ylabel = "Correct Answer")
plt.savefig("data_type_heatmap.pdf", bbox_inches='tight')
plt.savefig("data_type_heatmap.png", dpi=300, bbox_inches='tight')

In [None]:
t3 = pd.read_csv('output.csv', usecols = [4, 5, 6])
t3 = t3[t3["Question Type"] == 3]
t3 = t3.drop("Question Type", 1)
correctTotals = dict()
correctAnswers = t3.loc[:, "Correct Answer"]
actualAnswers = t3.loc[:, "Answer"]
answerPairs = dict()
for (i,j) in zip(correctAnswers, actualAnswers):
    correctTotals[i]= correctTotals.get(i, 0) + 1
    answerPairs[(i, j)] = answerPairs.get((i, j), 0) + 1
#create ratios for actual/correct answers
for (correct, actual) in answerPairs:
    answerPairs[(correct, actual)] = answerPairs[(correct, actual)] / correctTotals[correct]
rowNames = ["App functionality", "Analytics", "Developer communications", "Advertising or marketing", "Fraud prevention, security, and compliance", "Personalization", "Account management"]
type3 = pd.DataFrame(columns=rowNames, index=rowNames)
for i in range(len(rowNames)):
    for j in range(len(rowNames)):
        if (rowNames[i], rowNames[j]) in answerPairs:
            type3.at[rowNames[i], rowNames[j]] = answerPairs[(rowNames[i], rowNames[j])]
        else:
            type3.at[rowNames[i], rowNames[j]] = 0
type3 = type3.astype(float)
sns.set(rc = {'figure.figsize':(7,5)})
g = sns.heatmap(type3, square=True)
g.set(xlabel= "Answer", ylabel = "Correct Answer")
plt.savefig("data_purpose_heatmap.pdf", bbox_inches='tight')
plt.savefig("data_purpose_heatmap.png", dpi=300, bbox_inches='tight')

In [None]:
t3 = pd.read_csv('output.csv', usecols = [4, 5, 6])
t3 = t3[t3["Question Type"] == 3]
t3 = t3.drop("Question Type", 1)
correctTotals = dict()
correctAnswers = t3.loc[:, "Correct Answer"]
actualAnswers = t3.loc[:, "Answer"]
answerPairs = dict()
for (i,j) in zip(correctAnswers, actualAnswers):
    correctTotals[i]= correctTotals.get(i, 0) + 1
    answerPairs[(i, j)] = answerPairs.get((i, j), 0) + 1
#create ratios for actual/correct answers
for (correct, actual) in answerPairs:
    answerPairs[(correct, actual)] = answerPairs[(correct, actual)] / correctTotals[correct]
rowNames = ["App functionality", "Analytics", "Developer communications", "Advertising or marketing", "Fraud prevention, security, and compliance", "Personalization", "Account management"]
type3 = pd.DataFrame(columns=rowNames, index=rowNames)
for i in range(len(rowNames)):
    for j in range(len(rowNames)):
        if (rowNames[i], rowNames[j]) in answerPairs:
            type3.at[rowNames[i], rowNames[j]] = answerPairs[(rowNames[i], rowNames[j])]
        else:
            type3.at[rowNames[i], rowNames[j]] = 0
type3 = type3.astype(float)
sns.set(rc = {'figure.figsize':(7,5)})
g = sns.heatmap(type3, square=True)
g.set(xlabel= "Answer", ylabel = "Correct Answer")
plt.savefig("data_purpose_heatmap.pdf", bbox_inches='tight')
plt.savefig("data_purpose_heatmap.png", dpi=300, bbox_inches='tight')