In [12]:
import pandas as pd
import numpy as np
from numpy import nan
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
import seaborn as sns

In [471]:
merged_image = pd.read_csv('./data/merged_image_match_facial_recognition.csv')
matched = merged_image[merged_image['image_match']>0.4]
merged_df = pd.read_csv('./data/merged.csv')
merged_df['bd'] = merged_df['bd'].astype(str).str[:4] #just taking the year
#looking at yelp and tinder merged dataframe after cross matching

Wanted to look into l-diversity based on our merged table, which is the table we are working with, which is a combination of 2 public tables to see how secure the table is after combining all the information based on what we know. I am looking at user_name as it is one of the more specific quasi-identifiers that are used and since names are a very helpful clue for connecting social media platforms.

In [144]:
merged_df["user_name"].value_counts()

Kevin L.      240
Chris M.      221
Michael U.    168
Chris L.      136
John D.       135
             ... 
Patrick B.      1
Shannon Z.      1
Shannon B.      1
Shannon D.      1
Elena C.        1
Name: user_name, Length: 2159, dtype: int64

In [145]:
name = merged_df['name'].value_counts()
single = name[name==1]
single

Lorena        1
Kevan         1
Brooks        1
Arun          1
Buck          1
             ..
Maximilian    1
Kegan         1
Cherry        1
Mady          1
Rin           1
Name: name, Length: 69, dtype: int64

As you can see here, there are a lot of Kevins, Chris', Michaels, etc. but there are still a lot of very specific names like Elena C. and Shannon D.. With these, this means that the equivalence classes for names leads to a k-anonymity score of only 1 for the whole table since that is the only number that satisfies all of the equivalence classes.This makes it very easy for anyone to point out who these people are in the dataset and know the urls to their Yelp profile, which can then reveal a lot of information about their lifestyle and the places they frequent.

I also looked at different combinations of equivalence classes, and the minimum k was still 1 for many. This means that there are no other people that could mask their identitity, and if someone looks up the name Rin and Kegan, they will be identified and and known within the dataset.

So let's say for now that the user_first name is the sensitive attribute even though the url to their Yelp page is what is going to give away the most information. Since those will all be unique values, I wanted to explore the names since there are 69 different unique names we have.

In [368]:
qID = {"bd", "gender", "user_first_name"}
equivalence_classes = merged_df[qID].drop_duplicates()
k_list = merged_df.groupby(["bd", "gender", "user_first_name"], dropna = False).size().reset_index(name='k')
ec_sorted = k_list.sort_values(["bd", "gender", "user_first_name"], ascending = [True, True, True])
k = min(ec_sorted["k"])
k

1

In [341]:
bd_l = []
for i in merged_df["bd"].drop_duplicates():
    val = merged_df[merged_df["bd"] == str(i)] #get all the rows with that year
    val_per_type = val[{"bd", "user_first_name"}].value_counts()
    bd_l.append(len(val_per_type))
    p = val_per_type/14888
    bd_entropy = 10**((-sum(val_per_type * np.log10(p)))/14888)
bd_distinct = min(bd_l) #wanted to see what the min was with more than 1 quasi-identifierdiversity 
print(bd_distinct, bd_entropy) 

3 1.077942907891244


quasi-identifier birth year (bd) satisfy 3 Distinct l-diversity and is 1.0779 entropy diverse

In [336]:
gender_l = []
for i in merged_df["gender"].drop_duplicates(): #go through each ec
    val = merged_df[merged_df["gender"] == str(i)] #get all the rows with that year
    val_per_type = val[{"gender", "user_first_name"}].value_counts()
    gender_l.append(len(val_per_type))
    p = val_per_type/14888
    gender_entropy = 10**((-sum(val_per_type * np.log10(p)))/14888)
gender_distinct = min(gender_l) #wanted to see what the min was with more than 1 quasi-identifierdiversity 
print(gender_distinct, gender_entropy)


38 1.2391340474474293


quasi-identifier gender satisfy 38 Distinct l-diversity and is 1.2391 entropy diverse

In [477]:
gender_l = []
mm = merged_df[["bd","gender"]].drop_duplicates()
for i in np.arange(len(mm)): #go through each ec
    gen = mm.iloc[i, :][1]
    b = mm.iloc[i, :][0]
    val = merged_df[(merged_df["gender"] == gen) & (merged_df["bd"] == b)] #get all the rows
    val_per_type = val[{"gender", "bd","user_first_name"}].value_counts()
    gender_l.append(len(val_per_type))
    p = val_per_type/14888
    gender_entropy = 10**((-sum(val_per_type * np.log10(p)))/14888)
gender_distinct = min(gender_l) #wanted to see what the min was with more than 1 quasi-identifierdiversity 
print(gender_distinct, gender_entropy)

1 1.0017162141298677


quasi-identifier gender satisfy 1 Distinct l-diversity and is 1.0017 entropy diverse

In [467]:
#trying it out with birthday year and city
gender_l = []
mmm = merged_df[["bd","city"]].drop_duplicates()
mmm['city'].fillna('Unknown')
for i in np.arange(len(mmm)): #go through each ec
    city = mmm.iloc[i, :][1]
    b = mmm.iloc[i, :][0]
    merged_df['city'] = merged_df['city'].fillna('Unknown')
    val = merged_df[(merged_df["city"] == city) & (merged_df["bd"] == b)] #get all the rows
    val_per_type = val[{"city", "bd","user_first_name"}].value_counts()
    gender_l.append(len(val_per_type))
    p = val_per_type/14888
    gender_entropy = 10**((-sum(val_per_type * np.log10(p)))/14888)
gender_distinct = min(gender_l) #wanted to see what the min was with more than 1 quasi-identifierdiversity 
print(gender_distinct, gender_entropy)

1 1.0011983482384246


Satisfies 1 Distinct l-diversity and 1.0012 entropy diverse

Seeing the numbers here, having more than 2 quasi-identifiers leads to an l-diversity distinct score of 1, which isn't too great and also a low entropy diverse score. This means that the user_name can be hurtful for those with more unique names as people can more accurately identify them on both Atinder and Yelp and thus lead a potential stalker to inofrmation that can reveal information on lifestyle.

Now looking into l-diveristy and t-closeness, we wanted to explore delta-presence to 

In [213]:
yelp_identities = pd.read_csv('./data/reviews_with_first_name.csv')
tinder_identities = pd.read_csv('./data/tinder_users_no_dup.csv')

In [472]:
quasi_identifiers = {"gender", "city", "school"}
matched = matched.rename(columns={"4": "gender", "5": "city", "6": "distance", "7": "company", "8": "job_title", "9": "school"})
identities = matched[{"gender", "city", "school", "user_name"}]
identities

Unnamed: 0,gender,city,school,user_name
190,not displayed,Mountain View,,Ryan M.
203,not displayed,Mountain View,,Ryan H.
212,not displayed,Mountain View,,Ryan G.
236,not displayed,Berkeley,"University of California, Berkeley",Daniel D.
242,not displayed,Berkeley,"University of California, Berkeley",Daniel X.
335,male,,"University of California, Berkeley",Matthew T.
350,male,,"University of California, Berkeley",Matthew M.
376,not displayed,,San Francisco State University,Levi S.


Looking at this, we can see that just looking at the 8 different identities that we were able to successfully match based on multiple factors such as city and facial expression. We can see that if we just take into accoun thte quasi-identifiers that only Levi is the only person that doesn't have the same exact quasi-identifiers as another person. However, this is a small list anyways, but considering this, there are a lot of overlap for the other individuals, which can help mask them more in terms of l-diversity.

The number isn't too big though, so adding more quasi-identifiers that reveal more information can still be good for an attacker as they only have to elimate 2 out of 3 entries for the Ryans in Mountain View and 1 out of 2 for the Daniels and Matthews.

In [473]:
iden = matched[{"gender", "city", "school"}].drop_duplicates()

In [468]:
pqID = merged_df[{ "gender", "city", "school"}]
iqID = iden.iloc[0, :]
left, right = pqID.align(iqID, axis=1, copy=False)
check = left == right
check[(check["city"] == True) & (check["gender"] == True) & (check["school"] == True)]
prob_of_identifying = 0

In [476]:
pqID = merged_df[{ "gender", "city", "school"}]
iqID = iden.iloc[1, :]
left, right = pqID.align(iqID, axis=1, copy=False)
check = left == right
check[(check["city"] == True) & (check["gender"] == True) & (check["school"] == True)]
prob_of_identifying = 1/532

In [132]:
pqID = merged_df[{ "gender", "city", "school"}]
iqID = iden.iloc[2, :]
left, right = pqID.align(iqID, axis=1, copy=False)
check = left == right
check[(check["city"] == True) & (check["gender"] == True) & (check["school"] == True)]
prob_of_identifying = 0

In [133]:
pqID = merged_df[{ "gender", "city", "school"}]
iqID = iden.iloc[3, :]
left, right = pqID.align(iqID, axis=1, copy=False)
check = left == right
check[(check["city"] == True) & (check["gender"] == True) & (check["school"] == True)]
prob_of_identifying = 0

We can see here that we were unable to locate most of these in our overall merged dataset after combining the Tinder and Yelp data except for the Berkeley entries for the city and school.

Extra just to see what the numbers looked like in terms of the difference between positive reviews and if there that could arise from privacy concerns

In [266]:
def words_in_texts(words, texts):
    indicator_array = []
    for x in texts:
        boo = []
        for i in words:
            if i in x:
                boo.append(1)
            else:
                boo.append(0)        
        indicator_array.append(boo)     
    return np.array(indicator_array)

In [280]:
l = ['amazing', 'delicious', 'worth', 'satisfied', 'awful', 'negative','concern', 'angry', 'rude']
ba = words_in_texts(l, merged_df['text'])
words = pd.DataFrame({"words": l, "count": sum(ba)})
words

array([710, 870, 206,  22,  19,  40,  39,  17,  49])