# Finding co-occurrence


### reading the data from file

* The data is in a tsv (tab-separated values) file. 
* The first line is the header naming the columns (one column per person).
* The first column is the label of the day.
* Each cell contains an `X` if that person did not stay at a hotel that day; otherwise it contains the id of the hotel where they stayed (H1, H2, H3, H4 or H5).
| Person      	| P1  	| P2  | P3  	| `...`  	|
|-------------	|:-----:|:-----:|:-----:|-----:|
| Hotel Day 1 	| H1  	| H3  	| X   	| `...`  	|
| Hotel Day 2 	| X   	| H1  	| H5  	| `...`  	|
| `...`       	| `...` 	| `...`  	| `...`  	| `...` |




### Python Application

* The goal is to introduce Python in the context of an application
* We will start with a verbose implementation to explore the language 
  * Some constructs are not efficient, but they are useful for abstracting away complexity
* We will define a more compact version at the end


In [43]:
# Print the raw file line by line. Each line still ends with its own
# newline, so print() leaves a blank line between records.
# The `with` block closes the file as soon as the loop is done.
with open("data/hotel_data.tsv") as tsv_file:
    for line in tsv_file:
        print(line)

Person 	P1	P2	P3	P4	P5	P6	P7	P8	P9	P10

Hotel day 1	H1	H4	X	H2	X	X	H2	X	H5 	H1

Hotel day 2	X	X	X	X	H1	X	X	X	X	X

Hotel day 3	X	X	X	X	X	X	X	H4	X	X

Hotel day 4	X	X	X	X	X	X	X	X	X	H3

Hotel day 5	X	X	H1	X	X	H1	X	X	X	X

Hotel day 6	X	X	X	X	X	H4	X	X	X	X

Hotel day 7	X	X	X	X	X	X	X	X	X	X

Hotel day 8	X	H3	X	X	X	X	X	H5	X	X

Hotel day 9	X	X	X	X	X	X	X	X	X	X

Hotel day 10	X	X	X	X	X	X	X	X	X	X

Hotel day 11	X	X	X	X	X	X	H2	X	X	X

Hotel day 12	H4	X	X	X	X	X	X	X	X	X

Hotel day 13	X	X	X	X	X	X	X	X	X	X

Hotel day 14	X	X	X	H2	X	X	X	X	X	X

Hotel day 15	X	X	X	X	X	X	X	X	X	X

Hotel day 16	X	X	X	X	X	X	X	X	X	X

Hotel day 17	X	X	X	X	X	X	X	X	H3	X

Hotel day 18	X	X	X	X	H3	X	X	X	X	X

Hotel day 19	X	X	H5	X	X	X	X	X	X	X

Hotel day 20	X	X	X	H3	X	X	H3	X	X	X


In [44]:
# Skip the header: the first line (index 0) holds the column names,
# so only the lines after it are printed.

hotel_days = []  # nothing is stored yet; this cell only prints
with open("data/hotel_data.tsv") as tsv_file:
    # enumerate() numbers the lines from 0, replacing a hand-kept counter
    for i, line in enumerate(tsv_file):
        if i > 0:
            print(line)


Hotel day 1	H1	H4	X	H2	X	X	H2	X	H5 	H1

Hotel day 2	X	X	X	X	H1	X	X	X	X	X

Hotel day 3	X	X	X	X	X	X	X	H4	X	X

Hotel day 4	X	X	X	X	X	X	X	X	X	H3

Hotel day 5	X	X	H1	X	X	H1	X	X	X	X

Hotel day 6	X	X	X	X	X	H4	X	X	X	X

Hotel day 7	X	X	X	X	X	X	X	X	X	X

Hotel day 8	X	H3	X	X	X	X	X	H5	X	X

Hotel day 9	X	X	X	X	X	X	X	X	X	X

Hotel day 10	X	X	X	X	X	X	X	X	X	X

Hotel day 11	X	X	X	X	X	X	H2	X	X	X

Hotel day 12	H4	X	X	X	X	X	X	X	X	X

Hotel day 13	X	X	X	X	X	X	X	X	X	X

Hotel day 14	X	X	X	H2	X	X	X	X	X	X

Hotel day 15	X	X	X	X	X	X	X	X	X	X

Hotel day 16	X	X	X	X	X	X	X	X	X	X

Hotel day 17	X	X	X	X	X	X	X	X	H3	X

Hotel day 18	X	X	X	X	H3	X	X	X	X	X

Hotel day 19	X	X	H5	X	X	X	X	X	X	X

Hotel day 20	X	X	X	H3	X	X	H3	X	X	X


In [45]:
# Build hotel_days: one list per day, one entry per person.
hotel_days = []
with open("data/hotel_data.tsv") as tsv_file:
    for i, line in enumerate(tsv_file):
        if i == 0:
            continue  # header line: column names, not a day
        # rstrip() drops the end-of-line characters, split("\t") cuts the
        # line into cells, and [1:] discards the day label in column one.
        # NOTE: rstrip() only trims the end of the line, so a cell in the
        # middle of a line keeps any padding it has (e.g. 'H5 ').
        hotel_days.append(line.rstrip().split("\t")[1:])
hotel_days

[['H1', 'H4', 'X', 'H2', 'X', 'X', 'H2', 'X', 'H5 ', 'H1'],
 ['X', 'X', 'X', 'X', 'H1', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'H4', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'H3'],
 ['X', 'X', 'H1', 'X', 'X', 'H1', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'H4', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'H3', 'X', 'X', 'X', 'X', 'X', 'H5', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'H2', 'X', 'X', 'X'],
 ['H4', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'H2', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'H3', 'X'],
 ['X', 'X', 'X', 'X', 'H3', 'X', 'X', 'X', 'X', 'X'],
 ['X', 'X', 'H5', 'X', 'X'

In [46]:
# Trial Version

def find_groups(day_i_hotels):
    """
    Trial version: map each hotel used on one day to the people who stayed there.

    Person ids are the 1-based positions in the day log and "X" entries are
    skipped. Hotels with a single guest are still included at this stage.
    ex.
    given: ['H1', 'H4', 'X', 'H2', 'X', 'X', 'H2', 'X', 'H5 ', 'H1']
    return: {'H1': [1, 10], 'H4': [2], 'H2': [4, 7], 'H5 ': [9]}
    """
    # first pass: give every hotel seen that day an empty guest list
    # (not strictly necessary, but it keeps the second pass simple)
    guests_by_hotel = {
        hotel_id: [] for hotel_id in day_i_hotels if hotel_id != "X"
    }

    # second pass: record each person under the hotel they stayed at
    for person_id, hotel_id in enumerate(day_i_hotels, start=1):
        if hotel_id == "X":
            continue
        guests_by_hotel[hotel_id].append(person_id)

    return guests_by_hotel

In [47]:
# Try the first trial version on day 1: hotels with a single guest are still listed.
find_groups(['H1', 'H4', 'X', 'H2', 'X', 'X', 'H2', 'X', 'H5 ', 'H1'])

{'H1': [1, 10], 'H4': [2], 'H2': [4, 7], 'H5 ': [9]}

In [48]:
# Trial Version

def find_groups(day_i_hotels):
    """
    Trial version: map each hotel used on one day to the people who stayed
    there, and report which hotels have fewer than 2 guests.

    Person ids are the 1-based positions in the day log and "X" entries are
    skipped. The single-guest hotels are only printed here; the returned
    dictionary still contains them.
    ex.
    given: ['H1', 'H4', 'X', 'H2', 'X', 'X', 'H2', 'X', 'H5 ', 'H1']
    prints: need to remove days ['H4', 'H5 ']
    return: {'H1': [1, 10], 'H4': [2], 'H2': [4, 7], 'H5 ': [9]}
    """
    # first pass: give every hotel seen that day an empty guest list
    guests_by_hotel = {
        hotel_id: [] for hotel_id in day_i_hotels if hotel_id != "X"
    }

    # second pass: record each person under the hotel they stayed at
    for person_id, hotel_id in enumerate(day_i_hotels, start=1):
        if hotel_id == "X":
            continue
        guests_by_hotel[hotel_id].append(person_id)

    # hotels that do not form a group (fewer than 2 guests)
    single_guest_hotels = [
        hotel_id
        for hotel_id, guests in guests_by_hotel.items()
        if len(guests) < 2
    ]

    print(f"need to remove days {single_guest_hotels}")

    return guests_by_hotel

In [49]:
# Same day-1 input: the single-guest hotels are reported but not yet removed.
find_groups(['H1', 'H4', 'X', 'H2', 'X', 'X', 'H2', 'X', 'H5 ', 'H1'])

need to remove days ['H4', 'H5 ']


{'H1': [1, 10], 'H4': [2], 'H2': [4, 7], 'H5 ': [9]}

In [50]:
# Trial Version

def find_groups(day_i_hotels):
    """
    Given a day log, return the groups of people (2 or more) who stayed at
    the same hotel, keyed by hotel id.

    Person ids are the 1-based positions in the day log. An entry of "X"
    means the person did not stay at a hotel. Whitespace around an entry is
    ignored, so 'H5 ' and 'H5' are the same hotel and 'X ' counts as "X".
    ex.
    given: ['H1', 'H4', 'X', 'H2', 'X', 'X', 'H2', 'X', 'H5 ', 'H1']
    return: {'H1': [1, 10], 'H2': [4, 7]} where 1, 10, 4 and 7 are people ids
    """
    hotels_to_people = {}

    for person_id, hotel_id in enumerate(day_i_hotels, start=1):
        # cells read from the file can carry stray padding (e.g. 'H5 ');
        # normalise before comparing so padded ids are not seen as new hotels
        hotel_id = hotel_id.strip()
        if hotel_id != "X":
            # setdefault creates the empty list the first time a hotel is seen
            hotels_to_people.setdefault(hotel_id, []).append(person_id)

    # keep only real groups: hotels with at least 2 guests
    return {
        hotel_id: people
        for hotel_id, people in hotels_to_people.items()
        if len(people) >= 2
    }

In [51]:
# Day 1 again: only hotels shared by two or more people remain.
find_groups(['H1', 'H4', 'X', 'H2', 'X', 'X', 'H2', 'X', 'H5 ', 'H1'])

{'H1': [1, 10], 'H2': [4, 7]}

In [52]:
# Persons 1 and 2 share H2; every other hotel has a single guest.
find_groups(['H2', 'H2', 'X', 'X', 'X', 'X', 'X', 'X', 'H5 ', 'H1'])

{'H2': [1, 2]}

In [53]:
# Nobody shares a hotel here, so the result is empty.
find_groups(['H1', 'X', 'H2', 'X', 'X', 'X', 'H4', 'X', 'X ', 'X'])

{}

In [54]:
# Run find_groups on every day and keep the results in day order:
# groups_per_days[k] holds the groups found for hotel_days[k]

groups_per_days = [find_groups(day) for day in hotel_days]

In [55]:
# Show the groups found for each day (one dictionary per day).
groups_per_days

[{'H1': [1, 10], 'H2': [4, 7]},
 {},
 {},
 {},
 {'H1': [3, 6]},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {'H3': [4, 7]}]

In [66]:
# given a list (ex. [1,2,3,4,5])
# print every unordered pair of its items: each item is paired only with
# the items that come after it, so no pair is listed twice
values = [1, 2, 3, 4, 5]
for position, first in enumerate(values):
    for second in values[position + 1:]:
        print(first, second, end="\t")

1 2	1 3	1 4	1 5	2 3	2 4	2 5	3 4	3 5	4 5	

In [68]:
# preview the pairs of day indices (i, j), with i < j, that we will compare
pairwise_comps = []

day_indices = range(len(hotel_days))
for i in day_indices[:-1]:
    # pair day i only with the days that come after it
    for j in day_indices[i + 1:]:
        print(i, j, end="\t")

0 1	0 2	0 3	0 4	0 5	0 6	0 7	0 8	0 9	0 10	0 11	0 12	0 13	0 14	0 15	0 16	0 17	0 18	0 19	1 2	1 3	1 4	1 5	1 6	1 7	1 8	1 9	1 10	1 11	1 12	1 13	1 14	1 15	1 16	1 17	1 18	1 19	2 3	2 4	2 5	2 6	2 7	2 8	2 9	2 10	2 11	2 12	2 13	2 14	2 15	2 16	2 17	2 18	2 19	3 4	3 5	3 6	3 7	3 8	3 9	3 10	3 11	3 12	3 13	3 14	3 15	3 16	3 17	3 18	3 19	4 5	4 6	4 7	4 8	4 9	4 10	4 11	4 12	4 13	4 14	4 15	4 16	4 17	4 18	4 19	5 6	5 7	5 8	5 9	5 10	5 11	5 12	5 13	5 14	5 15	5 16	5 17	5 18	5 19	6 7	6 8	6 9	6 10	6 11	6 12	6 13	6 14	6 15	6 16	6 17	6 18	6 19	7 8	7 9	7 10	7 11	7 12	7 13	7 14	7 15	7 16	7 17	7 18	7 19	8 9	8 10	8 11	8 12	8 13	8 14	8 15	8 16	8 17	8 18	8 19	9 10	9 11	9 12	9 13	9 14	9 15	9 16	9 17	9 18	9 19	10 11	10 12	10 13	10 14	10 15	10 16	10 17	10 18	10 19	11 12	11 13	11 14	11 15	11 16	11 17	11 18	11 19	12 13	12 14	12 15	12 16	12 17	12 18	12 19	13 14	13 15	13 16	13 17	13 18	13 19	14 15	14 16	14 17	14 18	14 19	15 16	15 17	15 18	15 19	16 17	16 18	16 19	17 18	17 19	18 19	

In [69]:
# build the list of day-index pairs (i, j), with i < j, that we will compare
day_count = len(hotel_days)
pairwise_comps = [
    (i, j)
    for i in range(day_count - 1)
    for j in range(i + 1, day_count)
]

In [72]:
# Peek at the first ten (i, j) pairs.
pairwise_comps[:10]

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (0, 10)]

In [75]:
# the overlap of two groups is the set of members they have in common

group_1 = [1,2,5]
group_2 = [2,3,5]

set(group_1) & set(group_2)




{2, 5}

In [80]:
def compare_two_days(day_i, day_j):
    """
    Find the sets of people (2 or more) who were grouped together on both days.

    day_i and day_j map a hotel id to the list of people who stayed there.
    Every group of day_i is intersected with every group of day_j; an
    intersection is kept only when it contains at least two people.
    ex.
    given: {"H1": [1,8], "H4": [4,6,7]} and {"H3":[1,4,5,7], "H5": [2,9]}
    return: [{4, 7}]
    """
    overlaps = []
    for members_i in day_i.values():
        members_i = set(members_i)
        for members_j in day_j.values():
            common = members_i & set(members_j)
            if len(common) < 2:
                # a single shared person is not a group
                continue
            overlaps.append(common)
    return overlaps

In [84]:

# Persons 4 and 7 are together in H4 on the first day and in H3 on the second.
compare_two_days({"H1": [1,8], "H4": [4,6,7]}, {"H3":[1,4,5,7], "H5": [2,9]})

[{4, 7}]

In [87]:
# Compare every pair of days and report the groups they have in common.
# The printed pair holds 0-based indices into groups_per_days.
for comp in pairwise_comps:
    first_day, second_day = comp
    match = compare_two_days(
        groups_per_days[first_day],
        groups_per_days[second_day],
    )
    print(f"for days {comp}, the overlap was: {match}")

for days (0, 1), the overlap was: []
for days (0, 2), the overlap was: []
for days (0, 3), the overlap was: []
for days (0, 4), the overlap was: []
for days (0, 5), the overlap was: []
for days (0, 6), the overlap was: []
for days (0, 7), the overlap was: []
for days (0, 8), the overlap was: []
for days (0, 9), the overlap was: []
for days (0, 10), the overlap was: []
for days (0, 11), the overlap was: []
for days (0, 12), the overlap was: []
for days (0, 13), the overlap was: []
for days (0, 14), the overlap was: []
for days (0, 15), the overlap was: []
for days (0, 16), the overlap was: []
for days (0, 17), the overlap was: []
for days (0, 18), the overlap was: []
for days (0, 19), the overlap was: [{4, 7}]
for days (1, 2), the overlap was: []
for days (1, 3), the overlap was: []
for days (1, 4), the overlap was: []
for days (1, 5), the overlap was: []
for days (1, 6), the overlap was: []
for days (1, 7), the overlap was: []
for days (1, 8), the overlap was: []
for days (1, 9), the o