In [2]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression in a cell, not only the final one.
InteractiveShell.ast_node_interactivity = "all"
from collections import Counter
import pickle as pkl 


In [3]:
# Read both rating files with one shared set of parser options: the four column
# names are supplied explicitly and blanks following each comma are skipped.
# The generator is consumed in order, so the DEDIS file is read before the IMDb one.
dedis, imdb = (
    pd.read_csv(
        csv_path,
        sep=',',
        names=['email', 'movie', 'date', 'stars'],
        skipinitialspace=True,
    )
    for csv_path in ('files/dedis-2.csv', 'files/imdb-2.csv')
)

# Finding the Movie2Hash and Hash2Movie Mappings

In [4]:
# Count how many rows of the IMDb frame mention each movie title.
movie_list = imdb['movie'].values.tolist()
freq_list = Counter(movie_list)

# One (movie, freq) row per distinct title, ordered by ascending count and
# re-indexed 0..n-1 so that the row label equals the frequency rank.
freq = (
    pd.DataFrame(list(freq_list.items()), columns=['movie', 'freq'])
    .sort_values('freq')
    .reset_index(drop=True)
)
freq

Unnamed: 0,movie,freq
0,Sunrise,3
1,Ikiru,6
2,The Night of the Hunter,9
3,Double Indemnity,12
4,Paths of Glory,15
5,Fanny and Alexander,18
6,Metropolis,21
7,The 400 Blows,24
8,Chinatown,27
9,The Dark Knight,30


In [5]:
# Count how many rows of the DEDIS frame mention each value of its movie column.
hashed_movie_list = dedis['movie'].values.tolist()
hashed_freq_list = Counter(hashed_movie_list)

# One (movie, freq) row per distinct value, ordered by ascending count and
# re-indexed 0..n-1 so that the row label equals the frequency rank.
hashed_freq = (
    pd.DataFrame(list(hashed_freq_list.items()), columns=['movie', 'freq'])
    .sort_values(by='freq')
    .reset_index(drop=True)
)
hashed_freq

Unnamed: 0,movie,freq
0,cdaa02e75f4648634af3da6f554f8a5860a445dadea58a...,11
1,0255f23a80b2c82806283862bcbaf76b1b9f3efa934cda...,14
2,b9d6dfd5475e843cb678cadf8d83340a574700de1e05e1...,17
3,cebdecbe21a3420f309876160546cfd99c44fcec717bf2...,20
4,de5283b411fe3062ad14c713fb20bd732ad914c3f53d90...,23
5,f1f78b13b93fa47787254034522437ad8cc07df61f5c85...,26
6,3ae4014d1bb1c56a4f80b08359702991d5b6ae55f3bea2...,28
7,eae00955db278d83499b0e8dd758326d060e9948ecc9e0...,31
8,3c0b335dd20ef908175e66d6a793a0efebef11b9d9b9eb...,35
9,0fbae516b2f412320b0d8af07669944d44033aef3b95fd...,37


In [7]:
# Put the two frequency-ranked movie columns side by side. concat(axis=1) aligns
# on the row label, so the title at rank i is paired with the hash at rank i.
hash_discovered = pd.concat(
    [freq['movie'].rename('name'), hashed_freq['movie'].rename('hash')],
    axis=1,
)
hash_discovered

# Lookup tables in both directions: title -> hash, and hash -> title.
name2hash = dict(zip(hash_discovered['name'], hash_discovered['hash']))
hash2name = dict(zip(name2hash.values(), name2hash.keys()))

# Persist the hash -> title table to disk.
with open('hash2name.dict', 'wb') as dict_file:
    pkl.dump(hash2name, dict_file)

Unnamed: 0,name,hash
0,Sunrise,cdaa02e75f4648634af3da6f554f8a5860a445dadea58a...
1,Ikiru,0255f23a80b2c82806283862bcbaf76b1b9f3efa934cda...
2,The Night of the Hunter,b9d6dfd5475e843cb678cadf8d83340a574700de1e05e1...
3,Double Indemnity,cebdecbe21a3420f309876160546cfd99c44fcec717bf2...
4,Paths of Glory,de5283b411fe3062ad14c713fb20bd732ad914c3f53d90...
5,Fanny and Alexander,f1f78b13b93fa47787254034522437ad8cc07df61f5c85...
6,Metropolis,3ae4014d1bb1c56a4f80b08359702991d5b6ae55f3bea2...
7,The 400 Blows,eae00955db278d83499b0e8dd758326d060e9948ecc9e0...
8,Chinatown,3c0b335dd20ef908175e66d6a793a0efebef11b9d9b9eb...
9,The Dark Knight,0fbae516b2f412320b0d8af07669944d44033aef3b95fd...


# My Movies in IMDB

In [56]:
# Keep only the IMDb rows whose email column equals the address below.
myIMDBmovies = imdb[imdb['email'] == 'mohammad.yaghini@epfl.ch']
myIMDBmoviesList = myIMDBmovies['movie'].values.tolist()

# Attach the matched hash of every title as a new column; a title missing
# from name2hash raises KeyError.
myIMDBmovies = myIMDBmovies.assign(
    hash=[name2hash[title] for title in myIMDBmoviesList]
)
myIMDBmovies

# The same hashes as a plain list, used to search the DEDIS frame.
myHashes = myIMDBmovies['hash'].values.tolist()
myHashes

Unnamed: 0,email,movie,date,stars,hash
2336,mohammad.yaghini@epfl.ch,The Godfather,07/11/03,5,b66f8ceafa5ec4812a4074499500dfa89d9a725a6d4fc5...
2337,mohammad.yaghini@epfl.ch,Seven Samurai,07/11/03,3,698b924c57315dacdb2f8c8258908013d669fb67f85819...
2338,mohammad.yaghini@epfl.ch,The Godfather: Part II,07/11/03,3,972392e382e6ff68f4ab99389989952867ecb915e40a2a...
2339,mohammad.yaghini@epfl.ch,Pulp Fiction,07/11/03,5,64cc984865c51051bc410fbe65258131eab6c45a373e45...
2340,mohammad.yaghini@epfl.ch,12 Angry Men,07/11/03,4,645839b2bb180bc3d9120b162a06896521ea59038ca4a3...
2341,mohammad.yaghini@epfl.ch,Lawrence of Arabia,07/11/03,3,687ae0e3ec80d28daada0c1cfbfe1dd2817727c2a6677f...
2342,mohammad.yaghini@epfl.ch,Sunset Blvd.,07/11/03,2,898c2452e1a02c472e61a6d26483e99883837a169fc237...
2343,mohammad.yaghini@epfl.ch,M,07/11/03,4,19656d4177b8da5474f2cc9b5e4ef49fac5f878383bc07...
2344,mohammad.yaghini@epfl.ch,Spirited Away,07/11/03,2,7b6bc812b3a97c70e5a1042dd20db364aecd4e71c351ff...
2345,mohammad.yaghini@epfl.ch,City Lights,07/11/03,3,3f5331403d29c761c47c393d7c3b5be976512f572baaac...


['b66f8ceafa5ec4812a4074499500dfa89d9a725a6d4fc5faf918768de2c821cb',
 '698b924c57315dacdb2f8c8258908013d669fb67f85819abcf458a55aab4d14c',
 '972392e382e6ff68f4ab99389989952867ecb915e40a2a77dd6a420e34b37a59',
 '64cc984865c51051bc410fbe65258131eab6c45a373e45e5bccc9682a95bff01',
 '645839b2bb180bc3d9120b162a06896521ea59038ca4a3a7b3694b8e429c4729',
 '687ae0e3ec80d28daada0c1cfbfe1dd2817727c2a6677fea37df6afb4d9583d6',
 '898c2452e1a02c472e61a6d26483e99883837a169fc237da08c31077972659b4',
 '19656d4177b8da5474f2cc9b5e4ef49fac5f878383bc07d318185f0754a0825a',
 '7b6bc812b3a97c70e5a1042dd20db364aecd4e71c351ff2267d7efd9abb2e361',
 '3f5331403d29c761c47c393d7c3b5be976512f572baaac351c276aaa6052c31a',
 'aa10b50b28259802a50216292aedc45289b602cfff1b7668a0eddf42a41a3fac',
 '57ad10c9b450612f881fe893e7dbc3e4ba93d4813c4b969c4aea7a16c6fe7ab4',
 '902ee423aed2c334d98bf2bcee45aadcb462f5628d982eddc6f61189695e5125',
 '585ca03150a1adc505875fc36b9d64bb2acf351a323e714dc9a8cdbeee2b115c',
 '6f07b7e73b77a698f8c2b6c38c11115c

# Finding my email hash in DEDIS

In [81]:
# Gather the email value of every DEDIS row whose movie equals one of my hashes.
found_email_hashes = []
for myhash in myHashes:
    found = dedis.loc[dedis['movie'] == myhash, 'email'].values.tolist()
    found_email_hashes.extend(found)

# Tally occurrences per email value and pick the one that appears most often.
found_email_hashes_counted = Counter(found_email_hashes)
email_hashes = pd.DataFrame(
    list(found_email_hashes_counted.items()),
    columns=['email', 'freq'],
)
my_email_hash = (
    email_hashes
    .sort_values(by='freq', ascending=False)
    .reset_index(drop=True)
    .loc[0, 'email']
)

# Finding my movies in DEDIS

In [91]:
# Every movie hash recorded in DEDIS under the email value found above,
# translated back to a title; a hash missing from hash2name raises KeyError.
myMoviesHashList = dedis.loc[dedis['email'] == my_email_hash, 'movie'].values.tolist()
myMovieList = [hash2name[movie_hash] for movie_hash in myMoviesHashList]
myMovieList

# Export the titles, one per line.
with open('ex1_2_movies.txt', 'w') as out_file:
    for title in myMovieList:
        out_file.write(str(title) + '\n')

['Seven Samurai',
 'The Godfather',
 'The Godfather: Part II',
 'Pulp Fiction',
 '12 Angry Men',
 'Lawrence of Arabia',
 'Sunset Blvd.',
 'M',
 'Spirited Away',
 'City Lights',
 'The Good, the Bad and the Ugly',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb',
 'Apocalypse Now',
 'Rear Window',
 'Bicycle Thieves',
 'Psycho',
 'The Third Man',
 'Once Upon a Time in the West',
 'Star Wars: Episode V - The Empire Strikes Back',
 'Vertigo']