# NOTE: For ALL bashscripts in this project, I copied and pasted bash scripts into code cells because JD told me to do it as pasting it into markdown cells deleted some part of the script.

# 1. Finding Trends

### 1.1 a-c

In [25]:
import re
import time
from itertools import islice

def mapper_tweet(tweet):
    hashtags = re.findall(r'\s#\S+',tweet)
    hashtags.extend(re.findall(r'^#\S+', tweet))
    for i in range(len(hashtags)):
        hashtag = hashtags[i]
        hashtag = re.sub(r'\W+', '',hashtag)
        hashtag = hashtag.lower()
        hashtags[i] = hashtag
    output = []
    for i in range(len(hashtags)):
        output.append((hashtags[i], 1))
    return output

def reducer_tweet(key, value):
    return key, sum(value)

def mapreduce_execute_tweet(tweets, mapper, reducer):
    hashtags = map(mapper, tweets)
    hashtags = list(hashtags)
    the_hashtags = {}
    for hashtag in hashtags:
        for k, v in hashtag:
            if k not in the_hashtags:
                the_hashtags[k] = [v]
            else:
                the_hashtags[k].append(v)
    output = [reducer(k, v) for k,v in the_hashtags.items()]
    output.sort(key = lambda x: x[1])  
    output = output[-10:]
    output.reverse()
    return output

start_time = time.time()
with open("test_set_tweets.txt", encoding = "utf8") as file:
    tweets = list(islice(file, 500000))
print(mapreduce_execute_tweet(tweets, mapper_tweet, reducer_tweet))
print("Time: " + str(time.time() - start_time))

[('ff', 3522), ('nowplaying', 1799), ('fb', 1362), ('mm', 1017), ('fail', 628), ('random', 601), ('haiti', 586), ('shoutout', 516), ('musicmonday', 451), ('followfriday', 449)]
Time: 7.478104114532471


### 1.1 d)

In [3]:
#!/bin/bash
head -500000 test_set_tweets.txt | sed -e 's/\(.*\)/\L\1/'| grep -o '^#\S\+\|\s#\S\+' | sed 's/\W//g' | sort | uniq -c | sort | tail -n 10

<p>449 followfriday</p>
<p>451 musicmonday</p>
<p>516 shoutout</p>
<p>586 haiti</p>
<p>601 random</p>
<p>628 fail</p>
<p>1017 mm</p>
<p>1362 fb</p>
<p>1799 nowplaying</p>
<p>3522 ff</p>

### 1.1 e)

In [4]:
#!/bin/bash
time head -500000 test_set_tweets.txt | sed -e 's/\(.*\)/\L\1/'| grep -o '^#\S\+|\s#\S\+' | sed 's/\W//g' | sort | uniq -c | sort | tail -n 10

<p>real    0m11.963s</p>
<p>user    0m11.700s</p>
<p>sys     0m0.776s</p>

### 1.1 f)

According to the time command on both, it looks like my shell script is slightly faster than my python. However, I was surprised that it was as close as it was since unix is usually a lot faster since it is low-level. I think it may be the piping that is causing the unix command to run a bit slower.

# 1.2

### 1.2.1. a-c

In [26]:
import re
import time

def mapper_user(tweet):
    users = re.findall(r'\s@\S+',tweet)
    users.extend(re.findall(r'^@\S+', tweet))
    output = []
    for i in range(len(users)):
        output.append((users[i][1:], 1))
    return output

start_time = time.time()
file = open("tweets.txt", encoding="utf8")
tweets = file.readlines()
print(mapreduce_execute_tweet(tweets, mapper_user, reducer_tweet))
print("Time: " + str(time.time() - start_time))

[('@RevRunWisdom:', 1229), ('@listensto', 939), ('@DonnieWahlberg', 523), ('@OGmuscles', 441), ('@addthis', 429), ('@breatheitin', 407), ('@justinbieber', 354), ('@MAV25', 347), ('@karlievoice', 304), ('@mtgcolorpie', 291)]
Time: 11.382555723190308


### 1.2.1 d)

In [None]:
#!/bin/bash
cat tweets.txt | grep -o '\s@\S\+\|^@\S+' | sed 's/^.//' | sort | uniq -c | sort | tail -n 10

<p>291 @mtgcolorpie</p>
<p>304 @karlievoice</p>
<p>347 @MAV25</p>
<p>354 @justinbieber</p>
<p>407 @breatheitin</p>
<p>429 @addthis</p>
<p>441 @OGmuscles</p>
<p>523 @DonnieWahlberg</p>
<p>939 @listensto</p>
<p>1229 @RevRunWisdom:</p>

### 1.2.1 e)

In [None]:
#!/bin/bash
time cat tweets.txt | grep -o '\s@\S\+\|^@\S+' | sed 's/^.//' | sort | uniq -c | sort | tail -n 10

<p>real    0m7.541s</p>
<p>user    0m8.013s</p>
<p>sys     0m0.743s</p>

### 1.2.1 f)

Similar to the last question, the shell script takes a shorter time than the python function to run. However, this time the difference between unix and python was easy to see and the unix command ran significantly faster. This makes sense due to the low-level nature of unix which allows commands to run very fast.

### 1.2.2 a-c

In [27]:
def mapper_two_hashtags(tweet):
    hashtags = re.findall(r'\s#\S+',tweet)
    hashtags.extend(re.findall(r'^#\S+', tweet))
    return (tweet, len(hashtags))

def reducer_two_hashtags(tweet, length_hashtag):
    return (tweet, length_hashtag >= 2)

def mapreduce_execute_two_hashtags(tweets, mapper, reducer):
    two_hashtags = map(mapper, tweets)
    two_hashtags = list(two_hashtags)
    result = 0
    for i in range(len(two_hashtags)):
        two_hashtags[i] = reducer_two_hashtags(two_hashtags[i][0], two_hashtags[i][1])
        if two_hashtags[i][1]:
            result += 1
    return result

start_time = time.time()
file = open("tweets.txt", encoding="utf8")
tweets = file.readlines()
print(mapreduce_execute_two_hashtags(tweets, mapper_two_hashtags, reducer_two_hashtags))
print("Time: " + str(time.time() - start_time))

15004
Time: 9.184471130371094


### 1.2.2 d)

In [None]:
#!/bin/bash
cat tweets.txt | grep -o -n '^#\S\+\|\s#\S\+'| cut -d : -f1 | uniq -c | awk '$1 > 1 {count++} END {print count}' 

15004

### 1.2.2 e)

In [None]:
#!/bin/bash
time cat tweets.txt | grep -o -n '^#\S\+\|\s#\S\+'| cut -d : -f1 | uniq -c | awk '$1 > 1 {count++} END {print count}' 

<p>real    0m0.969s</p>
<p>user    0m1.044s</p>
<p>sys     0m0.168s</p>

### 1.2.2 f)

Similar to 1.2.1, the shell script is significantly faster than the python command. Again, this is probably due to unix being low level so scripts are able to run a lot faster.

# 2. Finding Reciprocal Followers

## In the files created in this question, I copied the contents of the file and pasted into markdown cells below the code that created it.

In [None]:
#!/bin/bash
cat <(head -500000 test_set_tweets.txt) <(head -250000 training_set_tweets.txt) > tweets.txt

I used the above unix command to create tweets.txt

### 2. a-b

In [28]:
import pandas as pd

start_time = time.time()
myDF = pd.read_csv("edges.csv", header = None, nrows=500000)
true_vals = list(set(list(myDF[0])).intersection(set(list(myDF[1]))))
miniDF = myDF[myDF[0].isin(true_vals)]
miniDF = miniDF[miniDF[1].isin(true_vals)]

def mapper_followers(first_id, second_id, df):
    mutual = (((df[0] == first_id) & (df[1] == second_id)).any() & ((df[1] == first_id) & (df[0] == second_id)).any())
    return first_id, second_id, mutual

def reducer_followers(first_id, second_id, mutual):
    if mutual:
        return first_id, second_id
    else:
        return None

def mapreduce_execute_followers(mapper_followers, reducer_followers, df, vals):
    results = []
    for i in range(len(vals)):
        for j in range(i, len(vals)):
            var = mapper_followers(vals[i], vals[j], df)
            if var[2]:
                results.append(reducer_followers(vals[i], vals[j], var[2]))
    return results
                
results = mapreduce_execute_followers(mapper_followers, reducer_followers, miniDF, true_vals)
print(results)
print("Time: " + str(time.time() - start_time))

[(135684, 135546), (40997, 41039), (40997, 62623), (40997, 40704), (40997, 201063), (70696, 70772), (70696, 60887), (15926, 15574), (20033, 19628), (93260, 93427), (93260, 65435), (41039, 40704), (33884, 34046), (33884, 34101), (3682, 5276), (31866, 32002), (78464, 78182), (89222, 89350), (19628, 19821), (122546, 102898), (22196, 76473), (32452, 32173), (62167, 33099), (80092, 80096), (100591, 100721), (134409, 134410), (63255, 65435), (63255, 13232), (18205, 13232), (192865, 192899), (201078, 201607), (65411, 65435), (58783, 58875), (41422, 23503)]
Time: 17.772157669067383


### 2. c)

In [30]:
vals = []
for i in range(len(true_vals)):
        for j in range(len(true_vals)):
            var = mapper_followers(true_vals[i], true_vals[j], miniDF)
            if var[2]:
                vals.append(reducer_followers(true_vals[i], true_vals[j], var[2]))

smaller_DF = pd.DataFrame(results, columns = [0, 1])

writer = open("Q2_mutual_followers_python.txt", "w")

for i in range(len(vals)):
    writer.write(str(vals[i][0]) + ", " + str(vals[i][1]) + "\n")
writer.close()

total_edges = len(myDF)
total_nodes = len(set(myDF[0]).union(set(myDF[1])))


subset_nodes = len(set(smaller_DF[0]).union(set(smaller_DF[1])))
subset_edges = len(vals)

print("Number of Edges on original edges.csv: " + str(total_edges))
print("Number of Nodes on original edges.csv: " + str(total_nodes))
print("Number of Edges on subset of edges.csv: " + str(subset_edges))
print("Number of Nodes on subset of edges.csv: " + str(subset_nodes))

Number of Edges on original edges.csv: 500000
Number of Nodes on original edges.csv: 249402
Number of Edges on subset of edges.csv: 68
Number of Nodes on subset of edges.csv: 55


## The following is the contents of the file I created in 2c

135684, 135546<br />
40997, 41039<br />
40997, 62623<br />
40997, 40704<br />
40997, 201063<br />
70696, 70772<br />
70696, 60887<br />
15926, 15574<br />
20033, 19628<br />
93260, 93427<br />
93260, 65435<br />
41039, 40997<br />
41039, 40704<br />
33884, 34046<br />
33884, 34101<br />
3682, 5276<br />
70772, 70696<br />
31866, 32002<br />
78464, 78182<br />
89222, 89350<br />
5276, 3682<br />
62623, 40997<br />
19628, 20033<br />
19628, 19821<br />
122546, 102898<br />
22196, 76473<br />
76473, 22196<br />
32452, 32173<br />
15574, 15926<br />
62167, 33099<br />
80092, 80096<br />
80096, 80092<br />
100591, 100721<br />
93427, 93260<br />
34046, 33884<br />
32002, 31866<br />
40704, 40997<br />
40704, 41039<br />
89350, 89222<br />
134409, 134410<br />
134410, 134409<br />
63255, 65435<br />
63255, 13232<br />
18205, 13232<br />
34101, 33884<br />
33099, 62167<br />
192865, 192899<br />
78182, 78464<br />
201063, 40997<br />
19821, 19628<br />
100721, 100591<br />
201078, 201607<br />
135546, 135684<br />
192899, 192865<br />
65411, 65435<br />
201607, 201078<br />
65435, 93260<br />
65435, 63255<br />
65435, 65411<br />
58783, 58875<br />
32173, 32452<br />
13232, 63255<br />
13232, 18205<br />
41422, 23503<br />
23503, 41422<br />
60887, 70696<br />
102898, 122546<br />
58875, 58783<br />


### 2. d)

In [None]:
#!/bin/bash
head edges.csv -n 500000 | awk -F, '{if ($1 > $2){var = $1; $1 = $2; $2 = var;} print $0}' | sed 's/\W/,/g' | sort | uniq -d | awk -F, '{print $1","$2"\n"$2","$1}' > Q2_friends_unix.txt

The contents of the file created look something like the following...

100591,100721<br />
100721,100591<br />
102898,122546<br />
122546,102898<br />
13232,18205<br />
18205,13232<br />
13232,63255<br />
63255,13232<br />
134409,134410<br />
134410,134409<br />
135546,135684<br />
135684,135546<br />
15574,15926<br />
15926,15574<br />
192865,192899<br />
192899,192865<br />
19628,19821<br />
19821,19628<br />
19628,20033<br />
20033,19628<br />
201078,201607<br />
201607,201078<br />
22196,76473<br />
76473,22196<br />
23503,41422<br />
41422,23503<br />
31866,32002<br />
32002,31866<br />
32173,32452<br />
32452,32173<br />
33099,62167<br />
62167,33099<br />
33884,34046<br />
34046,33884<br />
33884,34101<br />
34101,33884<br />
3682,5276<br />
5276,3682<br />
40704,40997<br />
40997,40704<br />
40704,41039<br />
41039,40704<br />
40997,201063<br />
201063,40997<br />
40997,41039<br />
41039,40997<br />
40997,62623<br />
62623,40997<br />
58783,58875<br />
58875,58783<br />
60887,70696<br />
70696,60887<br />
63255,65435<br />
65435,63255<br />
65411,65435<br />
65435,65411<br />
65435,93260<br />
93260,65435<br />
70696,70772<br />
70772,70696<br />
78182,78464<br />
78464,78182<br />
80092,80096<br />
80096,80092<br />
89222,89350<br />
89350,89222<br />
93260,93427<br />
93427,93260<br />


### 2. e)

In [None]:
#!/bin/bash
time head edges.csv -n 500000 | awk -F, '{if ($1 > $2){var = $1; $1 = $2; $2 = var;} print $0}' | sed 's/\W/,/g' | sort | uniq -d | awk -F, '{print $1","$2"\n"$2","$1}' > Q2_friends_unix.txt

<p>real    0m4.768s</p>
<p>user    0m5.716s</p>
<p>sys     0m0.261s</p>

### 2. f)

The Unix is faster than Python. This makes sense as my python mapreduce() function uses two for loops to parse all the data which was inefficient, making my python function significantly slower than my shell script.

# 3. Finding Friend of Friends

In [23]:
def mapper_friends(first_friend, second_friend, df):
    cond1 = df[0].isin([first_friend,second_friend])
    cond2 = df[1].isin([first_friend,second_friend])
    mutual_friends = df[cond1 | cond2]
    mutual_friends = list(set(list(mutual_friends[0])).intersection(set(list(mutual_friends[1]))))
    actual_friends = []
    for i in range(len(mutual_friends)):
        if mutual_friends[i] != first_friend and mutual_friends[i] != second_friend:
            if mapper_followers(mutual_friends[i], first_friend, df)[2] and mapper_followers(mutual_friends[i], second_friend, df)[2]:
                actual_friends.append(mutual_friends[i])
    return first_friend, second_friend, actual_friends

def reducer_friends(first_friend, second_friend, actual_friends):
    return first_friend, second_friend, len(actual_friends)

def mapreduce_execute_friends(mapper, reducer, values, df):
    mutual_friends = []
    for i in range(len(values)):
        var = mapper_friends(values[i][0], values[i][1], df)
        mutual_friends.append(reducer_friends(var[0], var[1], var[2]))
    return mutual_friends

filer = open("Q2_mutual_followers_python.txt", "r")
my_values = []
for line in filer.readlines():
    ind = line.index(",")
    my_values.append((int(line[0:ind]), int(line[ind + 1:].strip("\n"))))
print(my_values)

mutual_friends = mapreduce_execute_friends(mapper_friends, reducer_friends, my_values, miniDF)

[(135684, 135546), (40997, 41039), (40997, 62623), (40997, 40704), (40997, 201063), (70696, 70772), (70696, 60887), (15926, 15574), (20033, 19628), (93260, 93427), (93260, 65435), (41039, 40997), (41039, 40704), (33884, 34046), (33884, 34101), (3682, 5276), (70772, 70696), (31866, 32002), (78464, 78182), (89222, 89350), (5276, 3682), (62623, 40997), (19628, 20033), (19628, 19821), (122546, 102898), (22196, 76473), (76473, 22196), (32452, 32173), (15574, 15926), (62167, 33099), (80092, 80096), (80096, 80092), (100591, 100721), (93427, 93260), (34046, 33884), (32002, 31866), (40704, 40997), (40704, 41039), (89350, 89222), (134409, 134410), (134410, 134409), (63255, 65435), (63255, 13232), (18205, 13232), (34101, 33884), (33099, 62167), (192865, 192899), (78182, 78464), (201063, 40997), (19821, 19628), (100721, 100591), (201078, 201607), (135546, 135684), (192899, 192865), (65411, 65435), (201607, 201078), (65435, 93260), (65435, 63255), (65435, 65411), (58783, 58875), (32173, 32452), (13

In [24]:
mutual_friends.sort(key = lambda x: x[2], reverse=True)
mutual_friends = mutual_friends[:10]
for i in range(len(mutual_friends)):
    print("Friend #1: " + str(mutual_friends[i][0]) + ", Friend #2: " + str(mutual_friends[i][1]) + ", # of Mutual Friends: " + str(mutual_friends[i][2]))


Friend #1: 40997, Friend #2: 41039, # of Mutual Friends: 1
Friend #1: 40997, Friend #2: 40704, # of Mutual Friends: 1
Friend #1: 41039, Friend #2: 40997, # of Mutual Friends: 1
Friend #1: 41039, Friend #2: 40704, # of Mutual Friends: 1
Friend #1: 40704, Friend #2: 40997, # of Mutual Friends: 1
Friend #1: 40704, Friend #2: 41039, # of Mutual Friends: 1
Friend #1: 135684, Friend #2: 135546, # of Mutual Friends: 0
Friend #1: 40997, Friend #2: 62623, # of Mutual Friends: 0
Friend #1: 40997, Friend #2: 201063, # of Mutual Friends: 0
Friend #1: 70696, Friend #2: 70772, # of Mutual Friends: 0
