/
instagram_search_engine.py
125 lines (94 loc) · 3.31 KB
/
instagram_search_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from preprocess import *
import math
import CustomGUI as gui
from collections import Counter
import operator
import webbrowser
RESULTS_PER_PAGE = 10
# Load the bio dataset: one profile per line, CSV-ish layout where
# column 1 is the user id and everything from column 2 onward is the bio.
profiles = {}
with open('dataset/instagram_bio_dataset', 'r') as f:
    f.readline()  # skip the header row (readline on empty file is harmless)
    n_profiles = 0
    for line in f:
        spl = line.split(',')
        uid = spl[1]
        # Rejoin with ',' so commas that were part of the bio text survive;
        # plain concatenation of spl[2:] silently dropped every comma.
        bio = ','.join(spl[2:])
        profiles[uid] = preprocess(bio)
        n_profiles += 1
print('read profiles:'+str(n_profiles))
print(len(profiles))
# Build the inverted index: word -> {user id -> term frequency in that bio}.
inverted_index = {}
for uid in profiles:
    for word in profiles[uid]:
        # setdefault once per token; the original called it twice, allocating
        # a throwaway empty dict on every already-seen word.
        postings = inverted_index.setdefault(word, {})
        postings[uid] = postings.get(uid, 0) + 1
# document frequency = number of docs containing a specific word, dictionary with key = word, value = num of docs
df = {}
# inverse document frequency (log base 2 of N / df)
idf = {}
for word, postings in inverted_index.items():
    df[word] = len(postings)
    idf[word] = math.log(n_profiles / df[word], 2)
def tf_idf(w, doc):
    """TF-IDF weight of word `w` in document `doc`: raw term count times idf."""
    return idf[w] * inverted_index[w][doc]
def inner_product_similarities(query):
    """Inner-product similarity between the query and every document.

    Returns a dict mapping doc id -> accumulated score; only documents that
    share at least one indexed word with the query appear as keys.
    """
    scores = {}
    for word in query:
        weight = idf.get(word, 0)
        if weight == 0:
            # word never seen in the corpus: contributes nothing
            continue
        for doc in inverted_index[word]:
            scores[doc] = scores.get(doc, 0) + tf_idf(word, doc) * weight
    return scores
def doc_length(userid):
    """Euclidean norm of the tf-idf vector of the document `userid`.

    Each distinct word contributes its squared tf-idf weight once.  The
    original tracked seen words in a list, making membership tests O(n)
    per word (quadratic overall); a set gives the same result in O(n).
    """
    return math.sqrt(sum(tf_idf(w, userid) ** 2 for w in set(profiles[userid])))
def query_length(query):
    """Euclidean norm of the tf-idf vector of the query.

    Term frequency inside the query is counted with Counter (the original
    filled the Counter with a manual loop).  Words absent from the index
    get idf 0 and contribute nothing to the length.
    """
    cnt = Counter(query)
    return math.sqrt(sum((c * idf.get(w, 0)) ** 2 for w, c in cnt.items()))
def cosine_similarities(query):
    """Cosine similarity between the query and each candidate document.

    Normalizes the inner-product scores by both vector lengths.  The query
    length is loop-invariant, so it is computed once instead of once per
    document as in the original.
    """
    similarity = inner_product_similarities(query)
    q_len = query_length(query)  # hoisted: identical for every doc
    for doc in similarity:
        similarity[doc] = similarity[doc] / doc_length(doc) / q_len
    return similarity
def rank_docs(similarities):
    """Return (doc id, score) pairs sorted by score, best match first."""
    return sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
def new_query():
    """Prompt the user for a query, rank all profiles against it and show results.

    Exits the program when the query dialog is cancelled (returns None).
    """
    query = gui.ask_query()
    if query is None:
        exit()
    tokens = preprocess(query)
    ranked = rank_docs(cosine_similarities(tokens))
    handle_show_query(ranked, tokens, RESULTS_PER_PAGE)
def handle_show_query(ranked_similarities, query_tokens, n):
    """Display the top `n` ranked results and react to the user's choice.

    'Show more results' grows the page by RESULTS_PER_PAGE and re-displays
    (done with a loop rather than the original self-recursion — identical
    behavior, no stack growth); None starts a fresh query; any other choice
    is treated as a profile and opened in the browser.
    """
    while True:
        choice = gui.display_query_results(ranked_similarities[:n], query_tokens)
        if choice == 'Show more results':
            n += RESULTS_PER_PAGE
            continue
        if choice is None:
            new_query()
        else:
            open_website(choice)
        return
def open_website(url):
    """Open the Instagram profile page for the first whitespace-separated token of `url`."""
    username = url.split()[0]
    webbrowser.open('https://www.instagram.com/' + username + '/', new=2, autoraise=True)
# Entry point: keeps prompting for queries until the user cancels the dialog.
new_query()