In [1483]:
!jt -t monokai -f fira -fs 13 -nf ptsans -nfs 11 -N -kl -cursw 5 -cursc r -cellw 95% -T
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)

# $$\triangleleft \triangleright$$
# <center>Backend
<center>The design mixes caching and Linux philosophy. Store the results of each step in a text file.

<center>We will start with a single topic search and work up to a model of meaning for any new page.

In [1032]:
database = './database'
def table(name, database=database):
    return f'{database}/{name}'

## $$\bigtriangledown$$
## <center>Topic Crawler 
## $$\bigtriangleup$$

In [None]:
from requests import get
from time import sleep
from json import loads
from urllib.parse import unquote

In [3]:
topic_seed = ['user experience design']

In [1028]:
def crawl_topics(topic_seed, depth=2):
    topic_url_format = 'https://api.duckduckgo.com/?q={search_term}&format=json&pretty=1'
    topic_df = pd.DataFrame()
    related_topic_df = pd.DataFrame()
    for d in range(depth):
        if not d:
            topics = topic_seed
        related_topics = []
        topic_rows = []
        related_topic_rows = []
        for topic in topics:
            try:
                print(topic)
                # whoa there
                sleep(1.0)
                response = get(topic_url_format.format(search_term=topic))
                content = loads(response.content)
                data = {}
                data['topic'] = content['Heading'].lower()
                data['abstract'] = content['Abstract']
                data['document_type'] = content['Type']
                data['topic_wiki_url'] = content['AbstractURL']
                topic_rows.append(data)   

                for result in content['RelatedTopics']:
                    row = {}
                    row['depth'] = d
                    row['related_topic'] = unquote(result['FirstURL'].replace(
                        'https://duckduckgo.com/', '').replace('c/', '').replace('_', ' ')).lower()
                    row['related_topic_text'] = result['Text']
                    row.update(data)
                    related_topic_rows.append(row)
                    related_topics.append(row['related_topic'])
            # some topics are so ambiguous they have no URL
            except KeyError as e:
                print('Error on topic: ' + topic + ' : ' + str(e))
        topics = related_topics
        topic_df = topic_df.append(pd.DataFrame(topic_rows))
        related_topic_df = related_topic_df.append(pd.DataFrame(related_topic_rows))
    return topic_df, related_topic_df

In [1037]:
_, related_topic = crawl_topics(topic_seed)

user experience design
action research
activity-centered design
customer experience
design thinking
paper prototyping
participatory design
web design
software development process
user interfaces
human–computer interaction
design
Error on topic: design : 'FirstURL'


In [1038]:
related_topic.head()

Unnamed: 0,abstract,depth,document_type,related_topic,related_topic_text,topic,topic_wiki_url
0,User experience design is the process of manip...,0,A,action research,Action research - Action research is a philoso...,user experience design,https://en.wikipedia.org/wiki/User_experience_...
1,User experience design is the process of manip...,0,A,activity-centered design,Activity-centered design - Activity-centered d...,user experience design,https://en.wikipedia.org/wiki/User_experience_...
2,User experience design is the process of manip...,0,A,customer experience,"Customer experience - In commerce, customer ex...",user experience design,https://en.wikipedia.org/wiki/User_experience_...
3,User experience design is the process of manip...,0,A,design thinking,Design thinking - Design thinking refers to th...,user experience design,https://en.wikipedia.org/wiki/User_experience_...
4,User experience design is the process of manip...,0,A,paper prototyping,Paper prototyping - In human–computer interact...,user experience design,https://en.wikipedia.org/wiki/User_experience_...


In [1039]:
len(related_topic)

81

In [1040]:
related_topic.to_csv(table('related_topic.csv'))

## $$\bigtriangledown$$
## <center>Search Engine Crawler
## $$\bigtriangleup$$

In [1043]:
from getpass import getpass
bing_api_key = getpass()
# what is your azure cognitive services API key?

········


In [1047]:
search_seed = set(related_topic_df['related_topic'].unique())
search_seed.update(set(related_topic_df['topic'].unique()))
search_seed = list(search_seed)[:10]
len(search_seed)

10

In [1054]:
# this also works with bing dorks like "feed:"
def search(search_term):
    search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": bing_api_key}
    params = {
        "q": search_term, 
        "textDecorations": True, 
        "textFormat": "Raw", 
        "responseFilter": "webpages", # for some reason "webpages,relatedsearches" does not work
        "count": 50}
    sleep(1.0)
    response = get(search_url, headers=headers, params=params)
    response.raise_for_status()
    search_result = response.json()
    return search_result

In [1055]:
def filter_search_results(raw_search_results):
    keys = ['name', 'url', 'snippet', 'dateLastCrawled']
    for result in raw_search_results:
        topic = result['queryContext']['originalQuery']
        for page in result['webPages']['value']:
            row = {key: page[key] for key in keys}
            row['topic'] = topic
            yield row

In [1056]:
raw_search_result = [search(search_term=search_term) for search_term in search_seed]

In [1057]:
search_result = pd.DataFrame(filter_search_results(raw_search_results))

In [1058]:
search_result.head()

Unnamed: 0,dateLastCrawled,name,snippet,topic,url
0,2020-04-17T19:58:00.0000000Z,Lateral thinking - Wikipedia,Lateral thinking is a manner of solving prob...,lateral thinking,https://en.wikipedia.org/wiki/Lateral_thinking
1,2020-04-16T03:17:00.0000000Z,Lateral Thinking - How can Lateral Thinking...,Lateral Thinking Lateral Thinking is the S...,lateral thinking,https://www.edwddebono.com/lateral-thinking
2,2020-04-16T18:52:00.0000000Z,Lateral Thinking Puzzles - Preconceptions,Lateral thinking puzzles that challenge your...,lateral thinking,http://www.folj.com/lateral/
3,2020-04-17T00:42:00.0000000Z,How to Practice Lateral Thinking - ThoughtCo,Lateral thinking involves looking at a situa...,lateral thinking,https://www.thoughtco.com/lateral-thinking-185...
4,2020-04-17T20:46:00.0000000Z,What is Lateral Thinking? | Examples of Lat...,Lateral thinking means taking a creative app...,lateral thinking,https://successatschool.org/advicedetails/609/...


In [1059]:
len(search_result)

483

In [1060]:
search_result.to_csv(table('search_result.csv'))

## $$\bigtriangledown$$
## <center>Search Result Crawler
## $$\bigtriangleup$$

In [78]:
!pip install beautifulsoup4



In [1011]:
from bs4 import BeautifulSoup as b
from random import sample
from json import dump, loads

In [1062]:
def random_depth_crawl(seed_urls, depth=1, max_pages=10, max_links=5):
    visited = set()
    pages = 0
    for d in range(depth):
        next_seed_urls = set()
        for url in set(seed_urls):
            url.replace('http:', 'https:')
            if url not in visited:
                # only crawl each page one time
                visited.add(url)
                sleep(0.5)
                try:
                    response = get(url, timeout=5)
                    if response.status_code == 200:
                        content = response.content
                        # decode to make sure we can make sense of the content
                        yield {'url': url, 'content': content.decode()}
                        pages += 1
                        if pages > max_pages:
                            return
                        soup = b(content)
                        # choose random links to follow
                        links = [
                            a['href'] for a in soup.find_all('a', href=True)
                            if a['href'].startswith('https')]
                        n_links = min(len(links), max_links)
                        random_links = sample(links, n_links)
                        for link in random_links:
                            next_seed_urls.add(link)
                except Exception as e:
                    print('Error: ' + str(e) + ' on URL: ' + str(url))
        seed_urls = next_seed_urls

In [1067]:
seed_urls = search_result['url'].unique()

In [1068]:
max_pages = 10
raw_document_file = table('raw_document.txt')
for idx, document in enumerate(depth_crawl(seed_urls, max_pages=max_pages)):
    with open(raw_document_file, 'a') as f:
        dump(document, f)
        f.write('\n')

In [1075]:
def read_raw_document_file(raw_document_file):
    with open(raw_document_file, 'r') as f:
        for line in f.readlines():
            yield loads(line)

In [1076]:
raw_document = pd.DataFrame(read_raw_document_file(raw_document_file))

In [1077]:
raw_document.head()

Unnamed: 0,content,url
0,"<!DOCTYPE html> <html lang=""en""> <head> <meta ...",https://en.itu.dk/programmes/msc-programmes/so...
1,"<!DOCTYPE html><html lang=""en""><head><title>Th...",https://www.forbes.com/sites/phillewis1/2020/0...
2,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr""\n ...",https://www.truity.com/career-profile/graphic-...
3,"<!DOCTYPE html>\n<html class=""client-nojs"" lan...",https://www.wikihow.com/Think-Laterally
4,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",https://participaction.wordpress.com/2008/05/2...


In [1093]:
len(raw_document)

6

In [1161]:
document_topic = raw_document.merge(
    search_result,
    on='url')
td = document_topic # alias to make it concise below
# a document can be associated with multiple topics
td['topic_list'] = td.apply(lambda x: list(td[td['url'] == x['url']]['topic'].unique()), axis=1)

In [1162]:
document_topic.head()

Unnamed: 0,content,url,dateLastCrawled,name,snippet,topic,topic_list
0,"<!DOCTYPE html> <html lang=""en""> <head> <meta ...",https://en.itu.dk/programmes/msc-programmes/so...,2020-04-16T22:49:00.0000000Z,Software Design - ITU,Software Design is developed in close cooper...,software design,[software design]
1,"<!DOCTYPE html><html lang=""en""><head><title>Th...",https://www.forbes.com/sites/phillewis1/2020/0...,2020-04-17T22:10:00.0000000Z,The Most Valuable Skill In Difficult Times Is ...,Lateral thinking is the essence of creativit...,lateral thinking,[lateral thinking]
2,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr""\n ...",https://www.truity.com/career-profile/graphic-...,2020-04-16T01:53:00.0000000Z,Graphic Designer Career Profile | Job Descri...,Graphic design is important in the sales and...,graphic design,[graphic design]
3,"<!DOCTYPE html>\n<html class=""client-nojs"" lan...",https://www.wikihow.com/Think-Laterally,2020-04-16T16:06:00.0000000Z,How to Think Laterally: 13 Steps (with Pictu...,A key component in lateral thinking is curio...,lateral thinking,[lateral thinking]
4,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",https://participaction.wordpress.com/2008/05/2...,2020-04-17T08:13:00.0000000Z,Participatory research: Power and Proble...,Getting to grips with “what is participatory ...,participatory action research,[participatory action research]


In [1163]:
len(document_topic)

6

In [1335]:
document_topic.to_csv(table('document_topic.csv'))

Consider doing more feature engineering, such as Unix epoch time, domain and TLD, number of topic nearest neighbors

## $$\bigtriangledown$$
## <center>Document-Topic Model
## $$\bigtriangleup$$

In [260]:
!pip install gensim



In [1097]:
from gensim.utils import simple_preprocess
from gensim.models import doc2vec

In [1098]:
def get_better_text(document):
    soup = b(document)
    for tag in soup(["script", "style"]):
        tag.extract()
    return soup.get_text() #.replace('\r', '\n')

In [1164]:
def to_tokens(documents, index=None): 
    for idx, document in enumerate(documents):
        try:
            chars = str(get_better_text(document).encode())
            nchars = len(chars)
            frac_foreign_chars = chars.count('\\x')/nchars
            if frac_foreign_chars > 0.05:
                continue
        except (AttributeError, UnicodeDecodeError):
            continue
        tokens = simple_preprocess(get_better_text(document))
        value = idx
        if index is not None:
            value = index[idx]
        yield idx, doc2vec.TaggedDocument(tokens, value)

In [1165]:
# build training examples one by one, discarding malformed and non-English
n_train = 100 # maximum number of training examples
documents = document_topic['content'][:n_train]
index = document_topic['topic_list'][:n_train]

url_idx = []
train = []
for idx, val in enumerate(to_tokens(documents, index)):
    url_idx_, train_ = val
    url_idx.append(url_idx_)
    train.append(train_)

In [1166]:
# proceed only with the subset of documents that could be parsed
document_topic = document_topic.iloc[:, url_idx]
document_topic['train'] = train

In [1167]:
document_topic['n_tokens'] = document_topic['train'].apply(lambda training_example: len(training_example[0]))

In [1170]:
n_topics = len(set(document_topic['topic']))
n_tokens = document_topic['n_tokens'].sum()
n_documents = len(document_topic)
print(f'{n_topics} topics in corpus')
print(f'{n_documents} documents in corpus')
print(f'{n_tokens} tokens in corpus')

5 topics in corpus
6 documents in corpus
12419 tokens in corpus


In [1171]:
model = doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train)

In [1172]:
model.train(train, total_examples=model.corpus_count, epochs=model.epochs)

In [1173]:
model.save(table('doc2vec.pkl'))

$$\bigtriangledown$$
## <center> Model Inference
$$\bigtriangleup$$

In [1477]:
from ipywidgets import Button, HBox, interact
from IPython.display import display

In [1177]:
n_model_topics = len(model.docvecs.offset2doctag)
print(f'{n_model_topics} topics in model')

5 topics in model


In [1178]:
ranks = []
second_ranks = []
for document in train:
    inferred_vector = model.infer_vector(document.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    top, _ = sims[0]
    rank = top in document.tags
    ranks.append(rank)
    second_ranks.append(sims[1])

In [1179]:
correct_topic  = 100*sum(ranks)/len(test)
print(f'{correct_topic}% of documents indexed with at least one correct topic')

100.0% of documents indexed with at least one correct topic


In [1180]:
# associate doc2vec document vector with each parseable document
document_topic['inferred_vector'] = document_topic['train'].apply(lambda x: model.infer_vector(x.words))

In [1246]:
document_topic = document_topic.reset_index(drop=True)

### <center> Similarity Metrics
$d_i$ is the ith document vector

$t_i$ is the learned topic of the ith document and it doubles as for the vector of that topic

$d_j(t_j=t_i) = d_j$ if the topic of the jth document is the same as the topic of the ith document, otherwise it is the zero vector

Define possible sorting metrics:
- cross similarity: $C_{i,t_i} = \sum_{i\neq j}d_i\cdot d_j(t_j=t_i)$

- topic similarity: $T_{i, t_{i}}=d_i\cdot t_i$

- product similarity: $P_{i,t_i}=C_{i,t_i}T_{i,t_i}$

In [1289]:
def cross_similarities(vector, norm=True):
    n = len(vector)
    sums = [0.0 for _ in range(n)] # pd.Series(n) doesn't work??
    size = n*n - n
    sims = pd.Series(size+1)
    for row, rv in enumerate(vector):
        for col, cv in enumerate(vector[row:]):
            col += row
            sim = rv.dot(cv)
            sims[row*col+row] = sim
            sums[row] += sim
            sums[col] += sim
    sums = pd.Series(sums)
    if norm:
        sims /= sims.max()
        sums /= sums.max()
    return sims, sums.copy() 

In [1427]:
cross_similarity = pd.DataFrame()
# computing norms over each topic so makes sense to loop over topics
for topic, df in document_topic.groupby('topic'):
    # find topic vector
    topic_idx = model.docvecs.offset2doctag.index(topic)
    # dot topic vector with each document vector
    document_topic.loc[df.index, 'topic_similarity'] = document_topic.loc[
        df.index, 'inferred_vector'].apply(lambda x: x.dot(model.docvecs[topic_idx]))
    document_topic.loc[df.index, 'topic_similarity'] /= document_topic.loc[df.index, 'topic_similarity'].max()
    # get lower triangle of similarities and write cross similarity sum to database table
    topic_cross_similarity, document_topic.loc[df.index, 'cross_similarity'] = cross_similarities(
        df['inferred_vector'])
    # might as well get a dataframe of lower triangles
    cross_similarity = cross_similarity.append(
        pd.DataFrame({'topic': topic, 'cross_similarity': topic_cross_similarity}))
# a metric that balances similarity to topic and to all other documents
document_topic['product_similarity'] = document_topic['cross_similarity']*document_topic['topic_similarity']

In [1473]:
def plot_similarities(df):
    _, ax = plt.subplots()
    keys = ['topic_similarity', 'cross_similarity', 'product_similarity']
    for key in keys:
        df[key].hist(ax=ax)
    ax.set_xlim(0, 1)
    ax.legend(keys)

In [1474]:
def interactive_similarities(topic):
    plot_similarities(document_topic.groupby('topic').get_group(topic))

In [1476]:
topics = document_topic['topic'].unique()
interact(interactive_similarities, topic=topics);

interactive(children=(Dropdown(description='topic', options=('software design', 'lateral thinking', 'graphic d…