In [2]:
!pip install jupyterthemes
!jt -t chesterish

In [1]:
from requests import get
from json import loads
from urllib.parse import unquote
import pandas as pd

## Topic Crawler

### Query DuckDuckGo REST API for topics
### Follow related topics
### Filter and normalize topics

In [2]:
topic_url_format = 'https://api.duckduckgo.com/?q={search_term}&format=json&pretty=1'

In [3]:
topic_seed = ['user experience design']

In [4]:
from time import sleep
topic_df = pd.DataFrame()
related_topic_df = pd.DataFrame()
for depth in range(2):
    if not depth:
        topics = topic_seed
    related_topics = []
    topic_rows = []
    related_topic_rows = []
    for topic in topics:
        try:
            print(topic)
            sleep(0.1)
            response = get(topic_url_format.format(search_term=topic))
            content = loads(response.content)
            data = {}
            data['topic'] = content['Heading'].lower()
            data['abstract'] = content['Abstract']
            data['document_type'] = content['Type']
            data['topic_wiki_url'] = content['AbstractURL']
            topic_rows.append(data)   

            for result in content['RelatedTopics']:
                row = {}
                row['depth'] = depth
                row['related_topic'] = unquote(result['FirstURL'].replace(
                    'https://duckduckgo.com/', '').replace('c/', '').replace('_', ' ')).lower()
                row['related_topic_text'] = result['Text']
                row.update(data)
                related_topic_rows.append(row)
                related_topics.append(row['related_topic'])
        except KeyError as e:
            print('Error on topic: ' + topic + ' : ' + str(e))
    topics = related_topics
    topic_df = topic_df.append(pd.DataFrame(topic_rows))
    related_topic_df = related_topic_df.append(pd.DataFrame(related_topic_rows))

user experience design
action research
activity-centered design
customer experience
design thinking
paper prototyping
participatory design
web design
software development process
user interfaces
human–computer interaction
design
Error on topic: design : 'FirstURL'


In [33]:
related_topic_df.head()

Unnamed: 0,abstract,depth,document_type,related_topic,related_topic_text,topic,topic_wiki_url
0,User experience design is the process of manip...,0,A,action research,Action research - Action research is a philoso...,user experience design,https://en.wikipedia.org/wiki/User_experience_...
1,User experience design is the process of manip...,0,A,activity-centered design,Activity-centered design - Activity-centered d...,user experience design,https://en.wikipedia.org/wiki/User_experience_...
2,User experience design is the process of manip...,0,A,customer experience,"Customer experience - In commerce, customer ex...",user experience design,https://en.wikipedia.org/wiki/User_experience_...
3,User experience design is the process of manip...,0,A,design thinking,Design thinking - Design thinking refers to th...,user experience design,https://en.wikipedia.org/wiki/User_experience_...
4,User experience design is the process of manip...,0,A,paper prototyping,Paper prototyping - In human–computer interact...,user experience design,https://en.wikipedia.org/wiki/User_experience_...


In [6]:
related_topic_df.to_csv('topic_database.csv')

In [7]:
### Write topics to graph JSON for d3

## Search Engine Crawler

In [9]:
from os import environ
# environ['BING_API_KEY'] = 'remove comment, replace with your API key, run the cell, remove key, add comment'
bing_api_key = environ['BING_API_KEY']

In [10]:
search_seed = set(related_topic_df['related_topic'].unique())
search_seed.update(set(related_topic_df['topic'].unique()))
len(search_seed)

71

In [11]:
def search(search_term):
    search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": bing_api_key}
    params = {
        "q": search_term, 
        "textDecorations": True, 
        "textFormat": "Raw", 
        "responseFilter": "webpages", 
        "count": 50}
    sleep(0.4)
    response = get(search_url, headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()
    return search_results

In [28]:
def filter_search_results(raw_search_results):
    keys = ['name', 'url', 'snippet', 'dateLastCrawled']
    for result in raw_search_results:
        topic = result['queryContext']['originalQuery']
        for page in result['webPages']['value']:
            row = {key: page[key] for key in keys}
            row['topic'] = topic
            yield row

In [141]:
raw_search_results = [search(search_term=search_term) for search_term in list(search_seed)[:2]]

In [142]:
search_results = pd.DataFrame(filter_search_results(raw_search_results))

In [143]:
search_results.head()

Unnamed: 0,dateLastCrawled,name,snippet,topic,url
0,2020-04-12T18:06:00.0000000Z,Lateral thinking - Wikipedia,Lateral thinking is a manner of solving prob...,lateral thinking,https://en.wikipedia.org/wiki/Lateral_thinking
1,2020-04-08T15:21:00.0000000Z,Lateral Thinking - How can Lateral Thinking...,Lateral Thinking Lateral Thinking is the S...,lateral thinking,https://www.edwddebono.com/lateral-thinking
2,2020-04-13T03:14:00.0000000Z,Lateral Thinking Puzzles - Preconceptions,Lateral thinking puzzles that challenge your...,lateral thinking,http://www.folj.com/lateral/
3,2020-04-13T06:39:00.0000000Z,How to Practice Lateral Thinking - ThoughtCo,Lateral thinking involves looking at a situa...,lateral thinking,https://www.thoughtco.com/lateral-thinking-185...
4,2020-04-13T01:05:00.0000000Z,Lateral Thinking,Welcome to Computer Assisted Thinking or CAT...,lateral thinking,https://www.lateralthinking.com/


## Search Result Crawler

In [78]:
!pip install beautifulsoup4



In [79]:
from bs4 import BeautifulSoup as b

In [133]:
def depth_crawl(seed_urls, depth=1, max_pages=5):
    visited = set()
    pages = 0
    for d in range(depth):
        print(seed_urls)
        for url in seed_urls:
            url.replace('http:', 'https:')
            sleep(0.5)
            print(url)
            try:
                print(visited)
                visited.add(url)
                response = get(url)
                response.raise_for_status()
                content = response.content
                yield {'url': url, 'content': content}
                pages += 1
                print(pages)
                if pages > max_pages:
                    return
                soup = b(content)
                links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith('https')]
                n_links = len(links)
                print(f'{n_links} new links from URL {url}')
                new_links = []
                for link in links:
                    if link not in visited:
                        seed_urls.append(link)
            except Exception as e:
                print('Error: ' + str(e) + ' on URL: ' + str(url))

In [134]:
list(search_results['url'][-2:].values)

['https://www.bluecorona.com/internet-marketing-services/',
 'http://www.chilli-marketing.com/']

In [144]:
documents = [document for document in depth_crawl(list(search_results['url'][-2:].values))]

['https://www.marketingprofs.com/topic/articles/services-marketing', 'https://www.ebsglobal.net/documents/course-tasters/english/pdf/h17se-bk-taster.pdf']
https://www.marketingprofs.com/topic/articles/services-marketing
set()
1
18 new links from URL https://www.marketingprofs.com/topic/articles/services-marketing
https://www.ebsglobal.net/documents/course-tasters/english/pdf/h17se-bk-taster.pdf
{'https://www.marketingprofs.com/topic/articles/services-marketing'}
2


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


0 new links from URL https://www.ebsglobal.net/documents/course-tasters/english/pdf/h17se-bk-taster.pdf
https://www.marketingprofs.com/articles/2017/32715/the-biggest-leakage-in-a-saas-marketing-funnel-and-how-to-fix-it
{'https://www.marketingprofs.com/topic/articles/services-marketing', 'https://www.ebsglobal.net/documents/course-tasters/english/pdf/h17se-bk-taster.pdf'}
3
15 new links from URL https://www.marketingprofs.com/articles/2017/32715/the-biggest-leakage-in-a-saas-marketing-funnel-and-how-to-fix-it
https://www.marketingprofs.com/short-articles/2288/five-ways-to-search-optimize-your-images
{'https://www.marketingprofs.com/articles/2017/32715/the-biggest-leakage-in-a-saas-marketing-funnel-and-how-to-fix-it', 'https://www.marketingprofs.com/topic/articles/services-marketing', 'https://www.ebsglobal.net/documents/course-tasters/english/pdf/h17se-bk-taster.pdf'}
4
112 new links from URL https://www.marketingprofs.com/short-articles/2288/five-ways-to-search-optimize-your-images
ht

In [137]:
len(documents)

6

In [157]:
soup = b(documents[3]['content'])
for script in soup(["script", "style"]):
    script.extract()    # rip it out
document_text = soup.get_text().replace('\r', '\n').split('. ')

In [158]:
document_text

["\n\n\n\nSearch Engine Marketing - Five Ways to Search-Optimize Your Images : MarketingProfs Article\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMy Cart (0)\n\nMember Login | About Us | Become a Member | Contact Us\n\n\n\n\n\n\nOur Approach\nCourses\nLearning Paths\nEvents\nResources\nJoin Now\nLogin\n\n\n\nTopicsArticlesPodcastsOnline SeminarsTutorialsGuides and ReportsTools\n\n\nReal-World Education for Modern Marketers\nJoin Over 600,000 Marketing Professionals\n\nStart here!\n\n\n\n\n\n\n\n\n\n\n\nEmail\n\n\nPrint\n\n\nSave\n\n\n\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tText:\xa0\xa0A\xa0A\n\n\n\n\n\n\nN\n\nE\n\nX\n\nT\n\n\n\nFive Ways to Search-Optimize Your Images\n\n\n\n                    May 9, 2011\xa0\xa0 \n\n\t\t\t \n\n\n\n\n\n\n\nTweet\n\n\n\n\n\n\n\n\n\n\n\n\n\nEmail\n\n\n\n\n\n\n\nTop\n\n\n\n\n\nEver wonder how much meaningful t

In [40]:
aa = [1]
aa.extend([2])
aa

[1, 2]

In [None]:
pages = pages.reset_index(drop=True)
pages

In [None]:
document = get(pages['url'][0])

In [None]:
document.content

In [None]:
soup = b(document.content)

In [None]:
for script in soup(["script", "style"]):
    script.extract()    # rip it out
document_text = soup.get_text().replace('\r', '\n').split('. ')

In [None]:
document_text

In [None]:
len(document_text)

In [None]:
!pip install gensim

In [None]:
from gensim.utils import simple_preprocess
from gensim.models import doc2vec
def to_tokens(document_text, train=True): 
    for idx, line in enumerate(document_text):
        tokens = simple_preprocess(line)
        if train:
            yield doc2vec.TaggedDocument(tokens, [idx])
        else:
            yield tokens

In [None]:
train = list(to_tokens(document_text))
test = list(to_tokens(document_text, train=False))

In [None]:
train[:4]

In [None]:
model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40)
model.build_vocab(train)

In [None]:
model.train(train, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
ranks = []
second_ranks = []
for doc_id in range(len(train)):
    inferred_vector = model.infer_vector(train[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [None]:
len(model.docvecs)

In [None]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train[sims[index][0]].words)))

In [None]:
import random
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test) - 1)
inferred_vector = model.infer_vector(test[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train[sims[index][0]].words)))