# Natural Language Processing

## Material

- https://arxiv.org/pdf/1805.03818.pdf
- https://cs.stanford.edu/people/chrismre/papers/dd.pdf
- https://cs.stanford.edu/people/chrismre/papers/deepdive_highlight.pdf
- https://arxiv.org/pdf/1711.10160.pdf

## Setup

In [None]:
%%bash
# Download and unpack Stanford CoreNLP into the home directory, then install
# the Python packages the notebook imports.
# Stop at the first failing command so a broken download is never unzipped.
set -euo pipefail

cd ~/
# Only download when the archive is missing, so the cell can be re-run.
# The file is written under a temporary name and renamed on success, so an
# interrupted transfer is not mistaken for a complete archive.
if [ ! -f stanford-corenlp-full-2018-10-05.zip ]; then
    curl -fL -o stanford-corenlp-full-2018-10-05.zip.part \
        http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
    mv stanford-corenlp-full-2018-10-05.zip.part stanford-corenlp-full-2018-10-05.zip
fi
# -n: never overwrite existing files, which avoids unzip's interactive
# "replace?" prompt when the archive was already extracted.
unzip -n stanford-corenlp-full-2018-10-05.zip
pip install stanfordnlp warc3-wet bs4

## CoreNLP

In [22]:
from os import environ
from pprint import pprint
from stanfordnlp.server import CoreNLPClient

# Point CORENLP_HOME at the CoreNLP distribution unpacked in the home directory
# (presumably what CoreNLPClient uses to locate the server jars — confirm).
home_dir = environ["HOME"]
environ['CORENLP_HOME'] = f'{home_dir}/stanford-corenlp-full-2018-10-05/'

### Setup Parsing Pipeline

In [35]:
# Request only the 'parse' annotator for this pipeline.
annot = ['parse']

# Client settings gathered in one place; the values are passed through unchanged.
# NOTE(review): the unit of `timeout` is not visible here — confirm against the client docs.
client_options = dict(annotators=annot, timeout=3000, memory='16G')
nlp = CoreNLPClient(**client_options)

### Parse Sentence and Print Tree 

In [80]:
def prpr(t, v=False):
    """Convert a parse tree into nested Python lists suitable for pprint.

    Args:
        t: Tree node exposing ``value`` and an iterable ``child`` of sub-nodes.
        v: When true, every node that has children is rendered as
            ``[value, children]`` so its label is kept. When false (the
            default) only the nested children are returned.

    Returns:
        ``t.value`` for a node without children, otherwise a (nested) list.
    """
    if not t.child:
        return t.value
    # Forward `v` so labels are kept at every depth, not only at the root.
    children = [prpr(c, v) for c in t.child]
    return [t.value, children] if v else children

In [85]:
# Register a named property set; 'parse.kbest' presumably asks the parser for
# its 10 best candidate trees — confirm against the CoreNLP parser options.
nlp.register_properties_key('multiparse', {'parse.kbest': 10})

sentence_text = 'I went to the store to get ice cream.'
doc = nlp.annotate(sentence_text, properties_key='multiparse')

# Show the first of the k-best trees of the first sentence as nested lists.
first_tree = doc.sentence[0].kBestParseTrees[0]
pprint(prpr(first_tree))

[[[['I']],
  [['went'],
   [['to'], [['the'], ['store']]],
   [[['to'], [['get'], [['ice'], ['cream']]]]]],
  ['.']]]


## Common Crawl

In [283]:
import gzip
import http
import http.client
import io
import json

import bs4
import requests
import warc

In [261]:
# Common Crawl download endpoint and the crawl snapshot to read from.
cc_base = 'https://commoncrawl.s3.amazonaws.com'
cc_path = 'crawl-data/CC-MAIN-2019-47'
# Derived from cc_path so that switching crawls only needs one edit.
cc_idx = f'{cc_path}/cc-index.paths.gz'
cc_url = f'{cc_base}/{cc_path}'

### List WARC files

In [262]:
# Fetch the gzip-compressed list of WARC file paths for the crawl.
res = requests.get(f'{cc_url}/warc.paths.gz')
# Fail here on an HTTP error instead of handing an error page to gzip.
res.raise_for_status()
# splitlines() does not produce the empty last entry that split('\n') leaves
# behind when the file ends with a newline.
paths = gzip.decompress(res.content).decode('utf8').splitlines()

### List Crawled URLs

In [272]:
# Fetch the gzip-compressed list of index shard paths for the crawl.
res = requests.get(f'{cc_base}/{cc_idx}')
# Fail here on an HTTP error instead of handing an error page to gzip.
res.raise_for_status()
# splitlines() does not produce the empty last entry that split('\n') leaves
# behind when the file ends with a newline.
shards = gzip.decompress(res.content).decode('utf8').splitlines()

# Download the second listed shard and split it into one entry per line.
res = requests.get(f'{cc_base}/{shards[1]}')
res.raise_for_status()
urls = gzip.decompress(res.content).decode('utf8').splitlines()

### Map crawl records

In [285]:
# An index line is split on its first two spaces into three fields; the third
# field is decoded as JSON.
first_entry = urls[0]
url, timestamp, data = first_entry.split(' ', maxsplit=2)
crawled_at = json.loads(data)
crawled_at

### Read records from WARC archive

In [287]:
# Download the first listed WARC file, decompress it in memory and wrap the
# bytes in a file object for the WARC reader.
res = requests.get(f'{cc_base}/{paths[0]}')
# Fail here on an HTTP error instead of handing an error page to gzip.
res.raise_for_status()
arc = warc.warc.WARCFile(
    fileobj=io.BytesIO(gzip.decompress(res.content))
)

In [246]:
# Take the first record of the archive and read its WARC-Target-URI header.
records = iter(arc)
record = next(records)
url = record['WARC-Target-URI']

### Parse records

In [None]:
# The payload is consumed sequentially: the header parser reads from the start
# of the stream and the HTML parser receives whatever is left afterwards.
# NOTE(review): if the payload starts with an HTTP status line, parse_headers
# does not consume it as a status line — confirm the headers come out as expected.
headers = http.client.parse_headers(record.payload)
body = bs4.BeautifulSoup(record.payload.read(), 'html.parser')
# href=True restricts the match to anchors that carry an href attribute, so
# the list holds no None entries for anchors without one.
links = [link.get('href') for link in body.find_all('a', href=True)]
text = body.text

## Bing API

Get an API key from <https://azure.microsoft.com/en-us/services/cognitive-services/bing-web-search-api/>.

In [298]:
# The key is read from a local file so it never appears in a notebook cell.
# The context manager closes the file handle as soon as the key is read.
with open('.bingAPIKey') as key_file:
    BING_API_KEY = key_file.read().strip()
BING_SEARCH_ENDPOINT = 'https://api.cognitive.microsoft.com/bing/v7.0/search'

In [321]:
# The inner double quotes are part of the query string that is sent.
query = '"powered by phpbb"'

# NOTE(review): confirm that the API honours count=100; its documented maximum
# may be lower.
search_params = {'q': query, 'count': 100, 'offset': 0}
auth_headers = {'Ocp-Apim-Subscription-Key': BING_API_KEY}

results = requests.get(
    BING_SEARCH_ENDPOINT,
    params=search_params,
    headers=auth_headers,
)

In [322]:
res = results.json()
# Fall back to an empty answer when the response has no 'webPages' section, so
# a query without web results yields empty values instead of an AttributeError.
web_pages = res.get('webPages', {})
pages = web_pages.get('value', [])
sites = [page.get('url') for page in pages]
# The estimated match count belongs to the 'webPages' answer, not to the
# top-level response object.
total = web_pages.get('totalEstimatedMatches')

## Reddit API

### Find subreddits

In [326]:
# Path of the subreddit search endpoint, relative to the API root below.
sub_search = 'subreddits/search'
# Root URL that every Reddit request in this notebook is built from.
reddit_base = 'https://api.reddit.com'

In [329]:
# Search subreddits for the query term.
# NOTE(review): no User-Agent header is set; confirm the API does not throttle
# the default client identification.
query = 'hexadecimal'
search_url = f'{reddit_base}/{sub_search}'
results = requests.get(search_url, params={'q': query})

In [334]:
# Replace the HTTP response with the list of entries nested under
# data -> children, then collect the display name of each entry.
listing = results.json().get('data')
results = listing.get('children')
subreddits = [entry.get('data').get('display_name') for entry in results]

#### Get top posts

In [365]:
# Request the 'top' listing of the chosen subreddit.
subreddit = 'scifi'
top_url = f'{reddit_base}/r/{subreddit}/top'
results = requests.get(top_url)

In [366]:
posts = results.json().get('data').get('children')


def transform(post):
    """Keep only the title, id, subreddit, creation time and score of a post.

    `post` is the 'data' mapping of a listing entry; keys it lacks come back
    as None.
    """
    source_keys = {
        'name': 'title',
        'id': 'id',
        'subreddit': 'subreddit',
        'created': 'created_utc',
        'score': 'score',
    }
    return {field: post.get(key) for field, key in source_keys.items()}


posts = [transform(post.get('data')) for post in posts]

#### Get comment tree

Use the series of comment timestamps to [predict popular posts](https://snikolov.wordpress.com/2012/11/14/early-detection-of-twitter-trends/)

In [491]:
# Fetch the comment page of the first post collected above.
results = requests.get(
    f'{reddit_base}/r/{subreddit}/comments/{posts[0].get("id")}'
)

# Walk the response breadth-first: every node is a {'kind', 'data'} wrapper.
# 't1' nodes are recorded as comments; nested nodes are appended to the queue.
comments = []
queue = results.json()

while queue:
    current = queue.pop(0)
    kind = current.get('kind')
    current = current.get('data')

    if kind == 't1':
        comments.append({
            'created': current.get('created_utc'),
            'article': current.get('link_id'),
            'name': current.get('name'),
            'parent': current.get('parent_id')
        })

    # Listings keep their members under 'children'. 'more' placeholders reuse
    # that key for a list of bare ID strings, which have no 'kind'/'data' and
    # would crash the walk, so only dict children are queued.
    queue += [
        child for child in current.get('children', [])
        if isinstance(child, dict)
    ]

    # 'replies' is only followed when it is a nested listing (a dict); any
    # other value, such as an empty string, is ignored.
    replies = current.get('replies')
    if isinstance(replies, dict):
        queue += replies.get('data', {}).get('children', [])
