### Finding the top 100 keywords in Hacker News 2014 with a pipeline

In [1]:
from pipeline import Pipeline, build_csv
from datetime import datetime
import json
import io
import csv
import string
from stop_words import stop_words

In [2]:
pipeline = Pipeline()

In [3]:
@pipeline.task()
def file_to_json():
    with open('hn_stories_2014.json',"r") as f:
        stories = json.load(f)
        return stories["stories"]

@pipeline.task(depends_on = file_to_json)
def filter_stories(stories):
    def is_popular(story):
        return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')
    
    return (
        story for story in stories
        if is_popular(story)
    )
            
@pipeline.task(depends_on = filter_stories)
def json_to_csv(stories):
    lines = []
    for story in stories:
        lines.append((story["objectID"],\
      datetime.strptime(story['created_at']\
                  , "%Y-%m-%dT%H:%M:%SZ"),\
                         story["url"],\
                   story["points"],\
                   story["title"]))
    return build_csv(lines, header=[
        'objectID', 'created_at', 'url',\
        'points','title'],file=io.StringIO())

@pipeline.task(depends_on = json_to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index("title")
    for line in reader:
        yield line[idx]
        
@pipeline.task(depends_on = extract_titles)
def clean_titles(titles):
    for title in titles:
        title = title.lower()
        title = ''.join(c for c in title if c not in string.punctuation)
        yield title

@pipeline.task(depends_on = clean_titles)
def build_keyword_dictionary(titles):
    word_freq = {}
    for title in titles:
        words = title.split(" ")     
        for word in words:
            if word and word not in stop_words:
                if word not in word_freq:
                    word_freq[word] = 1
                word_freq[word] += 1
    return word_freq

@pipeline.task(depends_on = build_keyword_dictionary)
def top_words(word_freq):
    freq_tuple = [
        (word, word_freq[word])
        for word in sorted(word_freq, key=word_freq.get, reverse=True)
    ]
    return freq_tuple[:100]              


In [4]:
ran = pipeline.run()
print(ran[top_words])

[('new', 186), ('google', 168), ('bitcoin', 102), ('open', 93), ('programming', 91), ('web', 89), ('data', 86), ('video', 80), ('python', 76), ('code', 73), ('facebook', 72), ('released', 72), ('using', 71), ('javascript', 66), ('2013', 66), ('free', 65), ('source', 65), ('game', 64), ('internet', 63), ('microsoft', 60), ('c', 60), ('linux', 59), ('app', 58), ('pdf', 56), ('work', 55), ('language', 55), ('software', 53), ('2014', 53), ('startup', 52), ('use', 51), ('apple', 51), ('make', 51), ('security', 49), ('yc', 49), ('time', 49), ('github', 46), ('nsa', 46), ('windows', 45), ('world', 42), ('way', 42), ('like', 42), ('1', 41), ('project', 41), ('computer', 41), ('heartbleed', 41), ('ios', 38), ('git', 38), ('dont', 38), ('users', 38), ('design', 38), ('ceo', 37), ('os', 37), ('twitter', 37), ('developer', 37), ('life', 37), ('vs', 37), ('big', 36), ('day', 36), ('online', 35), ('android', 35), ('years', 34), ('court', 34), ('simple', 34), ('mt', 33), ('api', 33), ('apps', 33), ('

###### check the api for updated data

https://hn.algolia.com/api