# Hacker News Pipeline

This project introduces the field of data engineering by creating a data pipeline. Specifically, the data pipeline retrieves the 100 most populars keywords in the titles of articles in Hacker News. 

Each post has a set of keys. Here is a description of the most relevant keys:
* `created_at` - A timestamp of the story's creation time
* `created_at_i` - A unix epoch timestamp
* `url` - URL of the article's link
* `objectID` - Unique ID of the story
* `author` - The author's username on Hacker News
* `points` - The number of upvotes
* `title` - The title of the story
* `num_comments` - The number of comments for the article on Hacker News

**Objective:** To construct a data pipeline to retrieve the 100 most common keywords of the most popular articles on Hacker News.

**Techniques used:**
* Natural language processing
* JSON, datetime, data pipeline
* Function decorators

In [2]:
from datetime import datetime
import json
import io
import string
import csv

from pipeline import build_csv, Pipeline

pipeline = Pipeline()

@pipeline.task()
def file_to_json():
    with open('hn_stories_2014.json', 'r') as f:
        data = json.load(f)
        stories = data['stories']
    return stories

@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    def is_popular(story):
        return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')
    
    return (
        story for story in stories
        if is_popular(story)
    )

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    lines = []
    for story in stories:
        lines.append(
            (story['objectID'], datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"), story['url'], story['points'], story['title'])
        )
    return build_csv(lines, header=['objectID', 'created_at', 'url', 'points', 'title'], file=io.StringIO())

@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    
    return (line[idx] for line in reader)

@pipeline.task(depends_on=extract_titles)
def clean_title(titles):
    for title in titles:
        title = title.lower()
        title = ''.join(c for c in title if c not in string.punctuation)
        yield title

@pipeline.task(depends_on=clean_title)
def build_keyword_dictionary(titles):
    word_freq = {}
    for title in titles:
        for word in title.split(' '):
            if len(word) > 5:
                if word not in word_freq:
                    word_freq[word] = 1
                word_freq[word] += 1
    return word_freq

@pipeline.task(depends_on=build_keyword_dictionary)
def top_keywords(word_freq):
    freq_tuple = [
        (word, word_freq[word])
        for word in sorted(word_freq, key=word_freq.get, reverse=True)
    ]
    return freq_tuple[:100]

ran = pipeline.run()
print(ran[top_keywords])

[('google', 168), ('bitcoin', 102), ('programming', 91), ('python', 76), ('released', 72), ('facebook', 72), ('javascript', 66), ('source', 65), ('internet', 63), ('microsoft', 60), ('language', 55), ('software', 53), ('startup', 52), ('security', 49), ('github', 46), ('system', 45), ('windows', 45), ('project', 41), ('computer', 41), ('heartbleed', 41), ('should', 38), ('design', 38), ('twitter', 37), ('developer', 37), ('online', 35), ('android', 35), ('simple', 34), ('browser', 33), ('learning', 33), ('firefox', 32), ('mozilla', 32), ('server', 32), ('problem', 32), ('engine', 32), ('introducing', 31), ('amazon', 31), ('better', 30), ('support', 30), ('people', 30), ('million', 30), ('development', 29), ('developers', 28), ('library', 28), ('billion', 28), ('chrome', 28), ('website', 28), ('inside', 28), ('hacker', 27), ('release', 27), ('silicon', 27), ('mobile', 26), ('haskell', 26), ('public', 26), ('service', 26), ('valley', 26), ('science', 26), ('future', 26), ('building', 25)