TODOs:

- Rewrite the Pipeline class so that it saves each task's output to a file. This will allow you to "checkpoint" tasks so they don't have to be run twice.
- Use the nltk package for more advanced natural language processing tasks.
- Convert to a CSV before filtering, so you can keep all the stories from 2014 in a raw file.
- Fetch the data directly from the Hacker News JSON API instead of reading from the file we gave you, so you can perform additional data processing on newer data.

In [1]:
import json
from pipeline import Pipeline, build_csv
from datetime import datetime
import io
import csv
import string
from stop_words import stop_words
import operator

# Pipeline instance; the functions below are registered on it through the
# @pipeline.task(...) decorator.
pipeline = Pipeline()

@pipeline.task()
def file_to_json():
    """Load ``hn_stories_2014.json`` and return its ``'stories'`` entry.

    The file is read from the current working directory and parsed as JSON.

    Returns:
        The object stored under the top-level ``'stories'`` key.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the parsed document has no ``'stories'`` key.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous version left the file open.
    with open('hn_stories_2014.json', 'r') as f:
        # json.load parses the file contents into Python objects.
        data = json.load(f)

    return data['stories']

@pipeline.task(depends_on=filter_stories if False else file_to_json)
def filter_stories(stories):
    """Lazily select the stories that pass the popularity filter.

    A story is kept when it has more than 50 points, more than one comment,
    and a title that does not start with ``'Ask HN'``.

    Returns:
        A generator over the matching stories, in their original order.
    """
    def _passes(entry):
        # Checks run in the same order as a chained ``and`` and stop at the
        # first failure, so later keys are only read when needed.
        if not entry['points'] > 50:
            return False
        if not entry['num_comments'] > 1:
            return False
        return not entry['title'].startswith('Ask HN')

    return (entry for entry in stories if _passes(entry))

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    """Flatten story dicts into rows and pass them to ``build_csv``.

    Each story contributes one ``(objectID, created_at, url, points, title)``
    tuple, where ``created_at`` is parsed from its
    ``YYYY-MM-DDTHH:MM:SSZ`` string into a ``datetime``.

    Returns:
        Whatever ``build_csv`` returns when given the rows, the header and a
        fresh in-memory ``io.StringIO`` as its ``file`` argument.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    columns = ['objectID', 'created_at', 'url', 'points', 'title']

    rows = [
        (
            item['objectID'],
            datetime.strptime(item['created_at'], timestamp_format),
            item['url'],
            item['points'],
            item['title'],
        )
        for item in stories
    ]

    return build_csv(rows, columns, file=io.StringIO())

@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    """Return a generator over the ``title`` column of a CSV source.

    ``csv_file`` is handed to ``csv.reader``. The header row is consumed
    immediately to locate the ``'title'`` column; the remaining rows are read
    lazily as the returned generator is iterated.

    Raises:
        StopIteration: if the source has no header row.
        ValueError: if the header has no ``'title'`` column.
    """
    rows = csv.reader(csv_file)
    title_column = next(rows).index('title')
    return (row[title_column] for row in rows)


@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Strip punctuation from each title and lower-case it, lazily.

    The characters removed are ``string.punctuation`` plus the curly single
    quotes ``‘`` and ``’``.

    Returns:
        A generator yielding one cleaned string per input title.
    """
    unwanted = string.punctuation + '‘' + '’'
    # Three-argument maketrans: every character in the third argument is
    # mapped to None, i.e. deleted by translate().
    deletion_table = str.maketrans('', '', unwanted)
    return (title.translate(deletion_table).lower() for title in titles)

@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(clean_titles):
    """Count occurrences of every non-stop word across the titles.

    Each title is split on whitespace; words found in ``stop_words`` are
    skipped.

    Returns:
        A dict mapping word -> number of occurrences, with keys in order of
        first appearance.
    """
    tally = {}

    for title in clean_titles:
        for token in title.split():
            if token in stop_words:
                continue
            tally[token] = tally.get(token, 0) + 1

    return tally

@pipeline.task(depends_on=build_keyword_dictionary)
def top_100_words_used_in_titles(word_count):
    """Return the 100 highest-count ``(word, count)`` pairs.

    Pairs are ordered by count, largest first. The sort is stable, so words
    with equal counts keep the order in which they appear in ``word_count``.
    """
    def _count_of(pair):
        return pair[1]

    ranked = list(word_count.items())
    ranked.sort(key=_count_of, reverse=True)
    return ranked[:100]
    
            
# Run the registered tasks; the result is indexed by task function, as the
# lookup on the next line shows.
output = pipeline.run()
print(output[top_100_words_used_in_titles])
print() 


[('new', 185), ('google', 167), ('bitcoin', 101), ('open', 92), ('programming', 90), ('web', 89), ('data', 85), ('video', 79), ('python', 76), ('code', 72), ('released', 71), ('using', 71), ('facebook', 71), ('2013', 65), ('javascript', 65), ('source', 64), ('free', 64), ('internet', 63), ('game', 63), ('microsoft', 59), ('c', 59), ('linux', 58), ('app', 57), ('pdf', 55), ('language', 54), ('dont', 54), ('work', 54), ('2014', 52), ('software', 52), ('startup', 51), ('make', 50), ('apple', 50), ('use', 50), ('yc', 48), ('security', 48), ('time', 48), ('github', 45), ('nsa', 45), ('windows', 44), ('way', 41), ('1', 41), ('world', 41), ('like', 41), ('project', 40), ('computer', 40), ('heartbleed', 40), ('users', 37), ('design', 37), ('ios', 37), ('git', 37), ('ceo', 36), ('developer', 36), ('life', 36), ('os', 36), ('vs', 36), ('twitter', 36), ('big', 35), ('day', 35), ('android', 34), ('online', 34), ('court', 33), ('simple', 33), ('years', 33), ('browser', 32), ('mt', 32), ('api', 32),

The code below is for test purposes:

In [4]:
def file_to_json():
    """Load ``hn_stories_2014.json`` and return its ``'stories'`` entry.

    The file is read from the current working directory and parsed as JSON.

    Returns:
        The object stored under the top-level ``'stories'`` key.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the parsed document has no ``'stories'`` key.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous version left the file open.
    with open('hn_stories_2014.json', 'r') as f:
        # json.load parses the file contents into Python objects.
        data = json.load(f)

    return data['stories']

def filter_stories(stories):
    """Lazily select the stories that pass the popularity filter.

    A story is kept when it has more than 50 points, more than one comment,
    and a title that does not start with ``'Ask HN'``.

    Returns:
        A generator over the matching stories, in their original order.
    """
    def _passes(entry):
        # Checks run in the same order as a chained ``and`` and stop at the
        # first failure, so later keys are only read when needed.
        if not entry['points'] > 50:
            return False
        if not entry['num_comments'] > 1:
            return False
        return not entry['title'].startswith('Ask HN')

    return (entry for entry in stories if _passes(entry))

def json_to_csv(stories):
    """Flatten story dicts into rows and pass them to ``build_csv``.

    Each story contributes one ``(objectID, created_at, url, points, title)``
    tuple, where ``created_at`` is parsed from its
    ``YYYY-MM-DDTHH:MM:SSZ`` string into a ``datetime``.

    Returns:
        Whatever ``build_csv`` returns when given the rows, the header and a
        fresh in-memory ``io.StringIO`` as its ``file`` argument.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    columns = ['objectID', 'created_at', 'url', 'points', 'title']

    rows = [
        (
            item['objectID'],
            datetime.strptime(item['created_at'], timestamp_format),
            item['url'],
            item['points'],
            item['title'],
        )
        for item in stories
    ]

    return build_csv(rows, columns, file=io.StringIO())

def extract_titles(csv_file):
    """Return a generator over the ``title`` column of a CSV source.

    ``csv_file`` is handed to ``csv.reader``. The header row is consumed
    immediately to locate the ``'title'`` column; the remaining rows are read
    lazily as the returned generator is iterated.

    Raises:
        StopIteration: if the source has no header row.
        ValueError: if the header has no ``'title'`` column.
    """
    rows = csv.reader(csv_file)
    title_column = next(rows).index('title')
    return (row[title_column] for row in rows)

def clean_titles(titles):
    """Strip punctuation from each title and lower-case it, lazily.

    The characters removed are ``string.punctuation`` plus the curly single
    quotes ``‘`` and ``’``.

    Returns:
        A generator yielding one cleaned string per input title.
    """
    unwanted = string.punctuation + '‘' + '’'
    # Three-argument maketrans: every character in the third argument is
    # mapped to None, i.e. deleted by translate().
    deletion_table = str.maketrans('', '', unwanted)
    return (title.translate(deletion_table).lower() for title in titles)

def build_keyword_dictionary(clean_titles):
    """Count occurrences of every non-stop word across the titles.

    Each title is split on whitespace; words found in ``stop_words`` are
    skipped.

    Returns:
        A dict mapping word -> number of occurrences, with keys in order of
        first appearance.
    """
    tally = {}

    for title in clean_titles:
        for token in title.split():
            if token in stop_words:
                continue
            tally[token] = tally.get(token, 0) + 1

    return tally

def top_100_words_used_in_titles(word_count):
    """Scratch variant: return the 10 highest-count ``(word, count)`` pairs.

    Despite the name, this test version slices to 10 entries, and it prints
    the type of the sorted result as a debugging aid. Pairs are ordered by
    count, largest first; the sort is stable, so words with equal counts keep
    the order in which they appear in ``word_count``.
    """
    def _count_of(pair):
        return pair[1]

    ranked = list(word_count.items())
    ranked.sort(key=_count_of, reverse=True)

    print(type(ranked))

    return ranked[:10]
    
    
    
# Run the stages by hand, one at a time, without the Pipeline class.
file = file_to_json()
stories = filter_stories(file)
csv_file = json_to_csv(stories)

titles = extract_titles(csv_file)
# Keep the cleaned titles in a list under their own name. clean_titles()
# returns a one-shot generator: binding it back to `clean_titles` shadowed the
# function, and build_keyword_dictionary() then exhausted it, so the preview
# loop at the bottom never printed anything.
cleaned_titles = list(clean_titles(titles))
word_count = build_keyword_dictionary(cleaned_titles)
sorted_words = top_100_words_used_in_titles(word_count)

print(sorted_words)

# Preview the first cleaned title, if there is one.
for t in cleaned_titles[:1]:
    print(t)

# True Goodbye: ‘Using TrueCrypt Is Not Secure’

<class 'list'>
[('new', 185), ('google', 167), ('bitcoin', 101), ('open', 92), ('programming', 90), ('web', 89), ('data', 85), ('video', 79), ('python', 76), ('code', 72)]


In [5]:
from datetime import datetime
# Sample timestamp in the same shape as the stories' 'created_at' values.
created_at = '2014-05-29T04:27:42Z'

# Same format string that json_to_csv uses; the trailing 'Z' is matched as a
# literal character, so no timezone information is attached to the result.
dt = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ")
print(dt)

2014-05-29 04:27:42


In [6]:
# NOTE(review): `test` is not defined in any cell shown here — presumably the
# parsed JSON dump left over from an earlier cell; confirm before re-running.
# Show the first story record.
print((test['stories'][0]))

s = test['stories']

# Check what kind of container holds the stories.
print(type(s))

{'story_id': None, '_highlightResult': {'author': {'matchLevel': 'none', 'matchedWords': [], 'value': 'TuxLyn'}, 'title': {'matchLevel': 'none', 'matchedWords': [], 'value': 'DuckDuckGo Settings'}, 'url': {'matchLevel': 'none', 'matchedWords': [], 'value': 'https://duckduckgo.com/settings'}, 'story_text': {'matchLevel': 'none', 'matchedWords': [], 'value': ''}}, 'title': 'DuckDuckGo Settings', '_tags': ['story', 'author_TuxLyn', 'story_7815290'], 'created_at': '2014-05-29T08:25:40Z', 'story_url': None, 'author': 'TuxLyn', 'created_at_i': 1401351940, 'comment_text': None, 'url': 'https://duckduckgo.com/settings', 'objectID': '7815290', 'story_text': '', 'story_title': None, 'points': 1, 'num_comments': 0, 'parent_id': None}
<class 'list'>


In [26]:
#print(stop_words)

# Sanity check: confirm that stop_words contains the common word 'the'.
if 'the' in stop_words:
    print('yep')

yep
