In [None]:
# Copy hackernews.csv into the current working directory so the cells below
# can open it by its bare file name.
# NOTE(review): the source path looks like a Colab Google Drive mount -- confirm
# the drive is mounted before running this cell.
!cp '/content/drive/MyDrive/Dataset/hackernews.csv'  .

We'll compare Ask HN and Show HN to answer the following questions:

A. Do `Ask HN` or `Show HN` receive more comments on average?

B. Do posts created at a certain time receive more comments on average?

C. Do either `Ask HN` or `Show HN` receive more points?

D. During which hours are the posts more likely to receive higher 
points?


In [None]:
from csv import reader
# Load the whole CSV into memory as a list of rows (each row is a list of
# strings). The `with` block closes the file handle as soon as reading is
# finished instead of leaving it open for the life of the kernel, and
# newline="" is what the csv module recommends so that line breaks inside
# quoted fields are parsed correctly.
with open("hackernews.csv", newline="") as opened_file:
    read_file = reader(opened_file)
    data = list(read_file)
print(data[0:5])

[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]


In order to analyze our data, we'll remove the first row of column headers:


In [None]:
# Peel the header row off the front so that `data` holds only post rows.
headers, data = data[0], data[1:]
print(data[:5])

[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]


In [None]:
# Sort every post into one of three buckets based on how its title starts
# (the comparison ignores case).
ask_posts, show_posts, other_posts = [], [], []
for row in data:
    title = row[1]
    normalized = title.lower()
    if normalized.startswith('ask hn'):
        bucket = ask_posts
    elif normalized.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(row)

print("Total number of ask posts:", len(ask_posts))
print("Total number of show posts:", len(show_posts))
print("Total number of other posts:", len(other_posts))

Total number of ask posts: 1744
Total number of show posts: 1162
Total number of other posts: 17195


In [None]:

# Peek at the first five Ask HN rows.
print(ask_posts[:5])

[['12296411', 'Ask HN: How to improve my personal website?', '', '2', '6', 'ahmedbaracat', '8/16/2016 9:55'], ['10610020', 'Ask HN: Am I the only one outraged by Twitter shutting down share counts?', '', '28', '29', 'tkfx', '11/22/2015 13:43'], ['11610310', 'Ask HN: Aby recent changes to CSS that broke mobile?', '', '1', '1', 'polskibus', '5/2/2016 10:14'], ['12210105', 'Ask HN: Looking for Employee #3 How do I do it?', '', '1', '3', 'sph130', '8/2/2016 14:20'], ['10394168', 'Ask HN: Someone offered to buy my browser extension from me. What now?', '', '28', '17', 'roykolak', '10/15/2015 16:38']]


In [None]:
# Peek at the first five Show HN rows.
print(show_posts[:5])

[['10627194', 'Show HN: Wio Link  ESP8266 Based Web of Things Hardware Development Platform', 'https://iot.seeed.cc', '26', '22', 'kfihihc', '11/25/2015 14:03'], ['10646440', 'Show HN: Something pointless I made', 'http://dn.ht/picklecat/', '747', '102', 'dhotson', '11/29/2015 22:46'], ['11590768', 'Show HN: Shanhu.io, a programming playground powered by e8vm', 'https://shanhu.io', '1', '1', 'h8liu', '4/28/2016 18:05'], ['12178806', 'Show HN: Webscope  Easy way for web developers to communicate with Clients', 'http://webscopeapp.com', '3', '3', 'fastbrick', '7/28/2016 7:11'], ['10872799', 'Show HN: GeoScreenshot  Easily test Geo-IP based web pages', 'https://www.geoscreenshot.com/', '1', '9', 'kpsychwave', '1/9/2016 20:45']]


Determine if ask posts or show posts receive more comments on average


In [None]:

# Comment counts sit in column 4 and are read as strings, so cast each to int.
ask_comments = [int(post[4]) for post in ask_posts]
show_comments = [int(post[4]) for post in show_posts]

# Sanity check: total comments on Ask HN posts and how many such posts exist.
print(sum(ask_comments))
print(len(ask_comments))

# Mean comments per post for each category.
avg_ask_comments = sum(ask_comments) / len(ask_comments)
avg_show_comments = sum(show_comments) / len(show_comments)

print("Average Ask Comments: ", avg_ask_comments)
print("Average Show Comments: ", avg_show_comments)

24483
1744
Average Ask Comments:  14.038417431192661
Average Show Comments:  10.31669535283993


**Our calculation indicates that, on average, Ask HN posts receive more comments than Show HN posts.**
Since Ask HN posts are more likely to receive comments, we'll focus the rest of the comment analysis on these posts.

Determine if there is a certain time ask posts are more likely to attract comments

In [None]:
import datetime as dt
# Pair each Ask HN post's creation timestamp (column 6) with its comment
# count (column 4, converted to int).
result_list = [[row[6], int(row[4])] for row in ask_posts]

result_list[0:12]

[['8/16/2016 9:55', 6],
 ['11/22/2015 13:43', 29],
 ['5/2/2016 10:14', 1],
 ['8/2/2016 14:20', 3],
 ['10/15/2015 16:38', 17],
 ['9/26/2015 23:23', 1],
 ['4/22/2016 12:24', 4],
 ['11/16/2015 9:22', 1],
 ['2/24/2016 17:57', 1],
 ['6/4/2016 17:17', 2],
 ['9/19/2015 17:04', 7],
 ['9/22/2015 13:16', 1]]

**Create frequency tables of the number of posts and the total comments for each hour of the day:**

In [None]:
# Two tallies keyed by the zero-padded hour string ("00".."23"):
#   counts_by_hour   -> number of Ask HN posts created in that hour
#   comments_by_hour -> total comments received by those posts
counts_by_hour = {}
comments_by_hour = {}

for created_at, n_comments in result_list:
    # Parse the "month/day/year hour:minute" timestamp and keep only the hour.
    hr = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    counts_by_hour[hr] = counts_by_hour.get(hr, 0) + 1
    comments_by_hour[hr] = comments_by_hour.get(hr, 0) + int(n_comments)

print(comments_by_hour)
print(counts_by_hour)

{'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}
{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58}


Average number of comments in an hour

We can now use these two dictionaries to calculate the average number of comments for posts created during each hour of the day. Below, we build a list of lists containing each hour in which posts were created and the average number of comments those posts received.

In [None]:
# For every hour, divide the total comments by the number of posts to get
# the average comments per post, stored as [hour, average].
avg_by_hour = [
    [hour, total / counts_by_hour[hour]]
    for hour, total in comments_by_hour.items()
]

avg_by_hour
# Takeaways:
# 1. A list can be built directly from a dictionary.
# 2. Calculations can be performed on the values stored under each key.

[['09', 5.5777777777777775],
 ['13', 14.741176470588234],
 ['10', 13.440677966101696],
 ['14', 13.233644859813085],
 ['16', 16.796296296296298],
 ['23', 7.985294117647059],
 ['12', 9.41095890410959],
 ['17', 11.46],
 ['15', 38.5948275862069],
 ['21', 16.009174311926607],
 ['20', 21.525],
 ['02', 23.810344827586206],
 ['18', 13.20183486238532],
 ['03', 7.796296296296297],
 ['05', 10.08695652173913],
 ['19', 10.8],
 ['01', 11.383333333333333],
 ['22', 6.746478873239437],
 ['08', 10.25],
 ['04', 7.170212765957447],
 ['00', 8.127272727272727],
 ['06', 9.022727272727273],
 ['07', 7.852941176470588],
 ['11', 11.051724137931034]]

We'll swap the elements to display the average by hour below.

In [None]:

# Flip each [hour, average] pair into [average, hour] so the average comes first.
swap_avg_by_hour = [[avg, hour] for hour, avg in avg_by_hour]
print(swap_avg_by_hour)

[[5.5777777777777775, '09'], [14.741176470588234, '13'], [13.440677966101696, '10'], [13.233644859813085, '14'], [16.796296296296298, '16'], [7.985294117647059, '23'], [9.41095890410959, '12'], [11.46, '17'], [38.5948275862069, '15'], [16.009174311926607, '21'], [21.525, '20'], [23.810344827586206, '02'], [13.20183486238532, '18'], [7.796296296296297, '03'], [10.08695652173913, '05'], [10.8, '19'], [11.383333333333333, '01'], [6.746478873239437, '22'], [10.25, '08'], [7.170212765957447, '04'], [8.127272727272727, '00'], [9.022727272727273, '06'], [7.852941176470588, '07'], [11.051724137931034, '11']]


In [None]:
# Top 5 hours for Ask HN comments.

# Sorting the [average, hour] pairs in descending order puts the hours with
# the highest average comment count first.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)
print("Top 5 hours for Ask Posts Comments:")

# Built once, outside the loop. Spelled with explicit "\n" escapes so the
# surrounding blank lines and padding of the printed entry stay exactly as before.
template = "\n    {}: {:.2f} average comments per post    \n    "

# Slicing (rather than indexing 0..4) prints whatever is available when fewer
# than five distinct hours exist, instead of raising IndexError.
for avg, hour in sorted_swap[:5]:
    # Re-format the bare hour ("15") as a clock time ("15:00").
    hr_obj_string = dt.datetime.strptime(hour, "%H").strftime("%H:%M")
    print(template.format(hr_obj_string, avg))

Top 5 hours for Ask Posts Comments:

    15:00: 38.59 average comments per post    
    

    02:00: 23.81 average comments per post    
    

    20:00: 21.52 average comments per post    
    

    16:00: 16.80 average comments per post    
    

    21:00: 16.01 average comments per post    
    


Answer to question B: On average, Ask HN posts created at 15:00 receive the most comments (about 38.59 per post). The times shown are in EST, so adjust them to your own time zone.

In [None]:
# Compare Ask HN and Show HN posts by points (column 3, stored as strings).

# Ask HN: total points and average points per post.
a_points = sum(int(row[3]) for row in ask_posts)
avg_a_points = a_points / len(ask_posts)

# "\n" and the following literal are adjacent string literals, so Python joins
# them into one argument and the "Avg." line starts at column 0.
print("Total Number of points received by all ask posts:", a_points, "\n"
      "Avg. number of points per ask post:", avg_a_points, "\n")

# Show HN: total points and average points per post.
s_points = sum(int(row[3]) for row in show_posts)
avg_s_points = s_points / len(show_posts)

# Same layout as the Ask HN report: no comma after "\n", otherwise print()
# inserts a separator space and the "Avg." line is indented by one character.
print("Total Number of points received by all show posts:", s_points, "\n"
      "Avg. number of points per show post:", avg_s_points, "\n")

if avg_a_points > avg_s_points:
    print("On average, number of points received is greater for an ask post.")
else:
    print("On average, number of points received is greater for a show post.")

Total Number of points received by all ask posts: 26268 
Avg. number of points per ask post: 15.061926605504587 

Total Number of points received by all show posts: 32019 
 Avg. number of points per show post: 27.555077452667813 

On average, number of points received is greater for a show post.


**which hours are Show HN posts more likely to receive higher points**

construct a list to hold the data we're interested in

In [None]:
# Pair each Show HN post's creation timestamp (column 6) with its points
# (column 3, converted to int).
points_date = [[row[6], int(row[3])] for row in show_posts]

points_date[0:4]

[['11/25/2015 14:03', 26],
 ['11/29/2015 22:46', 747],
 ['4/28/2016 18:05', 1],
 ['7/28/2016 7:11', 3]]

In [None]:
# Two tallies keyed by the zero-padded hour string ("00".."23"):
#   show_counts_by_hour -> number of Show HN posts created in that hour
#   points_by_hour      -> total points received by those posts
show_counts_by_hour = {}
points_by_hour = {}

for created_at, n_points in points_date:
    # Parse the "month/day/year hour:minute" timestamp and keep only the hour.
    hr = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    show_counts_by_hour[hr] = show_counts_by_hour.get(hr, 0) + 1
    points_by_hour[hr] = points_by_hour.get(hr, 0) + int(n_points)

print(points_by_hour)
print(show_counts_by_hour)

{'14': 2187, '22': 1856, '18': 2215, '07': 494, '20': 1819, '05': 104, '16': 2634, '19': 1702, '15': 2228, '03': 679, '17': 2521, '06': 375, '02': 340, '13': 2438, '08': 519, '21': 866, '04': 386, '11': 1480, '12': 2543, '23': 1526, '09': 553, '01': 700, '10': 681, '00': 1173}
{'14': 86, '22': 46, '18': 61, '07': 26, '20': 60, '05': 19, '16': 93, '19': 55, '15': 78, '03': 27, '17': 93, '06': 16, '02': 30, '13': 99, '08': 34, '21': 47, '04': 26, '11': 44, '12': 61, '23': 36, '09': 30, '01': 28, '10': 36, '00': 31}


In [None]:
# For every hour, divide the total points by the number of posts to get the
# average points per post, stored as [hour, average].
show_avg_by_hour = [
    [hour, total / show_counts_by_hour[hour]]
    for hour, total in points_by_hour.items()
]

show_avg_by_hour

[['14', 25.430232558139537],
 ['22', 40.34782608695652],
 ['18', 36.31147540983606],
 ['07', 19.0],
 ['20', 30.316666666666666],
 ['05', 5.473684210526316],
 ['16', 28.322580645161292],
 ['19', 30.945454545454545],
 ['15', 28.564102564102566],
 ['03', 25.14814814814815],
 ['17', 27.107526881720432],
 ['06', 23.4375],
 ['02', 11.333333333333334],
 ['13', 24.626262626262626],
 ['08', 15.264705882352942],
 ['21', 18.425531914893618],
 ['04', 14.846153846153847],
 ['11', 33.63636363636363],
 ['12', 41.68852459016394],
 ['23', 42.388888888888886],
 ['09', 18.433333333333334],
 ['01', 25.0],
 ['10', 18.916666666666668],
 ['00', 37.83870967741935]]

In [None]:
# Flip each [hour, average] pair into [average, hour] so the average comes first.
swap_show_avg_by_hour = [[avg, hour] for hour, avg in show_avg_by_hour]
print(swap_show_avg_by_hour)

[[25.430232558139537, '14'], [40.34782608695652, '22'], [36.31147540983606, '18'], [19.0, '07'], [30.316666666666666, '20'], [5.473684210526316, '05'], [28.322580645161292, '16'], [30.945454545454545, '19'], [28.564102564102566, '15'], [25.14814814814815, '03'], [27.107526881720432, '17'], [23.4375, '06'], [11.333333333333334, '02'], [24.626262626262626, '13'], [15.264705882352942, '08'], [18.425531914893618, '21'], [14.846153846153847, '04'], [33.63636363636363, '11'], [41.68852459016394, '12'], [42.388888888888886, '23'], [18.433333333333334, '09'], [25.0, '01'], [18.916666666666668, '10'], [37.83870967741935, '00']]


In [None]:
# Sorting the [average, hour] pairs in descending order puts the hours with
# the highest average points first.
show_sorted_swap = sorted(swap_show_avg_by_hour, reverse=True)
print("Top 5 hours for Show Posts Points")

# Built once, outside the loop. Spelled with explicit "\n" escapes so the
# surrounding blank lines and padding of the printed entry stay exactly as before.
template = "\n    {}: {:.2f} average points per post    \n    "

# Slicing (rather than indexing 0..4) prints whatever is available when fewer
# than five distinct hours exist, instead of raising IndexError.
for show_avg, hour in show_sorted_swap[:5]:
    # Re-format the bare hour ("23") as a clock time ("23:00").
    obj_string = dt.datetime.strptime(hour, "%H").strftime("%H:%M")
    print(template.format(obj_string, show_avg))

Top 5 hours for Show Posts Points

    23:00: 42.39 average points per post    
    

    12:00: 41.69 average points per post    
    

    22:00: 40.35 average points per post    
    

    00:00: 37.84 average points per post    
    

    18:00: 36.31 average points per post    
    


Answer D: On average, Show HN posts created at 23:00 EST receive the most points (about 42.39 per post), followed by posts created at 12:00 (41.69) and 22:00 (40.35).

In [None]:
import pandas as pd
# Load the same CSV into a pandas DataFrame.
df = pd.read_csv("hackernews.csv")

In [None]:
# Display the first rows of the DataFrame as the cell's output.
df.head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
0,12224879,Interactive Dynamic Video,http://www.interactivedynamicvideo.com/,386,52,ne0phyte,8/4/2016 11:52
1,10975351,How to Use Open Source and Shut the Fuck Up at...,http://hueniverse.com/2016/01/26/how-to-use-op...,39,10,josep2,1/26/2016 19:30
2,11964716,Florida DJs May Face Felony for April Fools' W...,http://www.thewire.com/entertainment/2013/04/f...,2,1,vezycash,6/23/2016 22:20
3,11919867,Technology ventures: From Idea to Enterprise,https://www.amazon.com/Technology-Ventures-Ent...,3,1,hswarna,6/17/2016 0:01
4,10301696,Note by Note: The Making of Steinway L1037 (2007),http://www.nytimes.com/2007/11/07/movies/07ste...,8,2,walterbell,9/30/2015 4:12
