# Exploring Hacker News Posts

In this project we're specifically interested in posts whose titles begin with either Ask HN or Show HN. 

* Users submit Ask HN posts to ask the Hacker News community a specific question.
* Users submit Show HN posts to show the Hacker News community a project, product, or just generally something interesting. 

We will analyze which type of post, and which posting time, receives the most comments on average, i.e., we need to advise on the best hour to post in order to maximize the number of comments a post receives.

Our aim is to compare these two types of posts to determine the following:

1. Do Ask HN or Show HN posts receive more comments on average?
2. Do posts created at a certain time receive more comments on average?

In [1]:
from csv import reader

In [2]:
# f = open("hacker_news.csv")
# read = reader(f)
# hn = list(read)
# f.close()

# Load the whole CSV into memory as a list of rows (each row is a list of
# strings). The csv module documentation asks for files to be opened with
# newline="" so that newlines embedded in quoted fields are handled by the
# reader rather than translated by the file object.
# NOTE(review): encoding is pinned to UTF-8 so the result does not depend on
# the platform default -- confirm the file really is UTF-8.
with open("hacker_news.csv", newline="", encoding="utf-8") as f:
    read = reader(f)
    hn = list(read)

In [3]:
# The first row of the CSV holds the column names; keep it and display it.
header = hn[0]
header

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']

In [4]:
# Drop the header row so `hn` contains only post records.
# NOTE: running this cell a second time would also drop the first data row.
hn = hn[1:]

In [5]:
# print(hn[:10])

In [6]:
import datetime as dt

In [7]:
# strptime pattern for the last column: month/day/year hour:minute.
date_format = "%m/%d/%Y %H:%M"

# Replace each row's last field (a date string) with a datetime, in place.
# The isinstance guard makes the cell safe to re-run: without it, a second run
# hands an already-converted datetime to strptime, which raises TypeError.
for row in hn:
    created_at = row[-1]
    if isinstance(created_at, str):
        row[-1] = dt.datetime.strptime(created_at, date_format)

In [8]:
# Split the dataset by title prefix, compared case-insensitively.
# The two prefixes are mutually exclusive, so a post lands in at most one
# list; posts matching neither prefix are left out of both.
ask_posts = [post for post in hn if post[1].upper().startswith("ASK HN")]
show_posts = [post for post in hn if post[1].upper().startswith("SHOW HN")]

In [9]:
# Preview the first five Ask HN rows.
print(ask_posts[:5])

[['12296411', 'Ask HN: How to improve my personal website?', '', '2', '6', 'ahmedbaracat', datetime.datetime(2016, 8, 16, 9, 55)], ['10610020', 'Ask HN: Am I the only one outraged by Twitter shutting down share counts?', '', '28', '29', 'tkfx', datetime.datetime(2015, 11, 22, 13, 43)], ['11610310', 'Ask HN: Aby recent changes to CSS that broke mobile?', '', '1', '1', 'polskibus', datetime.datetime(2016, 5, 2, 10, 14)], ['12210105', 'Ask HN: Looking for Employee #3 How do I do it?', '', '1', '3', 'sph130', datetime.datetime(2016, 8, 2, 14, 20)], ['10394168', 'Ask HN: Someone offered to buy my browser extension from me. What now?', '', '28', '17', 'roykolak', datetime.datetime(2015, 10, 15, 16, 38)]]


In [10]:
# Preview the first three Show HN rows.
print(show_posts[:3])

[['10627194', 'Show HN: Wio Link  ESP8266 Based Web of Things Hardware Development Platform', 'https://iot.seeed.cc', '26', '22', 'kfihihc', datetime.datetime(2015, 11, 25, 14, 3)], ['10646440', 'Show HN: Something pointless I made', 'http://dn.ht/picklecat/', '747', '102', 'dhotson', datetime.datetime(2015, 11, 29, 22, 46)], ['11590768', 'Show HN: Shanhu.io, a programming playground powered by e8vm', 'https://shanhu.io', '1', '1', 'h8liu', datetime.datetime(2016, 4, 28, 18, 5)]]


In [11]:
# Total number of posts, followed by the size of each of the two categories.
print(len(hn))
print(len(ask_posts))
print(len(show_posts))

20100
1744
1162


In [12]:
# Average number of comments per Ask HN post.

# Take the post count from the data rather than hard-coding 1744, so the
# average stays correct if the dataset or the title filter ever changes.
total_ask_post = len(ask_posts)

# Column 4 is `num_comments` (see `header`); it is read from the CSV as text.
total_num_comments = sum(int(post[4]) for post in ask_posts)

avg_comments_ask_posts = total_num_comments / total_ask_post
avg_comments_ask_posts

14.038417431192661

In [13]:
# Average number of comments per Show HN post.

# Number of Show HN posts and the sum of their `num_comments` column
# (index 4, stored as text in the CSV rows).
total_hn_post = len(show_posts)
total_num_comments = sum(int(post[4]) for post in show_posts)

avg_comments_show_posts = total_num_comments / total_hn_post
avg_comments_show_posts

10.31669535283993

In [14]:
# Spot check: the last field of a row is a datetime, so `.hour` is available.
hn[10][-1].hour

18

In [15]:
# Tally Ask HN activity per hour of creation:
#   counts_by_hour[h]   -> number of posts created during hour h
#   comments_by_hour[h] -> total comments received by those posts
counts_by_hour = {}
comments_by_hour = {}

for post in ask_posts:
    created_hour = post[-1].hour
    n_comments = int(post[4])
    # dict.get supplies 0 the first time an hour is seen.
    counts_by_hour[created_hour] = counts_by_hour.get(created_hour, 0) + 1
    comments_by_hour[created_hour] = comments_by_hour.get(created_hour, 0) + n_comments
    

In [16]:
# Number of Ask HN posts, keyed by the hour in which they were created.
print(counts_by_hour)

{9: 45, 13: 85, 10: 59, 14: 107, 16: 108, 23: 68, 12: 73, 17: 100, 15: 116, 21: 109, 20: 80, 2: 58, 18: 109, 3: 54, 5: 46, 19: 110, 1: 60, 22: 71, 8: 48, 4: 47, 0: 55, 6: 44, 7: 34, 11: 58}


In [17]:
# Total comments on Ask HN posts, keyed by the hour the post was created.
print(comments_by_hour)

{9: 251, 13: 1253, 10: 793, 14: 1416, 16: 1814, 23: 543, 12: 687, 17: 1146, 15: 4477, 21: 1745, 20: 1722, 2: 1381, 18: 1439, 3: 421, 5: 464, 19: 1188, 1: 683, 22: 479, 8: 492, 4: 337, 0: 447, 6: 397, 7: 267, 11: 641}


In [18]:
# Average comments per Ask HN post for each hour, rounded to two decimals.
# Both dicts are keyed by hour, so iterating one is enough.
avg_comments_by_hour = {
    hour: round(comments_by_hour[hour] / counts_by_hour[hour], 2)
    for hour in counts_by_hour
}

In [19]:
# Average comments per Ask HN post for each hour (hour -> average).
print(avg_comments_by_hour)

{9: 5.58, 13: 14.74, 10: 13.44, 14: 13.23, 16: 16.8, 23: 7.99, 12: 9.41, 17: 11.46, 15: 38.59, 21: 16.01, 20: 21.52, 2: 23.81, 18: 13.2, 3: 7.8, 5: 10.09, 19: 10.8, 1: 11.38, 22: 6.75, 8: 10.25, 4: 7.17, 0: 8.13, 6: 9.02, 7: 7.85, 11: 11.05}


In [20]:
# sorted(avg_comments_by_hour, key =avg_comments_by_hour.get, reverse = True)

In [21]:
# Same averages as a list of [hour, average] pairs, which is easier to sort.
# Note that this rebinds `avg_comments_by_hour` from a dict to a list.
avg_comments_by_hour = [
    [hour, round(comments_by_hour[hour] / counts_by_hour[hour], 2)]
    for hour in counts_by_hour
]

In [22]:
# Rank the [hour, average] pairs from highest to lowest average.
# Sort a copy so `avg_comments_by_hour` keeps its original order; list.sort
# is stable, exactly like sorted().
sort_by_avg_comments = list(avg_comments_by_hour)
sort_by_avg_comments.sort(key=lambda pair: pair[1], reverse=True)
sort_by_avg_comments

[[15, 38.59],
 [2, 23.81],
 [20, 21.52],
 [16, 16.8],
 [21, 16.01],
 [13, 14.74],
 [10, 13.44],
 [14, 13.23],
 [18, 13.2],
 [17, 11.46],
 [1, 11.38],
 [11, 11.05],
 [19, 10.8],
 [8, 10.25],
 [5, 10.09],
 [12, 9.41],
 [6, 9.02],
 [0, 8.13],
 [23, 7.99],
 [7, 7.85],
 [3, 7.8],
 [4, 7.17],
 [22, 6.75],
 [9, 5.58]]

In [23]:
def sorting(lst):
    """Return the element at index 1 of ``lst``.

    Usable as a ``key=`` function when sorting ``[hour, average]`` pairs by
    their average.
    """
    second_item = lst[1]
    return second_item

In [24]:
# sort_by_avg_comments =sorted(avg_comments_by_hour, key = sorting,
#                             reverse = True)
# sort_by_avg_comments

In [25]:
# Top 3 hours for Ask HN posts, by average number of comments.

for hour, avg_comm in sort_by_avg_comments[:3]:
    # `hour` is an int taken from datetime.hour (0-23), so zero-padding it
    # gives the same "HH:00" label as a strptime/strftime round trip, without
    # rebinding the loop variable to a different type.
    hour_label = f"{hour:02d}:00"
    print("There are {} average comments at {} hours".format(avg_comm, hour_label))

There are 38.59 average comments at 15:00 hours
There are 23.81 average comments at 02:00 hours
There are 21.52 average comments at 20:00 hours


In [26]:
# Top 3 Ask HN hours again, this time reported with an f-string.

for hour_of_day, mean_comments in sort_by_avg_comments[:3]:
    # Parse the bare hour number, then render it as zero-padded "HH:MM".
    hour_text = dt.datetime.strptime(str(hour_of_day), "%H").strftime("%H:%M")
    print(f"There are {mean_comments} average comments at {hour_text} hours")

There are 38.59 average comments at 15:00 hours
There are 23.81 average comments at 02:00 hours
There are 21.52 average comments at 20:00 hours


In [27]:
# Relative gap, in percent, between the best and the second-best hour's
# average. The values are read from the sorted results instead of being
# retyped by hand, so they cannot drift out of sync with the data.
best_avg = sort_by_avg_comments[0][1]
runner_up_avg = sort_by_avg_comments[1][1]
str((best_avg - runner_up_avg) / runner_up_avg * 100) + "%"

'62.07475850482993%'

# Analysis of show_posts

In [33]:
# Preview the first Show HN row.
print(show_posts[:1])

[['10627194', 'Show HN: Wio Link  ESP8266 Based Web of Things Hardware Development Platform', 'https://iot.seeed.cc', '26', '22', 'kfihihc', datetime.datetime(2015, 11, 25, 14, 3)]]


In [63]:
# Average number of comments per Show HN post, displayed to three decimals.
total_show_posts = len(show_posts)
# Index 4 is the `num_comments` column, stored as text in each row.
total_numb_cmt_shn = sum(int(post[4]) for post in show_posts)

avg_show_comment = total_numb_cmt_shn / total_show_posts
round(avg_show_comment, 3)

10.317

In [32]:
# Spot check: read the hour from the datetime stored in a row's last field.
hn[1][-1].hour

19

In [39]:
# Tally Show HN activity per hour of creation:
#   count_shn_by_hour[h]   -> number of posts created during hour h
#   comment_shn_by_hour[h] -> total comments received by those posts
count_shn_by_hour = {}
comment_shn_by_hour = {}
for post in show_posts:
    posted_hour = post[-1].hour
    n_comments = int(post[4])
    # dict.get supplies 0 the first time an hour is seen.
    count_shn_by_hour[posted_hour] = count_shn_by_hour.get(posted_hour, 0) + 1
    comment_shn_by_hour[posted_hour] = comment_shn_by_hour.get(posted_hour, 0) + n_comments

In [40]:
# Number of Show HN posts, keyed by the hour in which they were created.
print(count_shn_by_hour)

{14: 86, 22: 46, 18: 61, 7: 26, 20: 60, 5: 19, 16: 93, 19: 55, 15: 78, 3: 27, 17: 93, 6: 16, 2: 30, 13: 99, 8: 34, 21: 47, 4: 26, 11: 44, 12: 61, 23: 36, 9: 30, 1: 28, 10: 36, 0: 31}


In [41]:
# Total comments on Show HN posts, keyed by the hour the post was created.
print(comment_shn_by_hour)

{14: 1156, 22: 570, 18: 962, 7: 299, 20: 612, 5: 58, 16: 1084, 19: 539, 15: 632, 3: 287, 17: 911, 6: 142, 2: 127, 13: 946, 8: 165, 21: 272, 4: 247, 11: 491, 12: 720, 23: 447, 9: 291, 1: 246, 10: 297, 0: 487}


In [46]:
# Average comments per Show HN post for each hour, rounded to two decimals.
# Both dicts are keyed by hour, so iterating one is enough.
avg_shn_comment_by_hour = {
    hour: round(comment_shn_by_hour[hour] / count_shn_by_hour[hour], 2)
    for hour in count_shn_by_hour
}

In [47]:
# Average comments per Show HN post for each hour (hour -> average).
print(avg_shn_comment_by_hour)

{14: 13.44, 22: 12.39, 18: 15.77, 7: 11.5, 20: 10.2, 5: 3.05, 16: 11.66, 19: 9.8, 15: 8.1, 3: 10.63, 17: 9.8, 6: 8.88, 2: 4.23, 13: 9.56, 8: 4.85, 21: 5.79, 4: 9.5, 11: 11.16, 12: 11.8, 23: 12.42, 9: 9.7, 1: 8.79, 10: 8.25, 0: 15.71}


In [48]:
# Same averages as a list of [hour, average] pairs, which is easier to sort.
# Note that this rebinds `avg_shn_comment_by_hour` from a dict to a list.
avg_shn_comment_by_hour = [
    [hour, round(comment_shn_by_hour[hour] / count_shn_by_hour[hour], 2)]
    for hour in count_shn_by_hour
]

In [49]:
# Show HN averages as [hour, average] pairs, still in insertion order.
print(avg_shn_comment_by_hour)

[[14, 13.44], [22, 12.39], [18, 15.77], [7, 11.5], [20, 10.2], [5, 3.05], [16, 11.66], [19, 9.8], [15, 8.1], [3, 10.63], [17, 9.8], [6, 8.88], [2, 4.23], [13, 9.56], [8, 4.85], [21, 5.79], [4, 9.5], [11, 11.16], [12, 11.8], [23, 12.42], [9, 9.7], [1, 8.79], [10, 8.25], [0, 15.71]]


In [50]:
def sorting(lst):
    """Return the element at index 1 of ``lst``.

    Usable as a ``key=`` function when sorting ``[hour, average]`` pairs by
    their average.
    """
    average = lst[1]
    return average

In [68]:
# Rank the Show HN [hour, average] pairs from highest to lowest average.
# Sort a copy so `avg_shn_comment_by_hour` keeps its original order.
# list.sort is stable, like sorted(), so hours with equal averages keep
# their relative order.
sort_shn_by_avg_comments = list(avg_shn_comment_by_hour)
sort_shn_by_avg_comments.sort(key=lambda pair: pair[1], reverse=True)

In [69]:
# Show HN hours ordered from highest to lowest average comments.
sort_shn_by_avg_comments

[[18, 15.77],
 [0, 15.71],
 [14, 13.44],
 [23, 12.42],
 [22, 12.39],
 [12, 11.8],
 [16, 11.66],
 [7, 11.5],
 [11, 11.16],
 [3, 10.63],
 [20, 10.2],
 [19, 9.8],
 [17, 9.8],
 [9, 9.7],
 [13, 9.56],
 [4, 9.5],
 [6, 8.88],
 [1, 8.79],
 [10, 8.25],
 [15, 8.1],
 [21, 5.79],
 [8, 4.85],
 [2, 4.23],
 [5, 3.05]]

In [70]:
# Top 5 hours for Show HN posts, by average number of comments.
# NOTE(review): "Ther" in the message is a typo for "There"; it is kept as-is
# here so the recorded output below still matches what this cell prints.
for hour_of_day, mean_comments in sort_shn_by_avg_comments[:5]:
    # Parse the bare hour number, then render it as zero-padded "HH:MM".
    hour_text = dt.datetime.strptime(str(hour_of_day), "%H").strftime("%H:%M")
    print(f"Ther are {mean_comments} average comments at {hour_text} Hours")

Ther are 15.77 average comments at 18:00 Hours
Ther are 15.71 average comments at 00:00 Hours
Ther are 13.44 average comments at 14:00 Hours
Ther are 12.42 average comments at 23:00 Hours
Ther are 12.39 average comments at 22:00 Hours


In [71]:
# Re-display the Ask HN per-hour post counts computed earlier.
print(counts_by_hour)

{9: 45, 13: 85, 10: 59, 14: 107, 16: 108, 23: 68, 12: 73, 17: 100, 15: 116, 21: 109, 20: 80, 2: 58, 18: 109, 3: 54, 5: 46, 19: 110, 1: 60, 22: 71, 8: 48, 4: 47, 0: 55, 6: 44, 7: 34, 11: 58}
