In [2]:
import pandas as pd


# **1. Load the Dataset**

In [3]:
# Load 'hacker.csv' (path is relative to the notebook's working directory) into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='hacker.csv')

In [32]:
# Preview the DataFrame (pandas truncates the middle rows in the rendered table).
# NOTE: this cell's execution count is 32, i.e. it was last run after the
# preprocessing cells below, so the saved output already shows parsed datetimes,
# lower-cased titles and the derived 'created_at_hour' column.
dataset

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at,created_at_hour
0,12224879,interactive dynamic video,http://www.interactivedynamicvideo.com/,386,52,ne0phyte,2016-08-04 11:52:00,11
1,10975351,how to use open source and shut the fuck up at...,http://hueniverse.com/2016/01/26/how-to-use-op...,39,10,josep2,2016-01-26 19:30:00,19
2,11964716,florida djs may face felony for april fools' w...,http://www.thewire.com/entertainment/2013/04/f...,2,1,vezycash,2016-06-23 22:20:00,22
3,11919867,technology ventures: from idea to enterprise,https://www.amazon.com/Technology-Ventures-Ent...,3,1,hswarna,2016-06-17 00:01:00,0
4,10301696,note by note: the making of steinway l1037 (2007),http://www.nytimes.com/2007/11/07/movies/07ste...,8,2,walterbell,2015-09-30 04:12:00,4
...,...,...,...,...,...,...,...,...
20095,12379592,how purism avoids intels active management tec...,https://puri.sm/philosophy/how-purism-avoids-i...,10,6,AdmiralAsshat,2016-08-29 02:22:00,2
20096,10339284,yc application translated and broken down,https://medium.com/@zreitano/the-yc-applicatio...,4,1,zreitano,2015-10-06 14:57:00,14
20097,10824382,microkernels are slow and elvis didn't do no d...,http://blog.darknedgy.net/technology/2016/01/0...,169,132,vezzy-fnord,2016-01-02 00:49:00,0
20098,10739875,how product hunt really works,https://medium.com/@benjiwheeler/how-product-h...,695,222,brw12,2015-12-15 19:32:00,19


# **2. Data Preprocessing**

In [4]:
# Inspect the raw 'created_at' column before any conversion
# (the saved output shows strings such as '8/4/2016 11:52').
dataset['created_at']

Unnamed: 0,created_at
0,8/4/2016 11:52
1,1/26/2016 19:30
2,6/23/2016 22:20
3,6/17/2016 0:01
4,9/30/2015 4:12
...,...
20095,8/29/2016 2:22
20096,10/6/2015 14:57
20097,1/2/2016 0:49
20098,12/15/2015 19:32


In [5]:
# Parse 'created_at' into datetime values.
# The raw strings look like '8/4/2016 11:52' (month/day/year hour:minute), so the
# format is stated explicitly: parsing is then deterministic (never guessed as
# day-first), faster than per-row inference, and raises on a row that does not
# match instead of silently mis-parsing it.
dataset['created_at'] = pd.to_datetime(dataset['created_at'], format='%m/%d/%Y %H:%M')

In [6]:
# Derive the hour component of each post's creation timestamp and
# store it in a new 'created_at_hour' column
created_ts = dataset['created_at']
dataset['created_at_hour'] = created_ts.dt.hour

In [7]:
# Lower-case every title so the prefix checks that follow are case-insensitive
lowercased_titles = dataset['title'].str.lower()
dataset['title'] = lowercased_titles

# **3. Filtering Data by Post Type**

In [8]:
# Boolean mask: True where the (already lower-cased) title begins with 'ask hn'.
# na=False treats a missing title as "not an Ask HN post"; without it the mask
# would hold NaN for such rows and could not be used for boolean indexing or
# combined with ~ / |.
ask = dataset['title'].str.startswith('ask hn', na=False)
ask

Unnamed: 0,title
0,False
1,False
2,False
3,False
4,False
...,...
20095,False
20096,False
20097,False
20098,False


In [9]:
# Boolean mask: True where the (already lower-cased) title begins with 'show hn'.
# na=False treats a missing title as "not a Show HN post"; without it the mask
# would hold NaN for such rows and could not be used for boolean indexing or
# combined with ~ / |.
show = dataset['title'].str.startswith('show hn', na=False)


In [10]:
# Display the 'show hn' boolean mask (one True/False entry per post)
show

Unnamed: 0,title
0,False
1,False
2,False
3,False
4,False
...,...
20095,False
20096,False
20097,False
20098,False


In [11]:
# Keep only the posts that are neither 'ask hn' nor 'show hn'
other_posts_mask = ~ask & ~show
filtered_dataset = dataset.loc[other_posts_mask]

In [33]:
# Display the posts left after removing the 'ask hn' / 'show hn' rows
filtered_dataset

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at,created_at_hour
0,12224879,interactive dynamic video,http://www.interactivedynamicvideo.com/,386,52,ne0phyte,2016-08-04 11:52:00,11
1,10975351,how to use open source and shut the fuck up at...,http://hueniverse.com/2016/01/26/how-to-use-op...,39,10,josep2,2016-01-26 19:30:00,19
2,11964716,florida djs may face felony for april fools' w...,http://www.thewire.com/entertainment/2013/04/f...,2,1,vezycash,2016-06-23 22:20:00,22
3,11919867,technology ventures: from idea to enterprise,https://www.amazon.com/Technology-Ventures-Ent...,3,1,hswarna,2016-06-17 00:01:00,0
4,10301696,note by note: the making of steinway l1037 (2007),http://www.nytimes.com/2007/11/07/movies/07ste...,8,2,walterbell,2015-09-30 04:12:00,4
...,...,...,...,...,...,...,...,...
20095,12379592,how purism avoids intels active management tec...,https://puri.sm/philosophy/how-purism-avoids-i...,10,6,AdmiralAsshat,2016-08-29 02:22:00,2
20096,10339284,yc application translated and broken down,https://medium.com/@zreitano/the-yc-applicatio...,4,1,zreitano,2015-10-06 14:57:00,14
20097,10824382,microkernels are slow and elvis didn't do no d...,http://blog.darknedgy.net/technology/2016/01/0...,169,132,vezzy-fnord,2016-01-02 00:49:00,0
20098,10739875,how product hunt really works,https://medium.com/@benjiwheeler/how-product-h...,695,222,brw12,2015-12-15 19:32:00,19


In [12]:
# (rows, columns) of the 'ask hn' subset
dataset[ask].shape

(1744, 8)

In [13]:
# (rows, columns) of the 'show hn' subset
dataset[show].shape

(1162, 8)

In [14]:
# (rows, columns) of the full dataset, for comparison with the two subsets above
dataset.shape

(20100, 8)

# **4. Statistical Analysis**

In [16]:
# Mean comment count over the 'ask hn' posts, rounded to two decimals
ask_comment_counts = dataset.loc[ask, 'num_comments']
avg_ask_comments = round(ask_comment_counts.mean(), 2)

In [34]:
# Show the rounded mean comment count for 'ask hn' posts
avg_ask_comments

14.04

In [17]:
# Mean comment count over the 'show hn' posts, rounded to two decimals
show_comment_counts = dataset.loc[show, 'num_comments']
avg_show_comments = round(show_comment_counts.mean(), 2)

In [35]:
# Show the rounded mean comment count for 'show hn' posts
avg_show_comments

10.32

In [19]:
# Largest comment count received by any single post in the full dataset
all_comment_counts = dataset['num_comments']
max_comments = all_comment_counts.max()

In [36]:
# Show the maximum comment count found on a single post
max_comments

1733

# **5. Grouping and Aggregation by Hour**

In [23]:
# Bucket all posts by creation hour, then take the mean comment count per bucket
groupby_hour = dataset.groupby(by='created_at_hour')
avg_comments_by_hour = groupby_hour['num_comments'].agg('mean')


In [38]:
# Display the mean number of comments for each creation hour (indexed by hour)
avg_comments_by_hour

Unnamed: 0_level_0,num_comments
created_at_hour,Unnamed: 1_level_1
0,25.07604
1,21.19898
2,26.015123
3,23.82377
4,21.891841
5,22.715232
6,19.771368
7,24.755906
8,24.32872
9,25.08046


In [24]:
# Rank the hours by mean comment count (highest first) and keep the leading entry;
# the result is a one-row Series indexed by that hour
hours_ranked = avg_comments_by_hour.sort_values(ascending=False)
top_hour = hours_ranked.iloc[:1]

In [39]:
# Show the hour with the highest mean comment count (one-row Series)
top_hour

Unnamed: 0_level_0,num_comments
created_at_hour,Unnamed: 1_level_1
14,29.144222


# **6. Detailed Analysis of 'Ask HN' and 'Show HN' Posts**

In [25]:
# Bucket the 'ask hn' posts by creation hour (no aggregation happens in this cell)
ask_posts = dataset[ask]
groupby_ask = ask_posts.groupby('created_at_hour')

In [26]:
# Five creation hours with the highest *average* comment count among 'ask hn' posts
ask_hourly_mean = groupby_ask['num_comments'].mean()
top_ask_hours = ask_hourly_mean.sort_values(ascending=False).iloc[:5]

In [41]:
# Show the five hours with the highest mean comment count for 'ask hn' posts
top_ask_hours

Unnamed: 0_level_0,num_comments
created_at_hour,Unnamed: 1_level_1
15,38.594828
2,23.810345
20,21.525
16,16.796296
21,16.009174


In [28]:
# Bucket the 'show hn' posts by creation hour (no aggregation happens in this cell)
show_posts = dataset[show]
groupby_show = show_posts.groupby('created_at_hour')

In [29]:
# Five creation hours with the highest *average* comment count among 'show hn' posts
show_hourly_mean = groupby_show['num_comments'].mean()
top_show_hours = show_hourly_mean.sort_values(ascending=False).iloc[:5]


In [42]:
# Show the five hours with the highest mean comment count for 'show hn' posts
top_show_hours

Unnamed: 0_level_0,num_comments
created_at_hour,Unnamed: 1_level_1
18,15.770492
0,15.709677
14,13.44186
23,12.416667
22,12.391304


# **7. Pivot Table Analysis for 'Ask HN' Posts**

In [30]:
# Pivot the 'ask hn' posts: one row per creation hour holding the mean of
# 'num_comments', then order those rows from the highest mean to the lowest
avgby_hr = pd.pivot_table(
    dataset[ask],
    index='created_at_hour',
    values='num_comments',
    aggfunc='mean',
)
sorted_avgby_hr = avgby_hr.sort_values(by='num_comments', ascending=False)

In [None]:
# Display the pivot table ordered by mean comment count (highest first).
# Reuses `sorted_avgby_hr`, which the pivot-table cell already computed, instead
# of sorting `avgby_hr` a second time.
sorted_avgby_hr

Unnamed: 0_level_0,num_comments
created_at_hour,Unnamed: 1_level_1
15,38.594828
2,23.810345
20,21.525
16,16.796296
21,16.009174
13,14.741176
10,13.440678
14,13.233645
18,13.201835
17,11.46


# **8. Output Results**

In [31]:
# Print each headline figure as "<label> <value>".
# The last three values are pandas Series, so their text spans several lines.
# The "Top ... hours" entries are ranked by mean comments per post.
summary_rows = [
    ("Average comments for 'Ask HN':", avg_ask_comments),
    ("Average comments for 'Show HN':", avg_show_comments),
    ("Maximum comments on a single post:", max_comments),
    ("Top hour for comments:", top_hour),
    ("Top 5 hours for 'Ask HN' comments:", top_ask_hours),
    ("Top 5 hours for 'Show HN' comments:", top_show_hours),
]
for label, value in summary_rows:
    print(label, value)

Average comments for 'Ask HN': 14.04
Average comments for 'Show HN': 10.32
Maximum comments on a single post: 1733
Top hour for comments: created_at_hour
14    29.144222
Name: num_comments, dtype: float64
Top 5 hours for 'Ask HN' comments: created_at_hour
15    38.594828
2     23.810345
20    21.525000
16    16.796296
21    16.009174
Name: num_comments, dtype: float64
Top 5 hours for 'Show HN' comments: created_at_hour
18    15.770492
0     15.709677
14    13.441860
23    12.416667
22    12.391304
Name: num_comments, dtype: float64
