In [15]:
import os, sys, time, re
import datetime as dt
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import clickhouse_driver
from clickhouse_driver.client import Client
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [16]:
# Client for the ClickHouse server that holds the `hn_launches` database.
# NOTE(review): '{host}', '{user}' and '{password}' look like redacted
# placeholders — substitute real values (ideally read from environment
# variables) before running.
clickhouse = Client(
    '{host}',
    database = 'hn_launches',
    user = '{user}',
    password = '{password}',
)

### Querying raw data from ClickHouse

In [865]:
# Pull the three raw tables into DataFrames:
#   launches_id   - item_id, title and url of every launch post
#   launches_text - score, author, timestamp and body text of every launch post
#   comments      - one row per comment, keyed by the launch item_id
launches_id = clickhouse.query_dataframe("""
select * from hn_launches.launches_raw;
""")

launches_text = clickhouse.query_dataframe("""
select * from hn_launches.launches_pages_raw;
""")

comments = clickhouse.query_dataframe("""
select * from hn_launches.comments_raw;
""")

# Show the schema and the first rows of each frame.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render an extra "None" after every summary.
for frame_name, frame in (('launches_id', launches_id),
                          ('launches_text', launches_text),
                          ('comments', comments)):
    print(frame_name)
    frame.info()
    display(frame.head())

  local_timezone = get_localzone().zone


launches_id
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   item_id  300 non-null    int64 
 1   title    300 non-null    object
 2   url      300 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.2+ KB


None

Unnamed: 0,item_id,title,url
0,23770214,Launch HN: ElectroNeek (YC W20) – Automaticall...,item?id=23770214
1,23780062,Launch HN: Yotta Savings (YC S20) – Behavioral...,item?id=23780062
2,23821502,Launch HN: Aquarium (YC S20) – Improve Your ML...,item?id=23821502
3,23833441,Launch HN: Openbase (YC S20) – reviews and ins...,item?id=23833441
4,23846186,Launch HN: Legacy (YC S19) – we help men test ...,item?id=23846186


launches_text
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   item_id  300 non-null    int64         
 1   score    300 non-null    int64         
 2   by       300 non-null    object        
 3   time     300 non-null    datetime64[ns]
 4   text     300 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 11.8+ KB


None

Unnamed: 0,item_id,score,by,time,text
0,30274390,101,drewkim,2022-02-09 16:16:05,"Hi HN, Drew and Edmund from Pelm here (https:/..."
1,30433104,96,declan_g,2022-02-22 20:16:33,"Hi HN, we’re Philip, Amby, and Declan from Hyp..."
2,31653985,131,kwent,2022-06-07 13:29:31,"Hi HN, Quentin and JJ here! We are co-founders..."
3,31683157,75,mshuffett,2022-06-09 16:20:46,Hey HN! Michael here—founder of Compose AI (ht...
4,31724838,91,ptrthomas,2022-06-13 12:53:24,"Hi HN, Peter here, founder of Karate Labs (htt..."


comments
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23039 entries, 0 to 23038
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   item_id     23039 non-null  int64         
 1   comment_id  23039 non-null  int64         
 2   by          23039 non-null  object        
 3   time        23039 non-null  datetime64[ns]
 4   level       23039 non-null  int64         
 5   comment     23039 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 1.1+ MB


None

Unnamed: 0,item_id,comment_id,by,time,level,comment
0,23770214,23770525,cpr,2020-07-08 15:00:30,0,Looks fantastic!I can't find any demo videos o...
1,23770214,23770559,charlesdaniels,2020-07-08 15:03:08,0,"This sounds really cool!However, I wonder abou..."
2,23770214,23770641,maxehmookau,2020-07-08 15:10:41,0,Interesting that you use a lot of the same ter...
3,23770214,23770681,dorianmariefr,2020-07-08 15:13:29,0,Reminds me of AutoHotKey which I used to autom...
4,23770214,23770732,Digitaltzar,2020-07-08 15:18:46,1,"Hey, we host demo videos on our youtube channe..."


In [866]:
# Combine the title/url rows with the score/author/time/text rows of the same
# launch; the inner join keeps only item_ids present in both frames.
launches = launches_id.merge(launches_text, on = 'item_id', how = 'inner')
#del launches_id, launches_text

In [867]:
# Drop every row whose title or text mentions "meet the batch"
# (case-insensitive), then renumber the index from 0.
batch_phrase = 'meet the batch'
title_mentions_batch = launches['title'].str.contains(batch_phrase, case = False)
text_mentions_batch = launches['text'].str.contains(batch_phrase, case = False)
launches = launches[~title_mentions_batch & ~text_mentions_batch].reset_index(drop = True)

In [977]:
# Print the distinct GitHub links mentioned in the text of every launch flagged
# as open source.
# NOTE: this cell reads the `name` and `is_oss` columns, which are only added to
# `launches` by the "Fields extraction" cell further down — on a fresh kernel
# run that cell first.
# A link path ends at the first '.', ')', space, ':' or '['. The pattern is a raw
# string (no invalid escape sequences) and the dot in "github.com" is literal.
github_path_pattern = re.compile(r'github\.com/(.*?)[.) :\[]')

for _, row in launches[launches['is_oss'] == 1].iterrows():
    repo_links = {'https://github.com/' + path for path in github_path_pattern.findall(row['text'])}
    print(row['name'], ':', repo_links)

Openbase : set()
QuestDB : {'https://github.com/questdb/questdb', 'https://github.com/questdb/questdb/blob/master/core/src/main', 'https://github.com/peter-lawrey/Java-Chronicle'}
Nestybox : {'https://github.com/nestybox/sysbox'}
Papercups : {'https://github.com/papercups-io/papercups'}
SuperTokens : {'https://github.com/supertokens/supertokens-core'}
Airbyte : {'https://github.com/airbytehq/airbyte/'}
Opstrace : {'https://github.com/opstrace/opstraceAbout', 'https://github.com/grafana/loki', 'https://github.com/opstrace/opstrace', 'https://github.com/cortexproject/cortex'}
Infracost : {'https://github.com/infracost/infracost'}
SigNoz : {'https://github.com/SigNoz/signoz'}
Pyroscope : {'https://github.com/pyroscope-io/pyroscope'}
MindsDB : {'https://github.com/mindsdb/mindsdb'}
Spruce : {'https://github.com/spruceid/didkitCredible', 'https://github.com/spruceid/credibleDocs'}
Lunatic : {'https://github.com/lunatic-solutions/chat', 'https://github.com/lunatic-solutions/lunatic'}
Webiny 

### Fields extraction

In [868]:
# Launch titles follow "Launch HN: <name> (YC <batch>)<separator><description>".
# All three parts are parsed with one pattern so that the description is anchored
# on the "(YC ...)" marker rather than on the first ")" found anywhere in the title.
title_parts = launches['title'].str.extract(
    r'Launch HN: (?P<name>.*?) \(YC (?P<yc_batch>.*?)\)(?P<tail>.*)'
)

# Report the offending titles explicitly instead of failing with an opaque
# "'NoneType' object has no attribute 'group'".
unparsed_titles = launches.loc[title_parts['name'].isna(), 'title']
if not unparsed_titles.empty:
    raise ValueError(
        f'Titles not matching "Launch HN: <name> (YC <batch>) ...": {unparsed_titles.tolist()}'
    )

launches['name'] = title_parts['name']
launches['yc_batch'] = title_parts['yc_batch']

# Strip the separators between the batch marker and the description, then
# normalise casing (first character upper-case, the rest lower-case).
launches['short_description'] = (
    title_parts['tail']
    .str.replace(' – ', '', regex = False)
    .str.replace('- ', '', regex = False)
    .str.replace(': ', '', regex = False)
    .str.capitalize()
)

# A launch counts as open source when the post text mentions open source AND
# links to github.com, or when the short description itself mentions open source.
# Matching is case-insensitive: short_description has just been capitalised, so a
# description starting with "Open-source ..." would be missed by a lower-case-only
# pattern.
oss_pattern = r'open-source|open source'
text_mentions_oss = launches['text'].str.contains(oss_pattern, case = False)
text_links_github = launches['text'].str.contains('github.com', regex = False)
description_mentions_oss = launches['short_description'].str.contains(oss_pattern, case = False)
launches['is_oss'] = ((text_mentions_oss & text_links_github) | description_mentions_oss).astype(int)

# Every https:// link written in parentheses in the post text, as a list per launch.
launches['urls'] = launches['text'].str.findall(r'\((https://.*?)\)')

launches = launches[['item_id', 'by', 'time', 'name', 'yc_batch', 'short_description', 'is_oss', 'urls', 'url',
                     'text',
                     'score']]

launches.head()

Unnamed: 0,item_id,by,time,name,yc_batch,short_description,is_oss,urls,url,text,score
0,23770214,Digitaltzar,2020-07-08 14:30:15,ElectroNeek,W20,Automatically find and automate routine work,0,[https://electroneek.com],item?id=23770214,"Hey Hacker News! We are Sergey Yudovsky, Dmitr...",132
1,23780062,adammoelis,2020-07-09 13:42:30,Yotta Savings,S20,Behavioral psychology to help people save,0,"[https://www.withyotta.com/, https://en.wikipe...",item?id=23780062,"Hey HN! We are Adam & Ben, co-founders of Yott...",240
2,23821502,pgao,2020-07-13 15:05:41,Aquarium,S20,Improve your ml dataset quality,0,"[https://www.aquariumlearning.com/, https://me...",item?id=23821502,Hi everyone! I’m Peter from Aquarium (https://...,167
3,23833441,liorgrossman,2020-07-14 15:57:52,Openbase,S20,Reviews and insights for open-source packages,1,[https://openbase.io],item?id=23833441,"Hi everyone! I'm Lior, one of the makers of Op...",148
4,23846186,khaledkteily,2020-07-15 15:04:37,Legacy,S19,We help men test and freeze their sperm,0,[https://www.givelegacy.com/],item?id=23846186,"Hi everyone,I'm Khaled Kteily – and I helped f...",193


### Labeling

In [869]:
# Industry -> keywords looked up (case-insensitively, as regex alternatives) in a
# launch's short description. Insertion order matters: the labelling cell feeds
# these lists to np.select in this order, so the first industry with a matching
# keyword is the one assigned. Leading/trailing spaces inside a keyword are
# deliberate parts of the pattern (e.g. ' ar ' only matches the standalone word).
labels__dict = {
    'Healthcare': ['health ', 'telehealth', 'telemedicine', 'wellness', 'fitness', 'doctor', 'patients', 'meds', 'drugs', 'hospitalization',
                   'therapy', 'diseases', 'psychology', 'sleep', 'meditation', 'relief', 'medication'],
    'Education': ['education', 'teaching', 'student', 'teacher', ' class ', 'exam', 'schooler'],
    'Food': ['meat', 'food', 'restaurant'],
    'Green': ['co2', 'carbon'],
    'Gaming': ['game', 'gaming'],
    'AR&VR': ['3d', ' ar ', ' vr '],
    'Crypto': ['cryptocurrency', 'crypto', 'dao', 'nft', 'web 3.0'],
    # 'financials' was previously misspelled 'financicals' and could never match.
    'Financial': ['card', 'bank', 'digital bank', 'neo-bank', 'loan', 'debt', 'venture', 'financials',
                  'trading', 'trade', 'portfolio', 'investment', 'investing', 'invest', 'stocks', 'pay', 'bill'],
    'Industrials': ['industrial', 'drones', 'aircraft', 'aviation', ' space ', 'robotics', 'manufacturing', 'energy', 'agro', 'industrials',
                    'facilities', 'systems', 'oil', 'gas', ' solar '],
    'SaaS': ['as a service', 'company', 'b2b', 'smb', 'saas', 'billing', 'subscription', 'marketing', 'retail', 'sale', 'hr ', 'hire',
             'platform', 'onboarding', 'automation', 'automate', 'marketplaces', 'builder', 'co-working', 'workers', 'for companies',
             'shopify', 'selling', 'no-code', 'optimization', 'workspace', 'remote', 'employee', 'build', 'create', 'collaborative',
             'e-commerce', 'email', ' docs', 'product', 'for founders', 'for startups', 'support', 'customer', 'collaboration', 'authentication',
             'office', 'meeting'],
    # The duplicated 'apparel' entry was removed; matching is unaffected.
    'Consumer': ['personal', ' men ', ' women ', 'adults', 'teens', 'kids', 'for people', 'apparel', 'social', 'dating', 'planning', 'people',
                 'booking', 'at-home'],
    'DevTools': ['design', 'coding', ' code', 'developer', ' ide ', 'deploy', 'cloud', 'data', ' prod', 'database', 'pipelines', 'etl ', 'k8s', 'aws ', 'kubernetes',
                 'ci/cd', ' ml ', 'engineering', 'full-stack', 'fullstack', 'data analytics', 'data science', 'open-source', 'open source',
                 'microservices', 'server', 'incidents', 'debug', 'secrets', 'securely', 'security', 'front-end', 'back-end', 'machine learning'],
}

In [870]:
# Build one boolean mask per industry (True where the short description contains
# any of that industry's keywords, case-insensitively) and let np.select assign
# the first matching industry in dictionary order; unmatched launches get 'Other'.
values = list(labels__dict)
conditions = [
    launches['short_description'].str.contains('|'.join(labels__dict[industry]), case = False)
    for industry in values
]

launches['industry'] = np.select(conditions, values, default = 'Other')

In [871]:
# Preview the first rows, now including the `industry` column.
launches.head()

Unnamed: 0,item_id,by,time,name,yc_batch,short_description,is_oss,urls,url,text,score,industry
0,23770214,Digitaltzar,2020-07-08 14:30:15,ElectroNeek,W20,Automatically find and automate routine work,0,[https://electroneek.com],item?id=23770214,"Hey Hacker News! We are Sergey Yudovsky, Dmitr...",132,SaaS
1,23780062,adammoelis,2020-07-09 13:42:30,Yotta Savings,S20,Behavioral psychology to help people save,0,"[https://www.withyotta.com/, https://en.wikipe...",item?id=23780062,"Hey HN! We are Adam & Ben, co-founders of Yott...",240,Healthcare
2,23821502,pgao,2020-07-13 15:05:41,Aquarium,S20,Improve your ml dataset quality,0,"[https://www.aquariumlearning.com/, https://me...",item?id=23821502,Hi everyone! I’m Peter from Aquarium (https://...,167,DevTools
3,23833441,liorgrossman,2020-07-14 15:57:52,Openbase,S20,Reviews and insights for open-source packages,1,[https://openbase.io],item?id=23833441,"Hi everyone! I'm Lior, one of the makers of Op...",148,DevTools
4,23846186,khaledkteily,2020-07-15 15:04:37,Legacy,S19,We help men test and freeze their sperm,0,[https://www.givelegacy.com/],item?id=23846186,"Hi everyone,I'm Khaled Kteily – and I helped f...",193,Consumer


### Comments Sentiment

In [872]:
# Shared NLTK sentiment analyzer instance; analyze_sentiment() below scores every
# comment with it.
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text, pos_threshold = 0.1, neg_threshold = 0.1, analyzer = None):
    """Classify a piece of text as 'pos', 'neg' or 'neu'.

    The text is scored with ``analyzer.polarity_scores(text)``, which must return
    a mapping with 'pos' and 'neg' entries.

    Parameters
    ----------
    text : str
        Text to classify.
    pos_threshold : float, default 0.1
        Cut-off applied to the 'pos' score.
    neg_threshold : float, default 0.1
        Cut-off applied to the 'neg' score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)``. Defaults to the
        module-level ``sia`` analyzer.

    Returns
    -------
    str
        'pos' when the positive score is above its threshold and the negative
        score is below its threshold; 'neg' in the mirrored case; 'neu'
        otherwise (including when a score sits exactly on its threshold).
    """
    # Resolve the default at call time so the function can be defined before `sia`.
    scorer = sia if analyzer is None else analyzer
    scores = scorer.polarity_scores(text)
    pos_score, neg_score = scores['pos'], scores['neg']

    if pos_score > pos_threshold and neg_score < neg_threshold:
        return 'pos'
    if pos_score < pos_threshold and neg_score > neg_threshold:
        return 'neg'
    return 'neu'

# Label each comment's text with analyze_sentiment and store the result in a
# new `sentiment` column.
comments['sentiment'] = comments['comment'].apply(analyze_sentiment)

comments.head()

Unnamed: 0,item_id,comment_id,by,time,level,comment,sentiment
0,23770214,23770525,cpr,2020-07-08 15:00:30,0,Looks fantastic!I can't find any demo videos o...,neu
1,23770214,23770559,charlesdaniels,2020-07-08 15:03:08,0,"This sounds really cool!However, I wonder abou...",neu
2,23770214,23770641,maxehmookau,2020-07-08 15:10:41,0,Interesting that you use a lot of the same ter...,neu
3,23770214,23770681,dorianmariefr,2020-07-08 15:13:29,0,Reminds me of AutoHotKey which I used to autom...,neu
4,23770214,23770732,Digitaltzar,2020-07-08 15:18:46,1,"Hey, we host demo videos on our youtube channe...",neu


### Load to Clickhouse

In [873]:
# Write the enriched launches to ClickHouse, one dict per row.
# NOTE(review): this is a plain INSERT, so re-running the cell presumably
# appends the same rows again — confirm against the table engine.
launch_rows = launches.to_dict('records')
clickhouse.execute('insert into hn_launches.launches values', launch_rows, types_check = True)

  local_timezone = get_localzone().zone


291

In [874]:
# Write the sentiment-labelled comments to ClickHouse, one dict per row.
# NOTE(review): the target table is spelled `commentss` (double "s"); the insert
# evidently succeeds, so the name is kept as-is — confirm it is intentional.
comment_rows = comments.to_dict('records')
clickhouse.execute('insert into hn_launches.commentss values', comment_rows, types_check = True)

23039