# Analyse r/dataengineering

In [None]:
import os
from datetime import datetime
from typing import Optional

import pandas as pd
import praw
import pytz
from tqdm.notebook import tqdm

## Connect to the Reddit API

In [None]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. A missing variable raises KeyError here, before any API call is made.
# NOTE(review): the key pair that used to be hard-coded in this cell should be
# treated as leaked and revoked in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Post_Extractor",
)

## Select subreddit - Data Engineering

In [None]:
# Handle for r/dataengineering; the top, hot and new listings are all requested through it.
reddit_de = reddit.subreddit("dataengineering")

## Function to get posts

In [None]:
def get_posts(reddit_posts_gen: praw.models.listing.generator.ListingGenerator, time_filter: Optional[str] = None, post_type = "top") -> list:
    """Collect the fields of interest from every post in a Reddit listing.

    Parameters
    ----------
    reddit_posts_gen
        Listing to iterate, e.g. the result of ``subreddit.top(...)``. Its
        ``limit`` attribute, when present, is used as the progress-bar total;
        any other iterable of post objects is accepted and shown without a total.
    time_filter
        Time window the listing was requested with. It is only recorded in the
        output when ``post_type == "top"``.
    post_type
        Kind of listing being read. Only the value ``"top"`` changes the output
        (it adds the ``top_time_filter`` key).

    Returns
    -------
    list
        One dict per post with the keys ``author``, ``title``, ``description``,
        ``up votes``, ``down_votes``, ``post_timestamp_utc``, ``num_comments``,
        ``post_link`` and ``post_url``, plus ``top_time_filter`` for top listings.
    """
    # Fall back to an open-ended progress bar when the iterable has no ``limit``.
    total_posts = getattr(reddit_posts_gen, "limit", None)
    record_time_filter = post_type == "top"

    posts = []
    for post in tqdm(reddit_posts_gen, total = total_posts):
        post_args = {
            # ``post.author`` can be falsy, so guard the ``.name`` access.
            "author": post.author.name if post.author else None,
            "title": post.title,
            "description": post.selftext,
            # The key contains a space on purpose: later cells select the
            # column by this exact name.
            "up votes": post.ups,
            "down_votes": post.downs,
            "post_timestamp_utc": post.created_utc,
            "num_comments": post.num_comments,
            "post_link": post.permalink,
            "post_url": post.url,
        }
        if record_time_filter:
            post_args["top_time_filter"] = time_filter
        posts.append(post_args)
    return posts

## Load all posts into dataframe

### Get top posts

In [None]:
# Time windows requested from the "top" listing, narrowest first.
time_filters = ["hour", "day", "week", "month", "year", "all"]

all_top_posts = []
for time_filter in time_filters:
    top_listing = reddit_de.top(time_filter=time_filter)
    top_posts = get_posts(reddit_posts_gen=top_listing, time_filter=time_filter)
    all_top_posts += top_posts

# The same post can show up under several windows; only its first row is kept.
top_posts_frame = pd.DataFrame(all_top_posts)
df_top_posts = top_posts_frame.drop_duplicates(subset=["title", "post_timestamp_utc"])

In [7]:
# Preview the de-duplicated top posts (the rich display elides the middle rows).
df_top_posts

Unnamed: 0,author,title,description,up votes,down_votes,post_timestamp_utc,num_comments,post_link,post_url,is_self,top_time_filter
0,dfragnito,A Schemaless Data Store within YOUR SQL Database,,0,0,1.640033e+09,0,/r/dataengineering/comments/rkwqin/a_schemales...,https://schemafreesql.com/,False,hour
1,zer0crash,What did you guys wish you knew before impleme...,So I'm in charge of the dev/dataengineering/de...,64,0,1.639985e+09,29,/r/dataengineering/comments/rkhevl/what_did_yo...,https://www.reddit.com/r/dataengineering/comme...,True,day
2,roohitavaf,Seven Reading Suggestions for the Holidays (on...,,41,0,1.639959e+09,10,/r/dataengineering/comments/rk9qdk/seven_readi...,https://www.mydistributed.systems/2021/12/holi...,False,day
3,twopairisgood,The Guide to Data Versioning,,19,0,1.639955e+09,0,/r/dataengineering/comments/rk84l6/the_guide_t...,https://medium.com/whispering-data/the-guide-t...,False,day
4,BadGuyBadGuy,Have you ever had a predecessor who impressed ...,I'm asking because it seems common to hear the...,16,0,1.640024e+09,10,/r/dataengineering/comments/rktdy4/have_you_ev...,https://www.reddit.com/r/dataengineering/comme...,True,day
...,...,...,...,...,...,...,...,...,...,...,...
407,sanchit089,Want to learn Data Engineering? Here are some ...,,101,0,1.582680e+09,5,/r/dataengineering/comments/f9l209/want_to_lea...,https://github.com/san089/Udacity-Data-Enginee...,False,all
408,rckahuna,"CMV: Data Engineers should code, not build ad-...",Or call it Data Analyst.,101,0,1.635980e+09,57,/r/dataengineering/comments/qm6jpa/cmv_data_en...,https://www.reddit.com/r/dataengineering/comme...,True,all
409,Lostwhispers05,Is it me or are beginner-friendly ETL pipeline...,So this is something I've been struggling with...,101,0,1.627009e+09,19,/r/dataengineering/comments/opt99w/is_it_me_or...,https://www.reddit.com/r/dataengineering/comme...,True,all
410,porcelainsmile,Open source contributions for a Data Engineer?,What are some good git projects that a Data En...,101,0,1.618580e+09,57,/r/dataengineering/comments/ms33t0/open_source...,https://www.reddit.com/r/dataengineering/comme...,True,all


In [6]:
# Number of top posts per author, most frequent first.
df_top_posts["author"].value_counts()

Data_Cog            5
joseph_machado      5
dataengineerdude    4
tmccormick92        4
BlancBryn           4
                   ..
TreapPeep           1
Strijdhagen         1
PhSon               1
kkjeb               1
rckahuna            1
Name: author, Length: 261, dtype: int64

### Get hot posts

In [7]:
# "Hot" listing: no time window applies, so none is recorded on the rows.
hot_listing = reddit_de.hot()
hot_posts = get_posts(reddit_posts_gen=hot_listing, post_type="hot")

hot_posts_frame = pd.DataFrame(hot_posts)
df_hot_posts = hot_posts_frame.drop_duplicates(subset=["title", "post_timestamp_utc"])

  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
# First five hot posts.
df_hot_posts.head(n=5)

Unnamed: 0,author,title,description,up votes,down_votes,post_timestamp_utc,num_comments,post_link,post_url
0,AutoModerator,Quarterly Salary Discussion,This is a recurring thread that happens quarte...,40,0,1638378000.0,150,/r/dataengineering/comments/r6jfnm/quarterly_s...,https://www.reddit.com/r/dataengineering/comme...
1,SeaworthinessFit7893,What is Kubernetes used for in data engineering?,Im curious as do why people use kubernetes in ...,31,0,1639843000.0,14,/r/dataengineering/comments/rja8s6/what_is_kub...,https://www.reddit.com/r/dataengineering/comme...
2,Suspicious-Use7032,Working as an etl developer using informatica ...,I am working as a data engineer (data integrat...,12,0,1639840000.0,4,/r/dataengineering/comments/rj9en6/working_as_...,https://www.reddit.com/r/dataengineering/comme...
3,BlancBryn,Tools or frameworks to simply trigger python f...,"Hi engineers,\nI am looking for a way to trigg...",2,0,1639854000.0,0,/r/dataengineering/comments/rje71b/tools_or_fr...,https://www.reddit.com/r/dataengineering/comme...
4,TheLastKingofReddit,Data warehouse - Normalized vs denormalized fa...,I'm designing a data warehouse to hold survey ...,2,0,1639854000.0,0,/r/dataengineering/comments/rjdwfi/data_wareho...,https://www.reddit.com/r/dataengineering/comme...


### Get new posts

In [9]:
# "New" listing. The post type is labelled "new" (it was copy-pasted as "hot");
# like "hot", it records no time filter on the rows.
new_posts = get_posts(
    reddit_posts_gen = reddit_de.new(),
    post_type = "new"
)
df_new_posts = pd.DataFrame(new_posts).drop_duplicates(subset=["title", "post_timestamp_utc"])

  0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
# First five new posts.
df_new_posts.head(n=5)

Unnamed: 0,author,title,description,up votes,down_votes,post_timestamp_utc,num_comments,post_link,post_url
0,BlancBryn,Tools or frameworks to simply trigger python f...,"Hi engineers,\nI am looking for a way to trigg...",2,0,1639854000.0,0,/r/dataengineering/comments/rje71b/tools_or_fr...,https://www.reddit.com/r/dataengineering/comme...
1,odahat,Why use Segment instead of just doing it manua...,"Hello, I am pretty junior in data engineering ...",1,0,1639854000.0,2,/r/dataengineering/comments/rjdwww/why_use_seg...,https://www.reddit.com/r/dataengineering/comme...
2,TheLastKingofReddit,Data warehouse - Normalized vs denormalized fa...,I'm designing a data warehouse to hold survey ...,2,0,1639854000.0,0,/r/dataengineering/comments/rjdwfi/data_wareho...,https://www.reddit.com/r/dataengineering/comme...
3,SeaworthinessFit7893,What is Kubernetes used for in data engineering?,Im curious as do why people use kubernetes in ...,30,0,1639843000.0,14,/r/dataengineering/comments/rja8s6/what_is_kub...,https://www.reddit.com/r/dataengineering/comme...
4,Minimum-Membership-8,Data architect with snowflake,How essential is a data architect in a snowfla...,0,0,1639842000.0,2,/r/dataengineering/comments/rj9w9l/data_archit...,https://www.reddit.com/r/dataengineering/comme...


## Analyse Data

### Get top posters

#### Authors with the most top posts

In [11]:
# Five authors with the most top posts, shown as a one-column table.
top_author_counts = df_top_posts["author"].value_counts().head()
pd.DataFrame(top_author_counts)

Unnamed: 0,author
Data_Cog,5
joseph_machado,5
dataengineerdude,4
tmccormick92,4
BlancBryn,4


#### Authors with the most up-votes

In [12]:
# Total up-votes per author across the de-duplicated top posts, ten highest first.
# The column is selected before summing: `sum()` takes no column name, so
# `sum("up votes")` handed the string to `numeric_only` and aggregated every
# numeric column just to keep one of them.
(
    df_top_posts
    .groupby("author")["up votes"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

Unnamed: 0_level_0,up votes
author,Unnamed: 1_level_1
joseph_machado,814
noNSFWcontent,683
kuwala-io,681
mitchum_,568
Legitimate-Cry2837,559
ohammou,532
ali_azg,529
adgezaza87,484
blef__,481
AaronSWE,463


In [13]:
# Text to classify for each top post: title and body joined by a single space.
df_top_posts["topic"] = df_top_posts["title"] + " " + df_top_posts["description"]

In [21]:
# Text to classify for each hot post: title and body joined by a single space.
df_hot_posts["topic"] = df_hot_posts["title"] + " " + df_hot_posts["description"]

In [22]:
# Text to classify for each new post: title and body joined by a single space.
df_new_posts["topic"] = df_new_posts["title"] + " " + df_new_posts["description"]

### Get categories from title and description

In [14]:
from transformers import pipeline

In [15]:
# Sentence pattern handed to the zero-shot classifier as `hypothesis_template`;
# the `{}` placeholder is where each candidate label is substituted.
hypothesis_template = 'This post in data engineering is discussing {}.'

In [50]:
# Zero-shot classification pipeline using the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    hypothesis_template=hypothesis_template,
)

In [51]:
# Labels the classifier scores each post against. Defined here so the cell runs
# on a fresh kernel; the values are the labels shown in this cell's recorded output.
candidate_labels = [
    "data transform",
    "ETL jobs",
    "learning",
    "data orchestration",
    "career",
    "CI/CD",
    "data ingestion",
    "data lakehouse",
    "scheduling",
    "data mesh",
    "streaming",
    "data warehouse",
    "container orchestration",
    "data lake",
]

count = 10  # positional index of the new post used as the sample
sequence = df_new_posts.topic.iloc[count][:500]  # only the first 500 characters are classified

# multi_label=True: more than one label may apply to the same post.
classifier(sequence, candidate_labels, multi_label=True)

{'sequence': 'Do you unit test your ETL pipelines? As title suggests. Do you write unit tests to sense check if units of your ETL code (e.g. custom transformation functions) perform just as expected?\n\n[View Poll](https://www.reddit.com/poll/rin780)',
 'labels': ['data transform',
  'ETL jobs',
  'learning',
  'data orchestration',
  'career',
  'CI/CD',
  'data ingestion',
  'data lakehouse',
  'scheduling',
  'data mesh',
  'streaming',
  'data warehouse',
  'container orchestration',
  'data lake'],
 'scores': [0.8619182109832764,
  0.7931176424026489,
  0.5037088394165039,
  0.49432599544525146,
  0.1772947460412979,
  0.08029742538928986,
  0.07847443222999573,
  0.06400887668132782,
  0.05714365839958191,
  0.0549035407602787,
  0.03209041804075241,
  0.019350597634911537,
  0.016374895349144936,
  0.005799442529678345]}

### Topics:
- dbt (transformation)
- kubernetes (container orchestration)
- Airflow (scheduling)


In [None]:
# NOTE(review): unfinished scratch note, commented out because `dbt` is not a
# defined name and the bare expression raised NameError when the cell ran.
# Presumably it sketches a score threshold for tagging a post with the "dbt"
# topic — TODO confirm and implement.
# dbt > 0.9