In [1]:
import numpy as np
import pandas as pd
import os
import re
from bs4 import BeautifulSoup

In [2]:
# Directory holding the scraped CSV exports; the notebook also writes its
# own result files into this directory further down.
DATA_FOLDER = 'data/'

# Columns kept for every parsed comment, in output order.
columns = [
    'web-scraper-order',
    'text',
    'section',
    'topic',
    'urls',
    'userMentions',
    'isReply',
    'emojis',
]

# Number of comments to draw from each section when building the sample
# (the counts add up to 500).
distribution = dict(
    bugreports=9,
    chat=28,
    collections=8,
    help=31,
    ligo=5,
    notes=400,
    science=15,
    virgo=4,
)

In [3]:
def check_None(el):
    """Return ``el`` unchanged when it is truthy, otherwise ``None``.

    Every falsy input (``''``, ``False``, ``0``, empty containers, ``None``)
    collapses to ``None``.
    """
    if not el:
        return None
    return el

def get_content(row):
    """Parse one scraped comment row and fill in the derived columns.

    Reads ``row['body']`` (the comment's HTML) and ``row['links']`` and sets
    ``text``, ``topic``, ``urls``, ``userMentions``, ``isReply`` and
    ``emojis`` on the row. Multi-valued fields are joined with ``';'``; every
    field that ends up empty/falsy is stored as ``None`` (see ``check_None``),
    so ``isReply`` is either ``True`` or ``None``.

    Args:
        row: a mapping/Series with at least the keys ``'body'`` and ``'links'``.

    Returns:
        The same ``row`` object with the derived fields assigned.

    Raises:
        AttributeError: if the HTML has no ``div.talk-comment-content`` or no
            nested ``div.markdown`` element.
    """
    body = BeautifulSoup(row['body'], 'html.parser')
    content = body.find('div', {'class': 'talk-comment-content'})
    text_content = content.find('div', {'class': 'markdown'})

    text = text_content.text
    row['text'] = check_None(text)
    row['topic'] = row['links']

    # href=True only matches anchors that actually carry an href, so an
    # <a> without one no longer raises KeyError. Links to .png files are skipped.
    hrefs = [
        a['href']
        for a in content.find_all('a', href=True)
        if '.png' not in a['href']
    ]
    row['urls'] = check_None(';'.join(hrefs))

    # Search the raw string rather than row['text']: the latter is None for an
    # empty comment, which would make re.findall raise TypeError.
    row['userMentions'] = check_None(';'.join(re.findall(r'\@\S+', text)))

    row['isReply'] = check_None(body.find('div', {'class': 'talk-comment-reply'}) is not None)

    # 'alt': True restricts the match to emoji images that have an alt text.
    emoji_imgs = text_content.find_all('img', {'class': 'emoji', 'alt': True})
    row['emojis'] = check_None(';'.join(img['alt'] for img in emoji_imgs))
    return row

def get_comments_df(df):
    """Apply ``get_content`` to each row of ``df`` and keep only the output columns.

    Returns a new DataFrame restricted to (and ordered by) the module-level
    ``columns`` list.
    """
    parsed = df.apply(get_content, axis='columns')
    return parsed[columns]

def read_data(folder, exclude=('all_data.csv', 'sample.csv', 'difference.csv')):
    """Load and parse every raw CSV export in ``folder`` into one DataFrame.

    Each file is read with ``pd.read_csv``, tagged with a ``section`` taken
    from its file name (the part before the first ``'.'`` and before any
    ``'['``), and passed through ``get_comments_df``.

    Args:
        folder: directory containing the scraped CSV files.
        exclude: file names to skip. Defaults to the result files this
            notebook writes into the same directory, which do not have the
            raw ``body`` column and would break a re-run.

    Returns:
        A DataFrame with the module-level ``columns``. Row indexes of the
        individual files are preserved (not reset). An empty frame with the
        same columns is returned when no input file is found.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded sampling downstream) reproducible.
    for file in sorted(os.listdir(folder)):
        if not file.lower().endswith('.csv') or file in exclude:
            continue
        df = pd.read_csv(os.path.join(folder, file))
        df['section'] = file.split('.')[0].split('[')[0]
        frames.append(get_comments_df(df))

    if not frames:
        return pd.DataFrame(columns=columns)
    # A single concat replaces DataFrame.append, which was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames)

In [4]:
%%time
# Parse every file in DATA_FOLDER into a single frame of comments.
# The recorded run of this cell took roughly 8 minutes of wall time.
all_data = read_data(DATA_FOLDER)
all_data

Wall time: 7min 57s


Unnamed: 0,web-scraper-order,text,section,topic,urls,userMentions,isReply,emojis
0,1604198576-694,Hello - yes to the above suggestions. Please p...,bugreports,Emails from Zooniverse projects now fail to ar...,https://status.zooniverse.org,,,
1,1604198689-999,"hey @team , why is there audio in the field g...",bugreports,audio in field guide,,@team,,
2,1604198607-767,"I should say, you have to be trying to go from...",bugreports,Frame 4 button,,,,
3,1604198528-563,I believe that there are some problems at the ...,bugreports,Possible level-up bug,,,,
4,1604198895-1630,Thank you for flagging this and your construct...,bugreports,Opting out of the level-up popups?,https://github.com/zooniverse/Panoptes-Front-E...,,,
...,...,...,...,...,...,...,...,...
14,1604198211-259,"Scattered Light comes in many, many guises in ...",virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,,True,
15,1604198222-261,"For proposing new Virgo glitch classes, please...",virgo,Process for Making New Virgo Classes Official,https://www.zooniverse.org/projects/zooniverse...,,,
16,1604198211-260,Horizontal lines at medium to high frequencies...,virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,,True,
17,1604198211-257,"On a related note, @dziakj1 has discovered (an...",virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,@dziakj1,True,


In [5]:
# Number of rows per section.
print(np.unique(all_data['section'], return_counts=True))
# Number of missing values per column.
print(all_data.isna().sum())
# Summary statistics for every column.
all_data.describe()

(array(['bugreports', 'chat', 'collections', 'help', 'ligo', 'notes',
       'science', 'virgo'], dtype=object), array([  382,  1155,   339,  1348,   193, 86370,   695,    19],
      dtype=int64))
web-scraper-order        0
text                     0
section                  0
topic                    0
urls                 14974
userMentions         86088
isReply              85632
emojis               88039
dtype: int64


Unnamed: 0,web-scraper-order,text,section,topic,urls,userMentions,isReply,emojis
count,90501,90501,90501,90501,75527,4413,4869,2462
unique,90500,62865,8,54530,62938,1077,1,147
top,1604684664-937,50sb\n,notes,Subject 3777930,/projects/zooniverse/gravity-spy/talk/tags/pos...,@EcceruElme,True,😀
freq,2,808,86370,36,508,1166,4869,918


In [6]:
def get_sample_df(data=None, sizes=None, random_state=42):
    """Draw a fixed-size random sample from each section and stack the results.

    Args:
        data: DataFrame with a ``'section'`` column. Defaults to the
            notebook-level ``all_data``.
        sizes: mapping ``section -> number of rows to draw``. Defaults to the
            notebook-level ``distribution``.
        random_state: seed passed to ``DataFrame.sample`` for every section,
            so repeated calls return the same rows.

    Returns:
        The per-section samples concatenated in the iteration order of
        ``sizes``, with a fresh 0..n-1 index.

    Raises:
        ValueError: (from ``DataFrame.sample``) if a section has fewer rows
            than requested.
    """
    # Globals are resolved at call time so a bare get_sample_df() behaves as before.
    if data is None:
        data = all_data
    if sizes is None:
        sizes = distribution

    parts = [
        data[data['section'] == section].sample(count, random_state=random_state)
        for section, count in sizes.items()
    ]
    return pd.concat(parts).reset_index(drop=True)

In [7]:
# Build the per-section sample from all_data and display it.
sample_df = get_sample_df()
sample_df

Unnamed: 0,web-scraper-order,text,section,topic,urls,userMentions,isReply,emojis
0,1604198662-928,Hi did you fix this bug?\n,bugreports,Subject 11057234,,,True,
1,1604198796-1345,I have never been quite certain that the X ax...,bugreports,RFE: An easy way to read off feature frequencies,,,,
2,1604198776-1275,\nBecause I can understand it is of immense in...,bugreports,Statistics not updating on Gravity Spy home page,,,True,😦
3,1604198884-1593,"And working again.\n(A Schrödinger tool, as we...",bugreports,Gravity Spy Tools down,,,,😃
4,1604197902-62,I did occasionally. Reloading the site helps t...,bugreports,Already Seen keeps coming up,/users/eperozzi;/users/sbc538,@eperozzi;@sbc538,,
...,...,...,...,...,...,...,...,...
495,1604199043-1992,Yep -- this is a really interesting trigger! V...,science,Read the press release about latest #Gravitati...,,,True,
496,1604198211-254,We've seen many pretty moiré artefacts from LI...,virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,,True,
497,1604198190-241,"Hello,\nSimilar to what we have observed for 2...",virgo,Viola - Virgo (O2a),https://www.zooniverse.org/projects/zooniverse...,,,
498,1604198211-253,"Thick (for a Q45 feature) braids at 19Hz, some...",virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,@PepBlanquer,True,


In [8]:
# Number of sampled rows per section.
print(np.unique(sample_df['section'], return_counts=True))
# Number of missing values per column in the sample.
print(sample_df.isna().sum())
# Summary statistics for every column of the sample.
sample_df.describe()

(array(['bugreports', 'chat', 'collections', 'help', 'ligo', 'notes',
       'science', 'virgo'], dtype=object), array([  9,  28,   8,  31,   5, 400,  15,   4], dtype=int64))
web-scraper-order      0
text                   0
section                0
topic                  0
urls                 110
userMentions         461
isReply              452
emojis               480
dtype: int64


Unnamed: 0,web-scraper-order,text,section,topic,urls,userMentions,isReply,emojis
count,500,500,500,500,390,39,48,20
unique,500,474,8,485,372,27,1,9
top,1604576473-5338,#repeatingblips\n,notes,Pulsar glitches,/users/EcceruElme,@EcceruElme,True,😃
freq,1,4,400,3,6,9,48,7


In [10]:
# Persist the full data set and the sample, without the index column.
# Both files are written into DATA_FOLDER, the same directory that
# read_data() scans for its input files.
all_data.to_csv(os.path.join(DATA_FOLDER, 'all_data.csv'), index=False)
sample_df.to_csv(os.path.join(DATA_FOLDER, 'sample.csv'), index=False)

In [5]:
# Reload both frames from the exported CSV files.
# NOTE(review): presumably done so that both frames went through the same CSV
# round trip and their rows compare equal -- confirm.
all_data = pd.read_csv(os.path.join(DATA_FOLDER, 'all_data.csv'))
sample_df = pd.read_csv(os.path.join(DATA_FOLDER, 'sample.csv'))

# Rows of all_data that are not part of the sample: every sampled row appears
# twice in the concatenation and keep=False drops all copies of a duplicate.
difference_df = pd.concat([all_data, sample_df]).drop_duplicates(keep=False)

# keep=False would also silently drop rows that are duplicated inside all_data
# itself, and would keep sampled rows that no longer compare equal; fail
# loudly instead of writing a wrong remainder.
assert len(difference_df) == len(all_data) - len(sample_df), (
    'difference_df does not contain exactly the rows of all_data that are missing from sample_df'
)
print(sample_df.shape, difference_df.shape)

(500, 8) (90001, 8)


In [6]:
# Persist the non-sampled remainder next to the other exports, without the index column.
difference_df.to_csv(os.path.join(DATA_FOLDER, 'difference.csv'), index=False)

In [17]:
# Ad-hoc lookup of a single comment by its web-scraper-order id.
all_data[all_data['web-scraper-order'] == '1604200055-3004']

Unnamed: 0,web-scraper-order,text,section,topic,urls,userMentions,isReply,emojis
1991,1604200055-3004,10070989 \n,help,Sub-spectrogram wave,/projects/zooniverse/gravity-spy/talk/subjects...,,,😎
