In [1]:


from util.streamer import line_gen
import pandas as pd
from glob import glob
from itertools import chain
import ujson as json
from util.util import cache
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
import stop_words



In [2]:
stopwords = set(stop_words.get_stop_words('en'))
stopwords.update(['quote', 'pmquote', 'amquote', 'just', 'don', 'one', 'thing', 'even', 'way', 'maybe', 'also', 'please', 'well', 'actually', 'something',
                                         'going', 'anything', 'le', 'ever', 'say', 'see', 'likely', 'per', 'another', 'someone', 'let', 'anyone', 'doesn', 'include', 'doe', 'exactly',
                                         'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'like',
                                         'said', 'guy', 'will', 'can', 'able', 'people', 'become', 'tell', 'hey', 'much', 'many', 'lol', 'lot', 'want', 'still', 'really', 'think', 'didn',
                                         'isn', 'post', 'edited', 'share', 'facebookshare', 'twitter', 'monday', 'tuedsday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'])
                                        # adding exclusions to 'bitcion' because presumably, the entire dataset contains
                                        # bitcoin-related tweets, which means mentions of bitcoin shouldn't add any value
lemmatizer = WordNetLemmatizer()


def parse_string(input_string):
    input_string = input_string.lower()
    # remove links
    input_string = re.sub(r'http\S+', ' ', input_string)
    input_string = re.sub(r'\S+.(com|org)', '', input_string)
    # remove all non-English alphabet characters including numbers,
    # foreign and special characters
    input_string = re.sub( "[^a-zA-Z]", " ", input_string).split()

    # lemmatize word
    words = [lemmatizer.lemmatize(w) for w in input_string]
    # get rid of stopwords and words less than 3 characters
    words = [w for w in words if w not in stopwords and len(w) > 2]

    # fix common misspellings of bitcoin
    words = ["bitcoin" if w == "bitcoins" else w for w in words]
    words = ["bitcoin" if w == "itcoin" else w for w in words]

    return words


In [3]:
# favorite count being number of likes
use_fields = ['created_at', 'in_reply_to_user_id', 'retweet_count', 'favorite_count']
user_fields = ['id', 'verified', 'followers_count']


filenames = glob("data/BTC/json/*.jsonl")
files = [open(filename, 'r') for filename in filenames]

jsonl_gen = chain(*files)

posts_lst = []
comm_lst = []

In [4]:
n_retweets = 0

for i, line in enumerate(jsonl_gen):

    line = json.loads(line)

    # filter out non-english posts
    if line['lang'] != 'en':
        continue

    print(f"\rposts: {len(posts_lst) - n_retweets} comments: {len(comm_lst)} retweets: {n_retweets}", end='')
    
    full_text = parse_string(line['full_text'])
    
    row = {}


    # skip posts that are less than 5 words
    if len(full_text) < 5:
        continue
    else:
        row['full_text'] = full_text


    for col in use_fields:
        row[col] = line[col]

    for col in user_fields:
        row[col] = line['user'][col]

    # RETWEETS
    if 'retweeted_status' in line.keys():
        # if it's a retweet, grab the id of the original tweet user
        row['retweet_id'] = line['retweeted_status']['user']['id']
        n_retweets += 1

    # POSTS
    if type(line['in_reply_to_user_id']) != int:
        posts_lst.append(row)
        # COMMENTS
    elif line['in_reply_to_user_id'] != None:
        comm_lst.append(row)

    if i > 4000000:
        break



posts: 2804189 comments: 536494 retweets: 37698

In [5]:
cache(posts_lst, 'posts_lst')
cache(comm_lst, 'comm_lst')

In [6]:
df = pd.DataFrame(posts_lst + comm_lst)

In [7]:
cache(df, 'btc_data')

In [8]:
df.columns

Index(['full_text', 'created_at', 'in_reply_to_user_id', 'retweet_count',
       'favorite_count', 'id', 'verified', 'followers_count', 'retweet_id'],
      dtype='object')