In [1]:
import sys
import os
import json
import langdetect
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.dicts.noslang.slangdict import slangdict
import re
import datetime
from collections import defaultdict
import math
import progressbar

In [4]:
# ekphrasis pipeline used to tokenize and normalize raw tweet text.
text_processor = TextPreProcessor(
    # Tokenizer callable: takes a string and returns a list of tokens.
    # SocialTokenizer is used here; a custom callable could be passed instead.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # Terms that will be normalized.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date'],

    # Terms that will be annotated.
    annotate=['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'],

    # Fix HTML tokens.
    fix_html=True,

    # Corpora supplying the word statistics for word segmentation and for
    # spell correction, respectively.
    segmenter="twitter_2018",
    corrector="twitter_2018",

    # Hashtags are word-segmented and contractions unpacked (can't -> can not).
    unpack_hashtags=True,
    unpack_contractions=True,

    # Spell correction, including for elongated words.
    spell_correction=True,
    spell_correct_elong=True,

    # Dictionaries used to replace tokens extracted from the text with other
    # expressions; more than one dictionary may be supplied.
    dicts=[emoticons, slangdict],
)

# Separate segmenter over the same corpus, applied to individual tokens later on.
segmenter = Segmenter(corpus="twitter_2018")

Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...
Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...


In [3]:
def contains_words(sent, words):
    """Return True if at least one entry of ``words`` occurs in ``sent``.

    Membership is tested with the ``in`` operator, so for a string ``sent``
    this is a substring check. An empty ``words`` yields False.
    """
    return any(candidate in sent for candidate in words)

# Ordered (pattern, replacement) pairs applied by preprocess(). The order is
# significant: later patterns rely on what earlier ones rewrote or removed.
_PREPROCESS_SUBSTITUTIONS = (
    (r",", " "),
    (r":", ""),
    (r"@ ", "@"),             # re-attach a detached "@" so the whole mention is dropped
    (r"# ", "#"),             # same for hashtags
    (r"\S*@\S*\s?", ""),      # drop every token containing "@" (mentions, e-mails)
    (r"\"", ""),
    (r"'", ""),
    (r"http\S+", ""),
    (r"\w*pic.twitter.co\w*", ""),
    (r"\w*twitter.co\w*", ""),
    (r"\w*twitter.com\w*", ""),
    (r"./\S+", ""),
    (r"#\S+", ""),
    (r"\n+", " "),
    (r"<.*?>", ""),           # angle-bracket tags such as <user> or <hashtag>
    (r"co vid", "covid"),
    (r"\ss\s", " 's "),       # restore clitics that were split off as bare letters
    (r"\sm\s", " 'm "),
    (r"\sll\s", " 'll "),
    (r"\st\s", " 't "),
    (r"\sd\s", " 'd "),
    (r"\svir\s", " virus "),
    # Remove the retweet marker only as a standalone word; an unanchored "rt"
    # would also eat the letters out of words such as "start" or "earth".
    (r"\brt\b", ""),
)


def preprocess(text):
    """Clean a tweet string and keep it only if it is detected as English.

    The text is lowercased, then each substitution in
    ``_PREPROCESS_SUBSTITUTIONS`` is applied in order (punctuation, mentions,
    URLs, hashtags, tags, clitic repair, retweet marker).

    Args:
        text: The tweet text to clean.

    Returns:
        The cleaned text when ``langdetect.detect`` reports ``'en'``;
        ``None`` when another language is reported or when detection raises
        (for example on text with no usable features).

    NOTE(review): langdetect's README describes detection as non-deterministic
    unless ``DetectorFactory.seed`` is set; consider seeding it if the filter
    must be reproducible.
    """
    text = text.lower()
    for pattern, replacement in _PREPROCESS_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    try:
        detected_language = langdetect.detect(text)
    except Exception:
        # Detection failure is treated as "not English"; unlike a bare
        # ``except`` this still lets KeyboardInterrupt/SystemExit through.
        return None

    return text if detected_language == 'en' else None
                 

In [3]:
# Load the user records; the context manager closes the file handle.
with open('covid_users_100000_non-org.jsonl', 'r') as users_file:
    users = json.load(users_file)

# Count the tweets available across all downloaded user timelines.
tweet_count = 0
for user in users:
    tweet_file = './data/user-timelines/'+user['id_str']+'_tweets.jsonl'
    try:
        with open(tweet_file, 'r') as timeline:
            # One JSON document per line; blank lines are not tweets.
            tweets = [json.loads(line) for line in timeline if line.strip()]
    except (OSError, ValueError):
        # Timeline missing/unreadable, or not valid JSON (json.JSONDecodeError
        # is a ValueError): skip this user, but let any other error surface.
        continue
    tweet_count += len(tweets)

print(tweet_count)

1873022


In [None]:
# Substrings that mark a tweet as COVID-related (matched against lowercased text).
keywords = ["covid", "covid_19", "coronavirus", "corona", "covid-19", "corona virus",
            "chinesevirus", "chinese virus", "chinese_virus", "chinese-virus"]

processed_tweets = []
skipped_tweets = 0  # keyword-matching tweets that lacked an expected field

for user in users:
    tweet_file = './data/user-timelines/'+user['id_str']+'_tweets.jsonl'
    try:
        with open(tweet_file, 'r') as timeline:
            # One JSON document per line; blank lines are not tweets.
            tweets = [json.loads(line) for line in timeline if line.strip()]
    except (OSError, ValueError):
        # Timeline missing/unreadable or not valid JSON: skip this user only.
        continue

    for tweet in tweets:
        try:
            raw_text = tweet['text']
            if not contains_words(raw_text.lower(), keywords):
                continue
            tweet_id = tweet['id']
            user_id = tweet['user']['id']
        except (KeyError, TypeError):
            # Malformed record: skip this tweet rather than the whole user.
            skipped_tweets += 1
            continue

        # Strip non-ASCII characters from the tweet *text*. The tweet itself
        # is a dict and has no .encode(); calling it there raised an error
        # that the old catch-all handler silently swallowed.
        ascii_text = raw_text.encode('ascii', 'ignore').decode('ascii')
        tokens = text_processor.pre_process_doc(ascii_text)
        tokens = [segmenter.segment(t) for t in tokens]
        cleaned_text = preprocess(" ".join(tokens))
        if cleaned_text:
            processed_tweets.append({
                'id': tweet_id,
                'user': user_id,
                'text': cleaned_text,
            })

print('Processed {} tweets.'.format(len(processed_tweets)))
if skipped_tweets:
    print('Skipped {} malformed tweets.'.format(skipped_tweets))

In [23]:
# Show the first five processed tweets as indented JSON.
preview_json = json.dumps(processed_tweets[:5], indent=2)
print(preview_json)

[
  {
    "id": 1240281265322033154,
    "user": 889861054850891776,
    "text": "   revelation  11   6 these men have power .  to strike the earth with every kind of plague as often as they want .   corona virus  \u2026"
  },
  {
    "id": 1240281153921339393,
    "user": 889861054850891776,
    "text": "   we do all we can but these are the beginning of birth pains . our focus is in heaven .  lock down kenya   coronavirus outbreak   c  \u2026"
  },
  {
    "id": 1240014758578925570,
    "user": 38333629,
    "text": "what happens if we have to plan a funeral during a pandemic like corona virus ? scott let 's you know what mueller mem \u2026 "
  },
  {
    "id": 1240548736763408384,
    "user": 66330918,
    "text": "parents were moaning saying schools should close   now they have   they \u2019 reply moaning about them closing .   covid  19 uk   school closures uk "
  },
  {
    "id": 1240317440871014402,
    "user": 66330918,
    "text": "   the news has been pretty occupied with cor

In [24]:
from stanfordcorenlp import StanfordCoreNLP
import logging

class StanfordNLP:
    """Thin wrapper around a ``StanfordCoreNLP`` client.

    Most methods forward the sentence unchanged to the client method of the
    corresponding purpose; ``annotate`` also decodes the JSON response.
    """

    def __init__(self, host='http://localhost', port=9000):
        """Create the client for ``host``/``port`` and store the annotation properties."""
        # Debug options the original author left switched off:
        # quiet=False, logging_level=logging.DEBUG
        self.nlp = StanfordCoreNLP(host, port=port, timeout=30000)
        # Properties sent along with every annotate() request.
        self.props = dict(
            annotators='tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            pipelineLanguage='en',
            outputFormat='json',
        )

    def word_tokenize(self, sentence):
        """Return the client's tokenization of ``sentence``."""
        client = self.nlp
        return client.word_tokenize(sentence)

    def pos(self, sentence):
        """Return the client's part-of-speech tagging of ``sentence``."""
        client = self.nlp
        return client.pos_tag(sentence)

    def ner(self, sentence):
        """Return the client's named-entity tagging of ``sentence``."""
        client = self.nlp
        return client.ner(sentence)

    def parse(self, sentence):
        """Return the client's parse of ``sentence``."""
        client = self.nlp
        return client.parse(sentence)

    def dependency_parse(self, sentence):
        """Return the client's dependency parse of ``sentence``."""
        client = self.nlp
        return client.dependency_parse(sentence)

    def annotate(self, sentence):
        """Annotate ``sentence`` using ``self.props`` and return the decoded JSON."""
        raw_response = self.nlp.annotate(sentence, properties=self.props)
        return json.loads(raw_response)

    @staticmethod
    def tokens_to_dict(_tokens):
        """Index token records by their integer ``'index'``.

        Returns a ``defaultdict(dict)`` mapping ``int(token['index'])`` to a
        dict holding that token's ``word``, ``lemma``, ``pos`` and ``ner``.
        """
        kept_fields = ('word', 'lemma', 'pos', 'ner')
        by_index = defaultdict(dict)
        for entry in _tokens:
            by_index[int(entry['index'])] = {field: entry[field] for field in kept_fields}
        return by_index

# Wrapper instance used by the annotation cells; built with the class defaults
# (host 'http://localhost', port 9000).
nlp = StanfordNLP()

In [25]:
# Attach space-separated lemma and POS strings to every processed tweet.
# Each token contributes " <value>", so non-empty results start with a space.
for tweet in processed_tweets:
    tweet['lemma'] = ""
    tweet['pos'] = ""
    lemma_parts = []
    pos_parts = []
    annotation = nlp.annotate(tweet['text'])
    for sentence in annotation['sentences']:
        for tok in sentence['tokens']:
            lemma_parts.append(" " + tok['lemma'])
            pos_parts.append(" " + tok['pos'])
    tweet['lemma'] = "".join(lemma_parts)
    tweet['pos'] = "".join(pos_parts)
    

In [27]:
# Show the first five tweets again, now including their lemma and POS strings.
annotated_preview_json = json.dumps(processed_tweets[:5], indent=2)
print(annotated_preview_json)

[
  {
    "id": 1240281265322033154,
    "user": 889861054850891776,
    "text": "   revelation  11   6 these men have power .  to strike the earth with every kind of plague as often as they want .   corona virus  \u2026",
    "lemma": " revelation 11 6 these man have power . to strike the earth with every kind of plague as often as they want . corona virus ...",
    "pos": " NN CD CD DT NNS VBP NN . TO VB DT NN IN DT NN IN VB RB RB IN PRP VBP . NN NN :"
  },
  {
    "id": 1240281153921339393,
    "user": 889861054850891776,
    "text": "   we do all we can but these are the beginning of birth pains . our focus is in heaven .  lock down kenya   coronavirus outbreak   c  \u2026",
    "lemma": " we do all we can but these be the beginning of birth pain . we focus be in heaven . lock down kenya coronavirus outbreak c ...",
    "pos": " PRP VBP DT PRP MD CC DT VBP DT NN IN NN NNS . PRP$ NN VBZ IN NN . VB RP NN NN NN NN :"
  },
  {
    "id": 1240014758578925570,
    "user": 38333629,
    "t

In [26]:
# `directory` must already exist in the kernel for this cell to run; on a
# fresh kernel where it was never assigned, fall back to the current working
# directory instead of failing with a NameError.
try:
    output_dir = directory
except NameError:
    output_dir = '.'

# Write all processed tweets to a single indented JSON file.
output_path = os.path.join(output_dir, 'covid_tweets_100000.json')
with open(output_path, 'w') as f:
    json.dump(processed_tweets, f, indent=2)