In [16]:
from app.scrape_functions import read_from_bucket
import pandas as pd
from google.cloud import storage
import spacy
from spacy.pipeline import merge_noun_chunks, merge_entities
from datetime import date
from collections import defaultdict

# Load the small English pipeline once; every cell below shares this `nlp` object.
# The dependency parser (needed for noun chunks) is part of the pipeline by
# default, so no extra flag is passed: `parser=True` is not a spacy.load option
# and spaCy v3 rejects unknown keyword arguments with a TypeError.
nlp = spacy.load('en_core_web_sm')

## Import data from GCP bucket

In [7]:
# Name of the Google Cloud Storage bucket that holds the tweets.
bucket_name = 'uk-gov-tweets-14289'
# Authenticate with a service-account key file. The path is relative, so it is
# resolved against the kernel's current working directory.
storage_client = storage.Client.from_service_account_json('app/creds.json')
bucket = storage_client.get_bucket(bucket_name)
# Project helper from app.scrape_functions. The cells below use its result as a
# pandas DataFrame with (at least) 'text', 'created' and 'user' columns.
data = read_from_bucket(bucket=bucket)

## Data cleaning

In [8]:
# 'created' holds timestamp strings such as '2021-09-19 21:42:55'. Keep only the
# leading 'YYYY-MM-DD' part and parse it, so 'date' is a datetime column at day
# resolution. The vectorised .str accessor passes missing values through (they
# become NaT) instead of raising the way a per-row slice would.
data['date'] = pd.to_datetime(data['created'].str[:10])

In [9]:
# Preview the first rows to check the freshly added 'date' column.
data.head()

Unnamed: 0,id,text,created,user,date
0,1439706594888007683,This week at #UNGA I will be making the case t...,2021-09-19 21:42:55,Boris Johnson,2021-09-19
1,1439693329281818631,It was an honour to present the Groundbreaking...,2021-09-19 20:50:12,Boris Johnson,2021-09-19
2,1439630463988125700,Our heroic NHS staff have done an incredible j...,2021-09-19 16:40:24,Boris Johnson,2021-09-19
3,1439546304233934858,Sad to hear the news about Jimmy Greaves.\n\nH...,2021-09-19 11:05:59,Boris Johnson,2021-09-19
4,1439534497788317699,There are just two weeks to go until Conservat...,2021-09-19 10:19:04,Boris Johnson,2021-09-19


## Data pre-processing

In [58]:
# Extra words to treat as stop words on top of the model's defaults
# ('RT' is the marker that prefixes retweets).
customize_stop_words = [
    'RT'
]

# Flag each word's vocabulary entry as a stop word, so token.is_stop is True
# for it when process_tweet filters tokens. This mutates the shared `nlp` vocab.
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

def process_tweet(text):
    """Return the tokens of a tweet that are neither stop words nor punctuation.

    The text is run through the shared `nlp` pipeline and noun chunks are
    merged into single tokens before filtering. The result is a list of spaCy
    tokens in document order; no lemmatisation is applied.
    """
    merged_doc = merge_noun_chunks(nlp(text))

    kept = []
    for tok in merged_doc:
        # Skip anything flagged as a stop word or as punctuation.
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok)
    return kept

In [60]:
# Spot-check the pre-processing on a single tweet (row label 11).
tweet = data.loc[11, 'text']
process_tweet(tweet)

[The UK,
 the UAE,
 share,
 a long and rich history,
 His Highness,
 @MohamedBinZayed,
 I,
 focused,
 the future,
 https://t.co/4DBGZTvaM1]

In [27]:
# Show how one tweet (row label 18) is tokenised after merge_entities has been
# applied: each token's text is printed on its own line.
tweet = data.loc[18, 'text']
for token in merge_entities(nlp(tweet)):
    print(token.text)

4/
The
NHS
will
continue
to
get
the
support
it
needs
,
with
an
extra
£
5.4bn
in
England
for
the next six months
alone
…
https://t.co/AwtWsjy0bp


TODO:
* Tokenize
* Remove the 'RT' retweet marker
* Remove stopwords
* Lemmatize

## Entity Extraction Block

In [63]:
def extract_entities(text):
    """Return the text of every named entity found in `text`.

    The string is parsed with the shared `nlp` pipeline; the result is a list
    of plain strings (one per entity, in document order), without labels.
    """
    found = nlp(text).ents
    return [span.text for span in found]

In [52]:
# Build (a) a per-tweet list of entity strings on `data` and (b) a long-format
# table with one row per (tweet, entity) pair, including the entity label.
#
# The labels are only available on spaCy's entity spans, not on the plain
# strings that extract_entities returns, so each tweet is parsed here once and
# both outputs are derived from the same parse. nlp.pipe streams the texts
# through the pipeline in batches.
ENTITY_COLUMNS = ['date', 'user', 'entity', 'label']

entity_texts = []   # one list of entity strings per tweet, in row order
records = []        # one dict per entity occurrence

for tweet_date, user, doc in zip(data['date'], data['user'], nlp.pipe(data['text'])):
    entity_texts.append([ent.text for ent in doc.ents])
    for ent in doc.ents:
        # Skip entities with empty text, as the original loop did.
        if ent.text == "":
            continue
        records.append({
            'date': tweet_date,
            'user': user,
            'entity': ent.text,
            'label': ent.label_,
        })

# Same content as data['text'].apply(extract_entities): lists of strings.
data['entities'] = entity_texts

# Passing the column list keeps the schema stable even if no entity was found.
entities = pd.DataFrame(records, columns=ENTITY_COLUMNS)
entities.head()

Unnamed: 0,date,user,entity,label
0,2021-09-19,Boris Johnson,This week,DATE
1,2021-09-19,Boris Johnson,UNGA,ORG
2,2021-09-19,Boris Johnson,NHS,ORG
3,2021-09-19,Boris Johnson,Jimmy Greaves,PERSON
4,2021-09-19,Boris Johnson,one,CARDINAL
