In [33]:
import pandas as pd
from google.cloud import storage
import spacy
from datetime import date
from collections import defaultdict
import re
import string
from io import StringIO, BytesIO
from urllib.request import Request, urlopen
from functools import reduce, partial
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from time import time


# Load the small English spaCy pipeline once, at module level, so that
# tokenize() can reuse it for every tweet.  All pipeline components are
# loaded by default; the former `parser=True` keyword is not part of the
# spaCy v3 `spacy.load` signature, so it is dropped here.
nlp = spacy.load('en_core_web_sm')


def read_from_bucket(bucket):
    """
    Download every blob in a bucket and concatenate them into one dataframe.

    Each blob returned by ``bucket.list_blobs()`` is downloaded and parsed as
    a UTF-8 encoded CSV file.  No filtering by name or extension is applied,
    so the bucket is expected to contain CSV files only.

    Parameters
    ----------
    bucket
        Object exposing ``list_blobs()`` and ``blob(name)``; the notebook
        passes a bucket obtained from a ``google.cloud.storage`` client.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in listing order.  Every file keeps its
        own row labels (the index is not reset), so labels can repeat.
        An empty dataframe is returned when the bucket holds no blobs.
    """
    frames = []
    for listed in list(bucket.list_blobs()):
        blob = bucket.blob(listed.name)
        payload = blob.download_as_string()
        frames.append(pd.read_csv(BytesIO(payload), encoding='utf-8'))

    # pd.concat raises ValueError on an empty list, so an empty bucket is
    # handled explicitly instead of crashing the notebook.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def return_politician_handles(option='list'):
    """
    Download the politics-social.com follower CSV and return Twitter handles.

    Parameters
    ----------
    option : str, default 'list'
        'list' returns only the cleaned 'Screen name' column; any other
        value returns the whole cleaned dataframe.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The 'Screen name' column (a Series) when ``option == 'list'``,
        otherwise the full dataframe.  In both cases 'Name' has trailing
        whitespace removed and 'Screen name' has its first character dropped.
    """
    req = Request('https://www.politics-social.com/api/list/csv/followers', headers={'User-Agent': 'Mozilla/5.0'})

    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails.
    with urlopen(req) as response:
        csv_text = response.read().decode('utf-8')

    df = pd.read_csv(StringIO(csv_text))

    # The vectorised .str accessor leaves missing values as NaN instead of
    # raising like a per-row lambda would.
    df['Name'] = df['Name'].str.rstrip()
    # Drops the first character of each handle — presumably a leading '@';
    # confirm against the feed.
    df['Screen name'] = df['Screen name'].str[1:]
    politician_handles = df['Screen name']
    print('Politician twitter handles imported.\n')

    if option == 'list':
        return politician_handles
    return df
    
    
    
# Character class used by deEmojify(), compiled once at import time instead
# of being rebuilt on every call.
# NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are far wider
# than emoji.  Together they make this class match every code point from
# U+24C2 upwards (CJK, Hangul, full-width forms, ...).  They are kept as they
# were to preserve existing output; narrow them if non-emoji scripts must
# survive cleaning.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    u"\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # very broad: enclosed alphanumerics up to enclosed ideographs
    u"\U0001f926-\U0001f937"  # gesture pictographs
    u"\U00010000-\U0010ffff"  # every supplementary-plane code point
    u"\u2640-\u2642"          # gender symbols
    u"\u2600-\u2B55"          # miscellaneous symbols
    u"\u200d"                 # zero-width joiner
    u"\u23cf"                 # eject symbol
    u"\u23e9"                 # fast-forward symbol
    u"\u231a"                 # watch
    u"\ufe0f"                 # variation selector-16
    u"\u3030"                 # wavy dash
    "]+",
    re.UNICODE,
)


def deEmojify(data):
    """
    Remove emoji and related symbols from a string.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted (not
    replaced by a space), so text on either side of an emoji is joined.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', data)




def remove_hyperlinks(text):
    """Delete every 'http' that is followed by one or more non-whitespace characters, up to the next whitespace."""
    return re.sub(r"http\S+", "", text)

def remove_punctuation(text):
    """
    Blank out selected punctuation and spell out the ampersand entity.

    Each of ``) ( | ! % ? @ # * , / : ; … -`` becomes a single space, after
    which any remaining '&amp' is rewritten as 'and'.  The ';' that closes
    '&amp;' has already been turned into a space by the first step.
    """
    spaced = re.sub(r'[)(|!%?@#*,/:;…-]', ' ', text)
    return re.sub(r'&amp', 'and', spaced)

def fix_apostrophes(text):
    """Swap every right single quotation mark (’) for a plain ASCII apostrophe."""
    straightened = re.sub(r'’', "'", text)
    return straightened

def remove_whitespace(text):
    """
    Turn each tab, newline and carriage return into a single space.

    Despite the name nothing is deleted: ordinary spaces are left alone and
    runs of spaces are not collapsed here.
    """
    return re.sub(r'[\t\n\r]', ' ', text)

def normalize_whitespace(text):
    """Collapse every run of consecutive space characters into one space."""
    collapsed = re.sub(r' +', ' ', text)
    return collapsed

def strip_space(text):
    """Return `text` without leading or trailing whitespace."""
    trimmed = text.strip()
    return trimmed

def end_with_fullstop(text):
    """
    Make sure `text` ends with a full stop.

    Text already ending in '.' is returned unchanged; anything else,
    including the empty string, gets a '.' appended.
    """
    if text.endswith('.'):
        return text
    return text + '.'

def string_process(text):
    """
    Clean one raw tweet string and return it lower-cased.

    The cleaning steps run in a fixed order: emoji removal, hyperlink
    removal, punctuation blanking, apostrophe normalisation, tab/newline
    replacement, space collapsing, trimming, and finally appending a full
    stop when the text does not already end with one.
    """
    steps = (
        deEmojify,
        remove_hyperlinks,
        remove_punctuation,
        fix_apostrophes,
        remove_whitespace,
        normalize_whitespace,
        strip_space,
        end_with_fullstop,
    )
    cleaned = text
    for step in steps:
        # Each step takes the previous step's output.
        cleaned = step(cleaned)
    return cleaned.lower()


def tokenize(text):
    """
    Run `text` through the module-level spaCy pipeline and return lemmas.

    A token is dropped when spaCy flags it as a stop word, when its coarse
    part-of-speech tag is PUNCT, SYM, NUM, PART or SPACE, or when its text is
    one of the contraction / stray fragments listed below.

    Returns
    -------
    list of str
        The lemma of every surviving token, in document order (may be empty).
    """
    skipped_pos = ('PUNCT', 'SYM', 'NUM', 'PART', 'SPACE')
    skipped_text = (
        "n't", "'h", 'm', 'wh', '%', 'rt', "'s", "'ve", "'ll", '’re',
        "'m", '&', "'ve", "'re", '’ve', '’ll', '’s', '’m', 'n’t', 's.', 'c.', 'f.', 'm.',
    )

    lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        if token.pos_ in skipped_pos or token.text in skipped_text:
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [5]:
# Connect to Google Cloud Storage with a service-account key file and load
# every blob of the tweets bucket into a single dataframe.
bucket_name = 'uk-gov-tweets-14289'
# NOTE(review): the key file is looked up relative to the working directory;
# make sure creds.json is never committed alongside the notebook.
storage_client = storage.Client.from_service_account_json('creds.json')
bucket = storage_client.get_bucket(bucket_name)
data = read_from_bucket(bucket=bucket)
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,id,text,created,user
0,1442904820264804354,"Very sorry to hear of the death of Roger Hunt,...",2021-09-28 17:31:31+00:00,BorisJohnson
1,1442895964386197507,I'd urge everyone to go about their business i...,2021-09-28 16:56:20+00:00,BorisJohnson
2,1442513824821751809,RT @RishiSunak: When we said we’d do whatever ...,2021-09-27 15:37:51+00:00,BorisJohnson
3,1442062761761062920,It’s 75 years since the foundation of the Nati...,2021-09-26 09:45:29+00:00,BorisJohnson
4,1442031671910486016,No words can adequately do justice to the debt...,2021-09-26 07:41:56+00:00,BorisJohnson


In [15]:
# Clean every raw tweet string, then replace each one with its list of lemmas.
# `corpus` is rebound here: first a Series of cleaned strings, then a Series
# of token lists.
corpus = data['text'].apply(string_process)
corpus = corpus.apply(tokenize)

In [16]:
# Write the tokenised tweets to tokenized_tweets.csv in the working directory.
# NOTE(review): each token list is presumably stored as its string
# representation, so reading the file back needs parsing — confirm.
corpus.to_csv('tokenized_tweets.csv')

In [17]:
# Convert the Series into a plain list of token lists; this is the object
# passed to build_vocab() and train() further down.
# NOTE(review): an assignment produces no cell output, so any Series display
# saved under this cell comes from an earlier execution — re-run top to bottom.
corpus = list(corpus)

0       [sorry, hear, death, roger, hunt, legendary, g...
1        [would, urge, business, normal, way, fill, need]
2       [rishisunak, say, would, take, mean, story, fu...
3       [year, foundation, national, blood, transfusio...
4       [word, adequately, justice, debt, nation, owe,...
                              ...                        
3078    [deputy, prime, minister, justice, secretary, ...
3079    [politicsjoe_uk, eton, millionaire, care, disa...
3080    [teacher, encourage, believe, forward, stand, ...
3081                                    [detail, protest]
3082    [uber, mega, corporation, worth, billion, poun...
Name: text, Length: 32539, dtype: object

In [19]:
import multiprocessing

from gensim.models import Word2Vec



In [21]:
# CPU count reported by the OS; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

4

In [55]:
# Create an untrained Word2Vec model; its vocabulary and weights are filled
# in by the build_vocab() and train() cells that follow.
w2v_model = Word2Vec(min_count=5,        # ignore words seen fewer than 5 times
                     window=2,           # context of 2 tokens on each side
                     vector_size=300,    # embedding dimensionality
                     sample=6e-5,        # down-sampling threshold for frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate reached at the end of training
                     negative=20,        # negative samples drawn per positive example
                     # Leave one core free, but never request 0 worker threads
                     # (which is what cores-1 gives on a single-core machine).
                     workers=max(1, cores - 1))

INFO - 10:38:11: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.03)', 'datetime': '2021-10-09T10:38:11.313851', 'gensim': '4.0.1', 'python': '3.7.11 (default, Jul 27 2021, 07:03:16) \n[Clang 10.0.0 ]', 'platform': 'Darwin-20.6.0-x86_64-i386-64bit', 'event': 'created'}


In [56]:
# Scan the tokenised corpus to build the model vocabulary, with progress
# logged every 10,000 sentences, and report the wall-clock time taken.
t = time()

w2v_model.build_vocab(corpus, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 10:38:13: collecting all words and their counts
INFO - 10:38:13: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 10:38:13: PROGRESS: at sentence #10000, processed 127986 words, keeping 18002 word types
INFO - 10:38:14: PROGRESS: at sentence #20000, processed 256343 words, keeping 27954 word types
INFO - 10:38:14: PROGRESS: at sentence #30000, processed 377485 words, keeping 35347 word types
INFO - 10:38:14: collected 36841 word types from a corpus of 409026 raw words and 32539 sentences
INFO - 10:38:14: Creating a fresh vocabulary
INFO - 10:38:14: Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 7734 unique words (20.992915501750765%% of original 36841, drops 29107)', 'datetime': '2021-10-09T10:38:14.166368', 'gensim': '4.0.1', 'python': '3.7.11 (default, Jul 27 2021, 07:03:16) \n[Clang 10.0.0 ]', 'platform': 'Darwin-20.6.0-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 10:38:14: Word2Vec lifecycle event {'msg': 'effective_min_count=

Time to build vocab: 0.01 mins


In [57]:
# Train the embeddings for 30 epochs over the same corpus that built the
# vocabulary (total_examples is the sentence count recorded by build_vocab),
# then report the wall-clock time taken.
t = time()

w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 10:38:17: Word2Vec lifecycle event {'msg': 'training model with 3 workers on 7734 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-10-09T10:38:17.966961', 'gensim': '4.0.1', 'python': '3.7.11 (default, Jul 27 2021, 07:03:16) \n[Clang 10.0.0 ]', 'platform': 'Darwin-20.6.0-x86_64-i386-64bit', 'event': 'train'}
INFO - 10:38:18: worker thread finished; awaiting finish of 2 more threads
INFO - 10:38:18: worker thread finished; awaiting finish of 1 more threads
INFO - 10:38:18: worker thread finished; awaiting finish of 0 more threads
INFO - 10:38:18: EPOCH - 1 : training on 409026 raw words (197897 effective words) took 0.7s, 292974 effective words/s
INFO - 10:38:19: worker thread finished; awaiting finish of 2 more threads
INFO - 10:38:19: worker thread finished; awaiting finish of 1 more threads
INFO - 10:38:19: worker thread finished; awaiting finish of 0 more threads
INFO - 10:38:19: EPOCH - 2 : training on 409026 raw words (19827

Time to train the model: 0.36 mins


In [61]:
# Sanity check: nearest neighbours of 'millionaire' in the trained vector space.
w2v_model.wv.most_similar(positive=['millionaire'])

[('eton', 0.9443747997283936),
 ('politicsjoe_uk', 0.8493973016738892),
 ('landlord', 0.8288161754608154),
 ('hike', 0.7386841177940369),
 ('renter', 0.7367849349975586),
 ('arrear', 0.7354673147201538),
 ('socialcare', 0.7239164710044861),
 ('andrew_harrop', 0.7214705944061279),
 ('rent', 0.7198046445846558),
 ('clobber', 0.7147722244262695)]

In [78]:
# Similarity score between the learned vectors for 'national' and 'pride'.
w2v_model.wv.similarity('national', 'pride')

0.17753191

In [75]:
# Inspect the first five tokenised tweets.
corpus[:5]

[['sorry',
  'hear',
  'death',
  'roger',
  'hunt',
  'legendary',
  'goalscorer',
  'member',
  'magnificent',
  'squad',
  'take',
  'england',
  'world',
  'cup',
  'victory',
  'thought',
  'family',
  'friend',
  'fan'],
 ['would', 'urge', 'business', 'normal', 'way', 'fill', 'need'],
 ['rishisunak', 'say', 'would', 'take', 'mean', 'story', 'furlough'],
 ['year',
  'foundation',
  'national',
  'blood',
  'transfusion',
  'service',
  'time',
  'selfless',
  'blood',
  'donor',
  'save',
  'improve',
  'life',
  'people',
  'hour',
  'need',
  'would',
  'encourage',
  'blood',
  'donor',
  'lifesav'],
 ['word',
  'adequately',
  'justice',
  'debt',
  'nation',
  'owe',
  'fall',
  'police',
  'officer',
  'dedication',
  'willingness',
  'run',
  'danger',
  'simply',
  'able',
  'live',
  'life',
  'safety',
  'security',
  'grant',
  'npmd21']]

In [49]:
# Persist the model to uk_politics_w2v.model in the working directory.
# NOTE(review): this cell's execution count (49) is lower than those of the
# cells that create and train the model (55-57), so the file on disk may hold
# an earlier model — re-run this cell after training.
w2v_model.save('uk_politics_w2v.model')

INFO - 10:35:21: Word2Vec lifecycle event {'fname_or_handle': 'uk_politics_w2v.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-10-09T10:35:21.974485', 'gensim': '4.0.1', 'python': '3.7.11 (default, Jul 27 2021, 07:03:16) \n[Clang 10.0.0 ]', 'platform': 'Darwin-20.6.0-x86_64-i386-64bit', 'event': 'saving'}
INFO - 10:35:21: not storing attribute cum_table
INFO - 10:35:21: saved uk_politics_w2v.model
