In [85]:
import pandas as pd
from google.cloud import storage
import spacy
from datetime import date
from collections import defaultdict
import re
import string
from io import StringIO, BytesIO
from urllib.request import Request, urlopen
from functools import reduce, partial
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


nlp = spacy.load('en_core_web_sm', parser=True)
nlp.max_length = 2200000

def read_from_bucket(bucket):

    """
    This concatenates all csv files in a bucket together.
    Returns a single dataframe.
    """
    
    frames = []
    files  = list(bucket.list_blobs())
    for file in files:
        blob = bucket.blob(file.name)
        data = pd.read_csv(BytesIO(blob.download_as_string()), encoding='utf-8')
        frames.append(data)
    data = pd.concat(frames)
    return data


def return_politician_handles(option='list'):
    req = Request('https://www.politics-social.com/api/list/csv/followers', headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    s=str(webpage,'utf-8')
    data = StringIO(s) 
    df=pd.read_csv(data)
    df['Name'] = df['Name'].apply(lambda x: x.rstrip())
    df['Screen name'] = df['Screen name'].apply(lambda x: x[1:])
    politician_handles = df['Screen name']
    print('Politician twitter handles imported.\n')

    if option=='list':
        return politician_handles
    else:
        return df

## Import data from GCP bucket

In [2]:
bucket_name = 'uk-gov-tweets-14289'
storage_client = storage.Client.from_service_account_json('creds.json')
bucket = storage_client.get_bucket(bucket_name)
data = read_from_bucket(bucket=bucket)

## Data cleaning

In [3]:
data['date'] = data['created'].apply(lambda x: x[:10])
data['date'] = pd.to_datetime(data['date'])
data.reset_index(inplace=True)

In [4]:
data.head()

Unnamed: 0,index,id,text,created,user,date
0,0,1442904820264804354,"Very sorry to hear of the death of Roger Hunt,...",2021-09-28 17:31:31+00:00,BorisJohnson,2021-09-28
1,1,1442895964386197507,I'd urge everyone to go about their business i...,2021-09-28 16:56:20+00:00,BorisJohnson,2021-09-28
2,2,1442513824821751809,RT @RishiSunak: When we said we’d do whatever ...,2021-09-27 15:37:51+00:00,BorisJohnson,2021-09-27
3,3,1442062761761062920,It’s 75 years since the foundation of the Nati...,2021-09-26 09:45:29+00:00,BorisJohnson,2021-09-26
4,4,1442031671910486016,No words can adequately do justice to the debt...,2021-09-26 07:41:56+00:00,BorisJohnson,2021-09-26


## Text pre-processing

In [77]:
def deEmojify(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)




def remove_hyperlinks(text):
    text = re.sub(r"http\S+", "", text)
    return text

def remove_punctuation(text):
    text = re.sub(r'[)(|!%?@#*,/:;…-]', ' ', text)
    text = re.sub(r'&amp', 'and', text)
    return text

def fix_apostrophes(text):
    text = re.sub(r'’', "'", text)
    return text

def remove_whitespace(text):
    text = re.sub(r'[\t\n\r]', ' ', text)
    return text

def normalize_whitespace(text):
    text = re.sub(r' +', ' ', text)
    return text

def strip_space(text):
    return text.strip()

def end_with_fullstop(text):
#     print(text)
    if len(text)>0 and text[-1]=='.':
        return text
    else:
        return text + '.'

def string_process(text):
    func_list = [
        deEmojify,
        remove_hyperlinks,
        remove_punctuation,
        fix_apostrophes,
        remove_whitespace,
        normalize_whitespace,
        strip_space,
        end_with_fullstop
    ]
    text = reduce(lambda x, func: func(x), func_list, text)
    return text.lower()
    


## Grouping into users

In [56]:
users = data[['user','text']]
users['text'] = users['text'].apply(string_process)
users = users.groupby('user')['text'].apply(' '.join).reset_index()
users.head(11)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,user,text
0,ABridgen,the ghost of labour leaders past is coming thr...
1,ACunninghamMP,rt francesogrady great speech by keir_starmer ...
2,AJRichardsonMP,rt jackiedp exactly so. we are women. we are n...
3,AJonesMP,rt philallottpfcc good to hear at the public a...
4,ALewerMBE,rt appgonmnd “now we can actually point to cle...
5,ASollowayUK,today i'm wearing thebhf 60th birthday commemo...
6,AWMurrison,kier throwing red meat to his party. in my pat...
7,AaronBell4NUL,rt ellisonprimary year 6 have been studying th...
8,AdamAfriyie,such fun to speak with and answer questions fr...
9,Afzal4Gorton,rt cbthunting we are at uklabour lpc21. keir s...


## Grouping into party

In [83]:
party = data.merge(return_politician_handles(option='all'), left_on='user', right_on='Screen name')
party.drop([
    'index','id','created','date','user','Screen name',
    'Constituency','New followers in last 24 hours','Followers'
], axis=1, inplace=True)
party = party.groupby('Party')['text'].apply(' '.join).reset_index()
party['text'] = party['text'].apply(string_process)
party['text'].apply(len)

Politician twitter handles imported.



0        9591
1     2168154
2       31044
3       19927
4       27070
5     1873577
6      104684
7       28615
8      435460
9       42643
10      21664
11       6628
Name: text, dtype: int64

## Tokenizing

In [86]:
def noun_chunk(text):
    doc = nlp(text)
#     doc = [token for token in doc if not token.is_stop]
    tokens = []
    for chunk in doc.noun_chunks:
        chunk = re.sub(' ', '_', chunk.text)
        tokens.append(chunk)
    return tokens

def tokenize(text):
    doc = nlp(text, disable = ['ner', 'parser'])
    tokens = [token for token in doc if not token.is_stop]
    tokens = [token for token in tokens if token.pos_ not in ['PUNCT','SYM','NUM','PART','SPACE']]
    tokens = [token for token in tokens if token.text not in [
        "n't","'h",'m','wh','%','rt',"'s","'ve","'ll",'’re',
        "'m",'&',"'ve","'re",'’ve','’ll','’s','’m','n’t','s.','c.','f.','m.'
    ]]
    tokens = [token.lemma_ for token in tokens]
    return tokens

def identity_tokenizer(tokens: list) -> list:
    return tokens

## Vectorizing

In [87]:
tfidf = TfidfVectorizer(
    tokenizer=tokenize,
    lowercase=True,
    max_df=0.5)

tfidf.fit(party['text'])

with open('tfidf.pk1', 'wb') as pickle_file:
    pickle.dump(tfidf, pickle_file)

## Get top n important tokens

In [76]:
def top_n_important_tokens(text, tfidf, n):
    
    def get_feature(i, elements):
        return elements[i]
    
    feature_names = np.array(tfidf.get_feature_names())
    vector = tfidf.transform([text])
    importance = vector.toarray()[0]
    tfidf_sorting = np.argsort(vector.toarray()).flatten()[::-1]

    get_features_word = partial(get_feature, elements=feature_names)
    get_features_importance = partial(get_feature, elements=importance)

    df = pd.DataFrame()
    df['words'] = list(map(get_features_word, tfidf_sorting))[:n]
    df['importance'] = list(map(get_features_importance, tfidf_sorting))[:n]
    
    return df

In [96]:
i = 5
test = party.loc[i, 'text']
print(party.loc[i, 'Party'])
top_n_important_tokens(test, tfidf, 10)

Labour


Unnamed: 0,words,importance
0,uklabour,0.526487
1,keir_starmer,0.277472
2,tax,0.190254
3,keir,0.154611
4,rachelreevesmp,0.12033
5,wage,0.105001
6,mental,0.100755
7,shadow,0.100461
8,ed_miliband,0.087997
9,email,0.083347
