<a href="https://colab.research.google.com/github/marcek83/hello-world/blob/master/wids_nlp_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WIDS NLP Workshop

## Doc2vec word embeddings and UMAP visualisation

**Setting up Google Colab**



In [0]:
# Installing PyDrive
# PyDrive is a wrapper library of google-api-python-client.
!pip install -U -q PyDrive

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Add your ID for 'colab files' folder, e.g.: https://drive.google.com/open?id=1aBgO4hO6E6mcyU1u4hlYthOZXScNr0d5
# List of your files on drive will be printed
# E.g. twitter_trolls_english_only.csv, id: 1Ofl7U9qy3LFZ2XdxGXjZhOGBIC7JHsQx 

from google.colab import files
file_list = drive.ListFile({'q': "'YOUR FOLDER ID' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

In [0]:
# Getting file content
tweets_downloaded = drive.CreateFile({'id': 'YOUR FILE ID'})
tweets_downloaded.GetContentFile('tweets_50k.csv')

In [0]:
# Importing Python libraries
import pandas as pd
import numpy as np
import io
df1 = pd.read_csv('tweets_50k.csv', sep=',', engine='python')
df1

In [0]:
df1.count()

In [0]:
df1.head()

In [0]:
import re

# Dropping tweets with null values
df1 = df1.dropna(subset=['content'])
df1 = df1.reset_index(drop = True)

# Keep hold of original tweets and create a seperate column
# for preprocessing and adding cleaned tweets
df1['cleaned'] =  df1['content'].values

## Removing emojis

emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "]+", flags=re.UNICODE)

def remove_emoji(df1):
    row = list()
    for i in range(0, df1.shape[0]):
        if bool(re.search(emoji_pattern, df1['content'][i])) == True:
            row.append(i)
    df1.drop(df1.index[row], inplace=True)
    return df1

remove_emoji(df1)
df1 = df1.reset_index(drop = True) # Resets the index when removing rows and does not add a  new column index

In [0]:
# Removing stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

df1['cleaned'] = df1['cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [0]:
# Further pre-processing
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# nltk.download() #run this first time to download all the packages in nltk

from datetime import datetime 
from dateutil.parser import parse 

import string
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
lemmatiser = WordNetLemmatizer()

cleaned_tweet = []
def cleantweet(df1, cleaned_tweet):
    
    for data in df1['cleaned']:
        str_no_hyperlinks=re.sub(r'https?:\/\/.*\/\w*','',data)
        str_lower =str_no_hyperlinks.lower()
        str_letters_only = re.sub("[^a-zA-Z]", " ", str_lower) ##  remove non letters
        str_no_username = re.sub(r'(?:@[\w_]+)', '', str_letters_only) # @-mentions 
        str_no_username = str_no_username.strip()
        exclude = set(string.punctuation)
        str_no_punc = "".join(word for word in str_no_username if word not in exclude)
        tweet = re.sub('[\s]+', ' ', str_no_punc)
        tweet1 = re.sub('[\n]+', ' ', tweet)
        tweets = re.sub(r'[^\w]', ' ', tweet1)
        #trim
        tweets = tweets.strip()
        cleaned_tweet.append(tweets)
                  
    return cleaned_tweet

cleantweet(df1, cleaned_tweet)

df1['cleaned'] = cleaned_tweet

In [0]:
# Keeping relevant columns only
df = df1[['external_author_id', 'author', 'language','publish_date', 'cleaned', 'account_category']]
df.head()

In [0]:
# Creating account type dictionaries, we'll need them for umap
account_types = {}
for tweet in df.values:
    try:
        account_types[tweet[5].lower()]
    except KeyError:
        account_types[tweet[5].lower()] = []
    account_types[tweet[5].lower()].append(tweet)
unsorted_account_types_meta = []
for key in account_types.keys():
    unsorted_account_types_meta.append([key, len(account_types[key])])
account_types_meta = sorted(unsorted_account_types_meta, key=lambda item: item[1])
account_types_meta.reverse()
for meta in account_types_meta:
    print(meta)

In [0]:
# Creating authors dictionaries
authors_dict = {}
for tweet in df.values:
    try:
        authors_dict[tweet[0]]
    except KeyError:
        authors_dict[tweet[0]] = {'author_handle': tweet[1], 'account_type': tweet[5], 'tweet_bodies': []}
    authors_dict[tweet[0]]['tweet_bodies'].append(tweet[4])
authors = []
for author_id, author_data in authors_dict.items():
    authors.append({
        'account_type': author_data['account_type'],
        'author_handle': author_data['author_handle'],
        'external_author_id': author_id,
        'tweet_bodies': author_data['tweet_bodies'],
        'tweets_count': len(author_data['tweet_bodies'])
    })

In [0]:
authors[:1]

In [0]:
# Installing Gensim library for doc2vec
!pip install gensim

In [0]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import logging
import re

class TaggedDocumentIterator:
    def __iter__(self):
        for i, author in enumerate(authors):
            tweets_text = ''
            for tweet_text in author['tweet_bodies']:
                scrubbed_tweet_text = re.sub(r'\bhttps://t\.co/\S*\b', '', tweet_text)
                tweets_text = tweets_text + ' ' + scrubbed_tweet_text
            yield TaggedDocument(simple_preprocess(tweets_text), [i])

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

corpus = TaggedDocumentIterator()

In [0]:
# Building vocabulary
model = gensim.models.Doc2Vec(vector_size=200, min_count=5, epochs=10, workers=3)
model.build_vocab(corpus)

In [0]:
# Training model
%time model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [0]:
# Saving model
model.save('doc2vec.model')

In [0]:
# Loading model
from gensim.models.doc2vec import Doc2Vec
d2v_model = Doc2Vec.load('doc2vec.model')

In [0]:
# Printing the vector of document at index 1 in docLabels
docvec = d2v_model.docvecs[1]
print(docvec)

In [0]:
# Let's see most similar document with similarity scores using document-index
similar_doc = d2v_model.docvecs.most_similar(14) 
print(similar_doc)


In [0]:
d2v_model.most_similar('trump')

In [0]:
d2v_model.most_similar('gop')

In [0]:
docvecs = []
for i in range(len(model.docvecs)):
    docvecs.append(model.docvecs[i])

In [0]:
docvecs[:2]

In [0]:
!pip install umap

In [0]:
!pip install umap-learn

In [0]:
import umap.umap_ as umap

In [0]:
embedding = umap.UMAP(verbose=True, random_state=42).fit_transform(docvecs)

In [0]:
account_type_color_map = {
    'righttroll': 'red',
    'lefttroll': 'blue',
    'fearmonger': 'black',
    'hashtaggamer': 'purple',
    'newsfeed': 'cyan',
    'unknown': 'green',
    'commercial': 'green',
    'nonenglish': 'green'
}

% matplotlib inline
import matplotlib.pyplot as plt

c = []
x = []
y = []
for i, item in enumerate(embedding):
    c.append(account_type_color_map[authors[i]['account_type'].lower()])
    x.append(item[0])
    y.append(item[1])

figumap = plt.figure(figsize=(10,10))
plt.scatter(x, y, c=c, s=4, alpha=0.25)
plt.show()
#plt.savefig('fig_umap.png')

In [0]:
from sklearn import cluster
#from sklearn.cluster import KMeans
n_clusters = 8
k_means = cluster.KMeans(n_clusters=n_clusters, random_state=42)
k_means.fit(docvecs)

In [0]:
!pip install bokeh

In [0]:
% matplotlib inline
from bokeh.palettes import Set2
import matplotlib.pyplot as plt
import matplotlib.patches as patches

c = []
x = []
y = []
for i, item in enumerate(embedding):
    try:
        c.append(Set2[n_clusters][k_means.labels_[i]])
        x.append(item[0])
        y.append(item[1])
    except KeyError:
        pass

plt.figure(figsize=(8,8))
all_patches = []
for i in range(n_clusters):
    all_patches.append(patches.Patch(color=Set2[n_clusters][i], label='Cluster ' + str(i)))
plt.legend(handles=all_patches)
plt.scatter(x, y, c=c, s=4, alpha=1)
plt.title('Clusters on the 200-dimensional embeddings.')
plt.show()
#plt.savefig('bokeh_fig.png')

**Doc2vec (DBOW & DM)**

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [0]:
df_new = pd.read_csv('tweets_25k.csv', sep=',', engine='python')

In [0]:
df_new.head()

In [0]:
df_new = df_new[['content','account_category']]
df_new = df_new[pd.notnull(df_new['content'])]
df_new.rename(columns = {'content':'tweet'}, inplace = True)
df_new.head(10)

In [0]:
df_new.shape

In [0]:
df_new['tweet'].apply(lambda x: len(x.split(' '))).sum()

In [0]:
cnt_pro = df_new['account_category'].value_counts()

plt.figure(figsize=(12,4))
sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Account_Category', fontsize=12)
plt.xticks(rotation=90)
plt.show();

In [0]:
def print_tweet(index):
    example = df_new[df_new.index == index][['tweet', 'account_category']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Account_Category:', example[1])

In [0]:
print_tweet(20)

In [0]:
print_tweet(50)

Text Processing

In [0]:
import nltk
nltk.download('punkt')
def cleanText(text):
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r' ', text)
    text = text.lower()
    return text
df1['tweet'] = df_new['tweet'].apply(cleanText)

In [0]:
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

In [0]:
train, test = train_test_split(df_new, test_size=0.3, random_state=42)

In [0]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['tweet']), tags=[r.account_category]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['tweet']), tags=[r.account_category]), axis=1)

In [0]:
train_tagged.values[20]

Training the model

DBOW

Building a vocabulary

In [0]:
model_dbow = Doc2Vec(dm=0, vector_size=200, negative=5, hs=0, min_count=2, sample = 0, workers=2)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

In [0]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

Buliding the final vector feature for the classifier

In [0]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [0]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [0]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [0]:
from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [0]:
model.save('modeldbow.model')

In [0]:
from gensim.models.doc2vec import Doc2Vec 
model = Doc2Vec.load('modeldbow.model')
docvecs = []
for i in range(len(model.docvecs)):
    docvecs.append(model.docvecs[i])

In [0]:
docvecs = []
for i in range(len(model.docvecs)):
    docvecs.append(model.docvecs[i])

In [0]:
docvecs[:2]

In [0]:
!pip install umap

In [0]:
!pip install umap-learn

In [0]:
import umap.umap_ as umap

In [0]:
#import umap
embedding = umap.UMAP(verbose=True, random_state=42).fit_transform(docvecs)

In [0]:
account_types = {}
for tweet in df_new.values:
    try:
        account_types[tweet[1].lower()]
    except KeyError:
        account_types[tweet[1].lower()] = []
    account_types[tweet[1].lower()].append(tweet)
unsorted_account_types_meta = []
for key in account_types.keys():
    unsorted_account_types_meta.append([key, len(account_types[key])])
account_types_meta = sorted(unsorted_account_types_meta, key=lambda item: item[1])
account_types_meta.reverse()
for meta in account_types_meta:
    print(meta)

In [0]:
authors_dict = {}
for tweet in df_new.values:
    try:
        authors_dict[tweet[0]]
    except KeyError:
        authors_dict[tweet[0]] = {'account_type': tweet[1], 'tweet_bodies': []}
    authors_dict[tweet[0]]['tweet_bodies'].append(tweet[0])
authors = []
for author_id, author_data in authors_dict.items():
    authors.append({
        'account_type': author_data['account_type'],
        'tweet_bodies': author_data['tweet_bodies'],
        'tweets_count': len(author_data['tweet_bodies'])
    })

In [0]:
account_type_color_map = {
    'righttroll': 'red',
    'lefttroll': 'blue',
    'fearmonger': 'black',
    'hashtaggamer': 'purple',
    'newsfeed': 'cyan',
    'unknown': 'green',
    'commercial': 'green',
    'nonenglish': 'green'
}

% matplotlib inline
import matplotlib.pyplot as plt

c = []
x = []
y = []
for i, item in enumerate(embedding):
    c.append(account_type_color_map[authors[i]['account_type'].lower()])
    x.append(item[0])
    y.append(item[1])

figumap = plt.figure(figsize=(10,10))
plt.scatter(x, y, c=c, s=4, alpha=0.25)
plt.show()

In [0]:
from sklearn import cluster
#from sklearn.cluster import KMeans
n_clusters = 8
k_means = cluster.KMeans(n_clusters=n_clusters, random_state=42)
k_means.fit(docvecs)

Visualisation with Bokeh

In [0]:
!pip install bokeh

In [0]:
% matplotlib inline
hfrom bokeh.palettes import Set2
import matplotlib.pyplot as plt
import matplotlib.patches as patches

c = []
x = []
y = []
for i, item in enumerate(embedding):
    try:
        c.append(Set2[n_clusters][k_means.labels_[i]])
        x.append(item[0])
        y.append(item[1])
    except KeyError:
        pass

plt.figure(figsize=(8,8))
all_patches = []
for i in range(n_clusters):
    all_patches.append(patches.Patch(color=Set2[n_clusters][i], label='Cluster ' + str(i)))
plt.legend(handles=all_patches)
plt.scatter(x, y, c=c, s=4, alpha=1)
plt.title('Clusters on the 200-dimensional embeddings.')
plt.show()

Distributed Memory



In [0]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

In [0]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

Train Logistic Regression

In [0]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [0]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [0]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [0]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [0]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

In [0]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [0]:
authors_dict = {}
for tweet in df.values:
    try:
        authors_dict[tweet[0]]
    except KeyError:
        authors_dict[tweet[0]] = {'author_handle': tweet[1], 'account_type': tweet[5], 'tweet_text': []}
    authors_dict[tweet[0]]['tweet_text'].append(tweet[4])
authors = []
for author_id, author_data in authors_dict.items():
    authors.append({
        'account_type': author_data['account_type'],
        'author_handle': author_data['author_handle'],
        'external_author_id': author_id,
        'tweet_text': author_data['tweet_text'],
        'tweets_count': len(author_data['tweet_text'])

SyntaxError: ignored