In [1]:
import pandas as pd
import numpy as np
import re
import multiprocessing
import glob
import os
import string
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
from nltk.stem import PorterStemmer
from time import time 
from collections import defaultdict
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
path = r'translated_tweets'          
all_files = glob.glob(os.path.join(path, "*.csv"))

df_from_each = (pd.read_csv(f).assign(filename = os.path.basename(f)) for f in all_files)
df = pd.concat(df_from_each, ignore_index=True)
df_cleaned = df.dropna().drop_duplicates().reset_index(drop=True)
df.head(5)

INFO - 22:13:00: NumExpr defaulting to 8 threads.


Unnamed: 0,id,created_at,text,rt_count,fav_count,english,filename
0,1447514721503129603,2021-10-11 10:49:37+00:00,"Educating girls doesn't just improve lives, it...",422,2697,"Educating girls doesn't just improve lives, it...",translated_BorisJohnson.csv
1,1447156515438305287,2021-10-10 11:06:14+00:00,It takes courage to speak out about mental hea...,409,1911,It takes courage to speak out about mental hea...,translated_BorisJohnson.csv
2,1446429130631827457,2021-10-08 10:55:52+00:00,Desperately sad to hear the news about James B...,797,7895,Desperately sad to hear the news about James B...,translated_BorisJohnson.csv
3,1446169835738648578,2021-10-07 17:45:32+00:00,Fantastic news that the UAE has become the fir...,372,2142,Fantastic news that the UAE has become the fir...,translated_BorisJohnson.csv
4,1446101800629768195,2021-10-07 13:15:11+00:00,The UK is honoured to host these games in the ...,190,1243,The UK is honoured to host these games in the ...,translated_BorisJohnson.csv


In [7]:
def tweetToWords(tweet):
    tweet = str(tweet)
    tweet = tweet.lower()
    
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = re.sub(r'http.?://[^\s]+[\s]?', '', tweet)
    tweet = tweet.replace('_','')
    punct = string.punctuation
    tab = str.maketrans(punct, len(punct)*' ')
    tweet = tweet.translate(tab)
    tweet = re.sub(r'\d+', '', tweet)
    porter = PorterStemmer()
    words = tweet.split()
    
    text = tweet.split()
    
    return text

In [16]:
df_cleaned.english = df_cleaned.english.apply(lambda x: tweetToWords(x))
df_cleaned.head(-10)

Unnamed: 0,id,created_at,text,rt_count,fav_count,english,filename
0,1447514721503129603,2021-10-11 10:49:37+00:00,"Educating girls doesn't just improve lives, it...",422,2697,"[educating, girls, doesn, t, just, improve, li...",translated_BorisJohnson.csv
1,1447156515438305287,2021-10-10 11:06:14+00:00,It takes courage to speak out about mental hea...,409,1911,"[it, takes, courage, to, speak, out, about, me...",translated_BorisJohnson.csv
2,1446429130631827457,2021-10-08 10:55:52+00:00,Desperately sad to hear the news about James B...,797,7895,"[desperately, sad, to, hear, the, news, about,...",translated_BorisJohnson.csv
3,1446169835738648578,2021-10-07 17:45:32+00:00,Fantastic news that the UAE has become the fir...,372,2142,"[fantastic, news, that, the, uae, has, become,...",translated_BorisJohnson.csv
4,1446101800629768195,2021-10-07 13:15:11+00:00,The UK is honoured to host these games in the ...,190,1243,"[the, uk, is, honoured, to, host, these, games...",translated_BorisJohnson.csv
...,...,...,...,...,...,...,...
17164,634551923241148416,2015-08-21 02:25:42+00:00,Jobs and Welfare Reform | speech to Australian...,2,4,"[jobs, and, welfare, reform, speech, to, austr...",translated_ScottMorrisonMP.csv
17165,632821687348625408,2015-08-16 07:50:22+00:00,"No jab, no play in Vic childcare, kindy | good...",6,12,"[no, jab, no, play, in, vic, childcare, kindy,...",translated_ScottMorrisonMP.csv
17166,632820671827963904,2015-08-16 07:46:20+00:00,A world record crowd also at Netball World Cup...,0,9,"[a, world, record, crowd, also, at, netball, w...",translated_ScottMorrisonMP.csv
17167,632820179114680321,2015-08-16 07:44:22+00:00,A great win for the Diamonds - congrats - des...,1,9,"[a, great, win, for, the, diamonds, congrats, ...",translated_ScottMorrisonMP.csv


In [9]:
df_model = df_cleaned.copy()
df_model = df_model[df_model.english.str.len()>1]

In [10]:
sent = [row for row in df_model.english]
phrases = Phrases(sent, min_count=1, progress_per= 5000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[3]

INFO - 22:14:28: collecting all words and their counts
INFO - 22:14:28: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 22:14:28: PROGRESS: at sentence #5000, processed 136952 words and 65841 word types
INFO - 22:14:28: PROGRESS: at sentence #10000, processed 322659 words and 140047 word types
INFO - 22:14:28: PROGRESS: at sentence #15000, processed 496522 words and 192847 word types
INFO - 22:14:29: collected 210626 token types (unigram + bigrams) from a corpus of 548797 words and 17127 sentences
INFO - 22:14:29: merged Phrases<210626 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 22:14:29: Phrases lifecycle event {'msg': 'built Phrases<210626 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000> in 0.94s', 'datetime': '2021-10-26T22:14:29.063668', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 22:14:29: exporting phras

['fantastic_news',
 'that',
 'the',
 'uae',
 'has_become',
 'the',
 'first',
 'gulf',
 'country',
 'to',
 'commit',
 'to',
 'net_zero',
 'emissions_by',
 'this',
 'is',
 'a',
 'significant_step',
 'in',
 'tackling_climate',
 'change',
 'and',
 'i_hope',
 'more',
 'countries_commit',
 'to',
 'net_zero',
 'ahead',
 'of']

In [11]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     vector_size=300,
                     sample=1e-5,
                     alpha = 0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

start = time()

w2v_model.build_vocab(sentences, progress_per=50)
print('Time to build vocab: {} minutes'.format(round((time() -start) / 60, 2)))

INFO - 22:14:55: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.03)', 'datetime': '2021-10-26T22:14:55.188407', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 22:14:55: collecting all words and their counts
INFO - 22:14:55: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:14:55: PROGRESS: at sentence #50, processed 1473 words, keeping 669 word types
INFO - 22:14:55: PROGRESS: at sentence #100, processed 2930 words, keeping 1154 word types
INFO - 22:14:55: PROGRESS: at sentence #150, processed 4290 words, keeping 1553 word types
INFO - 22:14:55: PROGRESS: at sentence #200, processed 5562 words, keeping 1845 word types
INFO - 22:14:55: PROGRESS: at sentence #250, processed 6829 words, keeping 2111 word types
INFO - 22:14:55: PROGRESS: at sentence #300, processed 8184 words, keeping 2380 word types
INFO - 22:14:

Time to build vocab: 0.03 minutes


In [12]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 22:15:38: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 13775 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4', 'datetime': '2021-10-26T22:15:38.717831', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 22:15:39: EPOCH 1 - PROGRESS: at 63.48% examples, 73930 words/s, in_qsize 14, out_qsize 0
INFO - 22:15:39: worker thread finished; awaiting finish of 6 more threads
INFO - 22:15:39: worker thread finished; awaiting finish of 5 more threads
INFO - 22:15:39: worker thread finished; awaiting finish of 4 more threads
INFO - 22:15:39: worker thread finished; awaiting finish of 3 more threads
INFO - 22:15:39: worker thread finished; awaiting finish of 2 more threads
INFO - 22:15:39: worker thread finished; awaiting finish of 1 more threads
INFO - 22:15:39: worker thread finished; awaiting finish of 0 more threads
INFO - 

Time to train the model: 0.55 mins


In [13]:
w2v_model.save("word2vec.model")

INFO - 22:16:19: Word2Vec lifecycle event {'fname_or_handle': 'word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-10-26T22:16:19.901874', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'saving'}
INFO - 22:16:19: not storing attribute cum_table
INFO - 22:16:19: saved word2vec.model


In [14]:
df_export = df_model.copy()
df_export['old_english'] = df_export.english
df_export.old_english = df_export.old_english.str.join(' ')
df_export.english = df_export.english.apply(lambda x: ' '.join(bigram[x]))

In [15]:
df_export[['english']].to_csv('cleaned_dataset.csv', index=False)