# Pipeline to clean the keyword column 

In [1]:
import json
import pandas as pd
import itertools # to flatten lists of lists
import collections # to count
from rapidfuzz import process as pr
import numpy as np

## Load data

In [2]:
# Opening JSON file
f = open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json')
  
# returns JSON object as a dictionary
data = json.load(f)

# convert to data frame
df = pd.DataFrame.from_dict(data)

#### Finding the subset of the data for 1 Jan 2019 - 1 Jan 2020 based on lastModifiedDate

In [3]:
df = df.sort_values(by='lastModifiedDate') #sort dataframe

datetimes = pd.to_datetime(df['lastModifiedDate'])
df['ts_lastModifiedDate']=datetimes
#df.iloc[ts_start]['ts_lastModifiedDate']

#find start index for subset 2019-2022
ts_start=datetimes[(datetimes > pd.Timestamp(year=2019, month=1, day=1).tz_localize('utc')) 
          & (datetimes < pd.Timestamp(year=2019, month=1, day=2).tz_localize('utc'))].min()
print(ts_start)

#find end date for subset 2019-2022
ts_end=datetimes[(datetimes > pd.Timestamp(year=2022, month=1, day=1).tz_localize('utc')) 
          & (datetimes < pd.Timestamp(year=2022, month=1, day=2).tz_localize('utc'))].min()
print(ts_end)

start_date=datetimes[datetimes == ts_start]
end_date=datetimes[datetimes == ts_end]

#find index for the chosen start and end dates
start_index=start_date.index[0]
print(start_index)
df[df.index == start_date.index[0]]

end_index=end_date.index[0]
print(end_index)
df[df.index == end_date.index[0]]

df_subset=df[start_index:end_index]

2019-01-01 03:57:28.904000+00:00
2022-01-01 02:35:51.098000+00:00
60278
150367


In [4]:
df_subset.columns

Index(['id', 'name', 'shortTitle', 'text', 'teaser', 'shortTeaser', 'kicker',
       'regions', 'keywords', 'keywordStrings', 'thematicFocusCategory',
       'navigations', 'categories', 'departments', 'firstPublicationDate',
       'lastModifiedDate', 'contentDate', 'relatedAutoTopics', 'contentLinks',
       'articles', 'isOpinion', 'geographicLocations', 'contentAssociations',
       'mainContentImageLink', 'images', 'externalLinks', 'topStory',
       'language', 'ts_lastModifiedDate'],
      dtype='object')

In [5]:
keywords = list(df_subset.keywordStrings)

## Functions

In [6]:
# Used mainly for visualisation, get indices of keywords with a certain substring
def get_items_with_substring(lst_lst_keywords, substring):
    indices = [i for i, lst_kw in enumerate(lst_lst_keywords) if any(list(map(lambda x: substring in x, lst_kw)))]
    return indices

## Cleaning

### Cleaning 1: put everything in lowercase

In [13]:
keywords_lower = [list(map(str.casefold, x)) for x in keywords]

In [14]:
# for visualisation only (can remove later on)
# TODO: entity linking? Keep upper case for names, cities, acronym
print('before:', keywords[0])
print('after: ', keywords_lower[0])

before: ['NASA', 'OSIRIS-REx', 'Bennu', 'asteroid']
after:  ['nasa', 'osiris-rex', 'bennu', 'asteroid']


### Cleaning 2: split keywords that haven't been split

In [15]:
# Split keywords: kw.split splits the keyword in a list of multiple keywords based on substring, itertools.chain flattens the list of lists
keywords_lower_split = [list(itertools.chain(*[kw.split(', ') for kw in lst_kw])) for lst_kw in keywords_lower]
keywords_lower_split = [list(itertools.chain(*[kw.split(' - ') for kw in lst_kw])) for lst_kw in keywords_lower_split] # spaces around '-' to not confuse with the ones within words

In [16]:
# for visualisation only (can remove later on)
items_with_unsplit_keywords = get_items_with_substring(keywords_lower, ' - ') + get_items_with_substring(keywords_lower, ', ')
print('Nb of keywords changed:', len(items_with_unsplit_keywords), '\n')
for i in range(2):
    print('before:', keywords_lower[items_with_unsplit_keywords[i]])
    print('after: ', keywords_lower_split[items_with_unsplit_keywords[i]], '\n')

Nb of keywords changed: 13 

before: ['transgender', 'intergender', 'zagreb', 'march', 'my body - my temple']
after:  ['transgender', 'intergender', 'zagreb', 'march', 'my body', 'my temple'] 

before: ['smartphone', 'app', 'photos', 'black - b&w film emulator']
after:  ['smartphone', 'app', 'photos', 'black', 'b&w film emulator'] 



### Cleaning 3: remove unwanted characters

In [17]:
# Remove '\u2002' and '.', '" ', '"', 'keywords: ' (replace with empty)
keywords_lower_split_clean = keywords_lower_split
keywords_lower_split_clean = [list(map(lambda x: x.replace('\xa0', ' '), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: ''.join(filter(str.isprintable, x)), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('.', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('" ', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('"', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('keywords: ', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]

# Replace '\xa0' with space


In [18]:
# for visualisation only (can remove later on)
items_with_unwanted_characters = get_items_with_substring(keywords_lower, substring = 'keywords:')
print('Nb of keywords changed:', len(items_with_unwanted_characters), '\n')
for i in range(2):
    print('before:', keywords_lower_split[items_with_unwanted_characters[i]])
    print('after: ', keywords_lower_split_clean[items_with_unwanted_characters[i]], '\n')

Nb of keywords changed: 3 

before: ['keywords: germany', 'refugees', 'asylum-seekers', 'covid-19', 'coronavirus']
after:  ['germany', 'refugees', 'asylum-seekers', 'covid-19', 'coronavirus'] 

before: ['keywords: president martin vizcarra', 'peru', 'peru impeachment']
after:  ['president martin vizcarra', 'peru', 'peru impeachment'] 



### Cleaning 4: Clean sentences 
Heuristic: remove keywords that have more than 6 spaces


In [19]:
# TODO: extract keyword from the sentence instead of deleting it?
n_spaces = 8 # if there are more spaces than this number, the keyword is removed
keywords_lower_split_clean_short = [[kw for kw in lst_kw if kw.count(' ')<n_spaces] for lst_kw in keywords_lower_split_clean]

In [20]:
# for visualisation only (can remove later on)
items_with_sentences = [i for i,lst_kw in enumerate(keywords_lower_split_clean) if any(list(map(lambda x: x.count(' ')>=n_spaces, lst_kw)))]
print('Nb of keywords changed:', len(items_with_sentences), '\n')
for i in range(5):
    print('before:', keywords_lower_split_clean[items_with_sentences[i]])
    print('after: ', keywords_lower_split_clean_short[items_with_sentences[i]], '\n')

Nb of keywords changed: 10 

before: ['venezuela', 'aid', 'juan guaido', 'nicolas maduro', 'foreign aid', 'international federation of red cross and red crescent societies']
after:  ['venezuela', 'aid', 'juan guaido', 'nicolas maduro', 'foreign aid'] 

before: ['edgar hilsenrath', 'hilsenrath', 'holocaust literature', 'german authors', 'jewish authors', 'german-jewish authors', 'german literature holocaust authors', 'holocausta humorous take on the holocaust is taboo in germany', 'but one author gains fame in doing so']
after:  ['edgar hilsenrath', 'hilsenrath', 'holocaust literature', 'german authors', 'jewish authors', 'german-jewish authors', 'german literature holocaust authors', 'but one author gains fame in doing so'] 

before: ['france', 'international day for the elimination of violence against women', 'spain', 'turkey', 'domestic violence', 'femicide']
after:  ['france', 'spain', 'turkey', 'domestic violence', 'femicide'] 

before: ['ameroon', 'open letter', 'civil society', '

# Save

In [21]:
print('Number of unique keywords before cleaning:', len(set(list(itertools.chain(*keywords)))))
print('Number of unique keywords before cleaning:', len(set(list(itertools.chain(*keywords_lower_split_clean_short)))))

Number of unique keywords before cleaning: 32682
Number of unique keywords before cleaning: 30279


In [22]:
#print(keywords[0:2])
#print(keywords_lower_split_clean_short[0:2])

#df_2019_2020 = df_subset.copy()
#df_2019_2020['keywordStringsClean'] = keywords_lower_split_clean_short

#filepath = '../data/interim/clean_keywords_2019-2021_before_FuzzyWuzzy.csv'
#df_2019_2020.to_csv(filepath, index=False)  

## Count occurence of each keyword
Will be used to know which one to keep in fuzzy wuzzy (the most used)

In [23]:
keywords_flat = list(itertools.chain(*keywords_lower_split_clean_short)) # Flatten list
keywords_freq = collections.Counter(keywords_flat)

In [24]:
# For visualisation, can be removed
keywords_freq_df = pd.DataFrame.from_dict(keywords_freq, orient='index', columns = ['freq'])
keywords_freq_df.sort_values(by='freq', ascending=False).head(10)

Unnamed: 0,freq
coronavirus,3817
germany,3299
covid-19,2641
china,1669
russia,1354
donald trump,1333
asia,1260
us,1087
eu,1078
bundesliga,905


# Rapid Fuzz

In [25]:
# extract unique ones and remove the empty entry
unique_keywords = list(set(keywords_flat))
unique_keywords.remove('')

# run rapid fuzz
ratio_array= pr.cdist(unique_keywords, unique_keywords, score_cutoff = 90)

### Find words correlating together and display

In [26]:
df_array = pd.DataFrame(ratio_array, columns = unique_keywords)

# Count number of non zero values in each row
nb_non_zero = np.count_nonzero(np.asarray(ratio_array), axis=1) 

# Save indices of rows with more than 1 non-zero value
indices_correlating_rows = [i for i, el in enumerate(list(nb_non_zero)) if el>1]

df_array.head()

Unnamed: 0,mt fuji,videometrics,dubai airshow,islamists,koala,treatment,landscape photography,nada,esther salas,workforce,...,eindhoven,decomposition,1933 seizure of power,andrei kovacs,lightning bolt,commercial whaling,monsters,dirty harry,lgbt+,eu migration pact
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Make a list of similar keywords
# TODO: problem: ['european fires', 'european firms']
all_similar_kw = []
for i in indices_correlating_rows:
    similar_words = [keyword for val, keyword in zip(list(df_array.iloc[i]), unique_keywords) if val!=0]
    all_similar_kw.append(similar_words)

In [28]:
len(all_similar_kw)

4900

# Unify similar keywords
Replace by most frequent one

In [23]:
# # Runs for 14min !!
# # Split in the ones which have equal word number, and the ones which don'try
# # TODO: Problem: UN climate summit, US department of justice: only change if same number of words? Not change it if it's a country? ENTITY LINKING? Or remove country name?
# # TODO: Optimise? Faster?
# # TODO: Word embedinngs? MD

# similar_kws_same_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))==1]
# similar_kws_diff_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))!=1]

# # Replace
# keywords_replaced = keywords_lower_split_clean_short

# for sim_kws in similar_kws_same_word_nb:
    
#     # Make list of frequencies for those similar kws
#     sim_kws_freq = [keywords_freq[word] for word in sim_kws]
    
#     for kw in sim_kws:

#         # the new keyword is the one with the highest frequency
#         right_kw = sim_kws[sim_kws_freq.index(max(sim_kws_freq))]

#         # replace similar keywords by the most frequent one
#         keywords_replaced = [list(map(lambda x: right_kw if x==kw else x, lst_kw)) for lst_kw in keywords_replaced]

In [29]:
# Runs for 14min !!
# Split in the ones which have equal word number, and the ones which don'try
# TODO: Problem: UN climate summit, US department of justice: only change if same number of words? Not change it if it's a country? ENTITY LINKING? Or remove country name?
# TODO: Optimise? Faster?
# TODO: Word embedinngs? MD

similar_kws_same_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))==1]
similar_kws_diff_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))!=1]

# Replace
keywords_replaced = keywords_lower_split_clean_short

#for sim_kws in similar_kws_same_word_nb:
    
    # Make list of frequencies for those similar kws
    #sim_kws_freq = [[keywords_freq[word] for word in sim_kws] for sim_kws in similar_kws_same_word_nb]

#argmaxright_kw = [np.argmax([keywords_freq[word] for word in sim_kws]) for sim_kws in similar_kws_same_word_nb[0:5]]
#maxright_kw = [max([keywords_freq[word] for word in sim_kws]) for sim_kws in similar_kws_same_word_nb[0:5]]
#np.random.choice(np.flatnonzero(b == b.max()))
right_kw = [sim_kws[np.argmax([keywords_freq[word] for word in sim_kws])] for sim_kws in similar_kws_same_word_nb]
from itertools import chain
keywords_replaced_flat = list(chain.from_iterable(keywords_replaced))
replacement_only = [[right_kw[i] for i, j in enumerate(similar_kws_same_word_nb) if word in j] for word in keywords_replaced_flat]
keywords_replaced_flat_post = [replacement_only[i][0] if replacement_only[i] != [] else keywords_replaced_flat[i] for i in range(len(keywords_replaced_flat))]

def gen_list_of_lists(original_list, new_structure):
    assert len(original_list) == sum(new_structure), \
    "The number of elements in the original list and desired structure don't match"
        
    list_of_lists = [[original_list[i + sum(new_structure[:j])] for i in range(new_structure[j])] \
                     for j in range(len(new_structure))]
        
    return list_of_lists

keywords_replaced_unflat_post = gen_list_of_lists(keywords_replaced_flat_post, [len(x) for x in keywords_replaced])
#keywords_replaced = [list(map(lambda x:right_kw,lst_kw)) for lst_kw in keywords_replaced]
#keywords_replaced = [[[list(map(lambda x: right_kw if x==kw else x, lst_kw)) for kw in sim_kws] for sim_kws in similar_kws_same_word_nb]for lst_kw in keywords_replaced]
#right_kw = [keywords_freq[sim_kws].index(max([keywords_freq[word] for word in sim_kws])) for sim_kws in similar_kws_same_word_nb[0:5]]
#right_kw = [[max(range(len(values)), key=values.__getitem__) for word in sim_kws])) for sim_kws in similar_kws_same_word_nb[0:5]]
#max(range(len(values)), key=values.__getitem__)
#right_kw = [[max(keywords_freq[word]) for word in sim_kws] for sim_kws in similar_kws_same_word_nb[0:5]]
#keywords_replaced = [list(map(lambda x: right_kw if x==kw else x, lst_kw)) for lst_kw in keywords_replaced[0:200]]

In [25]:
keywords_replaced = keywords_replaced_unflat_post 

In [26]:
keywords_freq['ayatollah khamenei']

2

In [27]:
keywords_freq['ayatollah khomeini']

3

In [28]:
# keywords_test=[]
# keywords_test.append(['georges', 'clemeanceau'])
# keywords_test.append(['philippe', 'petain'])
# keywords_test.append(['leon', 'blum'])

# all_similar_kw_test = []
# all_similar_kw_test.append(['OSEF','JMEF'])
# all_similar_kw_test.append(['blum','bloom'])
# all_similar_kw_test.append(['putain','petain'])
# all_similar_kw_test.append(['TAMERE','TESMORTS'])

# right_kw_test = []
# right_kw_test.append('JMEF')
# right_kw_test.append('bloom')
# right_kw_test.append('putain')

# keywords_test_list=[]
# keywords_test_list.append('georges')
# keywords_test_list.append('clemeanceau')
# keywords_test_list.append('philippe')
# keywords_test_list.append('petain')
# keywords_test_list.append('leon')
# keywords_test_list.append('blum')

# keywords_test_flat = list(chain.from_iterable(keywords_test))
# replacement_only = [[right_kw_test[i] for i, j in enumerate(all_similar_kw_test) if word in j] for word in keywords_test_flat]
# keywords_test_flat_post = [replacement_only[i][0] if replacement_only[i] != [] else keywords_test_flat[i] for i in range(len(keywords_test_flat))]

# def gen_list_of_lists(original_list, new_structure):
#     assert len(original_list) == sum(new_structure), \
#     "The number of elements in the original list and desired structure don't match"
        
#     list_of_lists = [[original_list[i + sum(new_structure[:j])] for i in range(new_structure[j])] \
#                      for j in range(len(new_structure))]
        
#     return list_of_lists

# keywords_test_unflat_post = gen_list_of_lists(keywords_test_flat_post, [len(x) for x in keywords_test])
#list_index = [[i for i, lst in enumerate(all_similar_kw_test) if word in lst][0] for word in keywords_test[0]]
#keywords_test_FT_list = [right_kw_test[i] if word in all_similar_kw_test[i] else word for word in keywords_test_list for i in range(3)]
#new_words = ['broccoli' if word == 'chicken' else word for word in words] ["foo", "bar", "baz"].index("bar")
#keywords_test_FT_list = [right_kw_test[i] if word in j else word for i,j in enumerate(all_similar_kw_test)]
#keywords_test_FT = [[[word.replace(right_kw_test[i], word) for i, x in enumerate(all_similar_kw_test) if word in x] for word in list] for list in keywords_test]

In [29]:
# test_vide = [[right_kw_test[i] for i, j in enumerate(all_similar_kw_test) if word in j] for word in keywords_test_list]

In [30]:
# TEE = [test_vide[i][0] if test_vide[i] != [] else keywords_test_list[i] for i in range(6)]

In [31]:
# TEE

In [32]:
# Example for visualisation, can be removed later
for sim_kws in all_similar_kw[0:2]:
    for word in sim_kws:
        print('freq of:', word, ' = ', keywords_freq[word])
    print('')

n_show = 2 # how many examples to show

i_show = 0
for kw_before, kw_after in zip(keywords_lower_split_clean_short, keywords_replaced):
    if kw_before != kw_after and i_show < n_show:
        print('kw_before:', kw_before)
        print('kw_after:', kw_after, '\n')
        i_show += 1

freq of: pretzel  =  1
freq of: pretzels  =  2

freq of: road trip  =  1
freq of: roadtrip  =  1

kw_before: ['iran', 'sanctions', 'civil society', 'nuclear deal', 'jcpoa', 'us sanctions', 'tehran', 'hassan rouhani', 'ayatollah khamenei']
kw_after: ['iran', 'sanctions', 'civil society', 'nuclear deal', 'jcpoa', 'us sanctions', 'tehran', 'hassan rouhani', 'ayatollah khomeini'] 

kw_before: ["new year's", 'countdown', 'resolutions']
kw_after: ["new year's", 'countdown', 'solutions'] 



# Save

In [33]:
print('Number of unique keywords before cleaning:', len(set(list(itertools.chain(*keywords)))))
print('Number of unique keywords after first clean:', len(set(list(itertools.chain(*keywords_lower_split_clean_short)))))
print('Number of unique keywords after rapidfuzz replacing:', len(set(list(itertools.chain(*keywords_replaced)))))

Number of unique keywords before cleaning: 32682
Number of unique keywords after first clean: 30279
Number of unique keywords after rapidfuzz replacing: 28056


In [30]:
# TODO: add article ID
list_dates = list(df_subset['lastModifiedDate'])
list_kws = list(df_subset['keywordStrings'])
list_new_kws = keywords_replaced

df_2019_2020 = pd.DataFrame(list(zip(list_dates, list_kws, list_new_kws)), columns=['lastModifiedDate', 'keywordStrings', 'keywordStringsCleanAfterFuzz'])

df_2019_2020.head()

Unnamed: 0,lastModifiedDate,keywordStrings,keywordStringsCleanAfterFuzz
0,2019-01-01T03:57:28.904Z,"[NASA, OSIRIS-REx, Bennu, asteroid]","[nasa, osiris-rex, bennu, asteroid]"
1,2019-01-01T06:11:50.527Z,"[English Channel, migration, boats, illegal im...","[english channel, migration, boats, illegal im..."
2,2019-01-01T06:14:35.563Z,"[Brazil, Jair Bolsonaro, Chicago economics, Ha...","[brazil, jair bolsonaro, chicago economics, ha..."
3,2019-01-01T08:26:11.599Z,"[Japan, Tokyo, Harajuku, attack]","[japan, tokyo, harajuku, attack]"
4,2019-01-01T09:05:00.736Z,"[Asia, Bangladesh, elections, Kamal Hossain, S...","[asia, bangladesh, elections, kamal hossain, s..."


In [31]:
filepath = '../data/interim/clean_keywords_2019-2021_after_RapidFuzzFT.json'

# storing the data in JSON format
df_2019_2020.to_json(filepath, orient = 'split', compression = 'infer', index = 'true')
 
# reading the JSON file
# filepath = '../data/interim/clean_keywords_2019-2021_after_FuzzyWuzzy.json'
# df_loaded = pd.read_json(filepath, orient ='split', compression = 'infer')
# flat_keywords = list(itertools.chain(*list(df['keywordStringsCleanAfterFuzz'])))
 

In [32]:
print('Number of unique keywords after rapidfuzz replacing:', len(set(list(itertools.chain(*all_similar_kw)))))

Number of unique keywords after rapidfuzz replacing: 4900


In [33]:
unique_data = [list(x) for x in set(tuple(x) for x in all_similar_kw)]

In [34]:
len(unique_data)

2842

In [36]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



In [37]:
import gensim
from gensim.models import Word2Vec

# wv = gensim.models.KeyedVectors.load_word2vec_format("/home/ferdinand_t/Downloads/GoogleNews-vectors-negative300.bin", binary=True)

In [38]:
# Count vectorization of text
from sklearn.feature_extraction.text import CountVectorizer
 
# Ticket Data
corpus = df_2019_2020['keywordStringsCleanAfterFuzz'].astype(str)
 
# Creating the vectorizer
vectorizer = CountVectorizer(stop_words='english')
 
# Converting the text to numeric data
X = vectorizer.fit_transform(corpus)
 
#print(vectorizer.get_feature_names())
 
# Preparing Data frame For machine learning
# Priority column acts as a target variable and other columns as predictors
CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
CountVectorizedData['lastModifiedDate']= df_2019_2020['lastModifiedDate'].values
print(CountVectorizedData.shape)
CountVectorizedData.head()

(33830, 21982)




Unnamed: 0,007,04,05,10,100,100m,103,11,110,111,...,övp,özdemir,özil,özlem,út,überall,ünal,ünker,şehriban,lastModifiedDate
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019-01-01T03:57:28.904Z
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019-01-01T06:11:50.527Z
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019-01-01T06:14:35.563Z
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019-01-01T08:26:11.599Z
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019-01-01T09:05:00.736Z


In [45]:
df_2019_2020['keywordStringsCleanAfterFuzz'][0:2]

0                  [nasa, osiris-rex, bennu, asteroid]
1    [english channel, migration, boats, illegal im...
Name: keywordStringsCleanAfterFuzz, dtype: object

In [100]:
# Ticket Data
corpus = df_2019_2020['keywordStringsCleanAfterFuzz'][0:3].astype(str)
 
# Creating the vectorizer
vectorizer = CountVectorizer(stop_words='english')
 
# Converting the text to numeric data
X = vectorizer.fit_transform(corpus)
 
#print(vectorizer.get_feature_names())
 
# Preparing Data frame For machine learning
# Priority column acts as a target variable and other columns as predictors
CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
#CountVectorizedData['lastModifiedDate']= df_2019_2020['lastModifiedDate'].values
print(CountVectorizedData.shape)
CountVectorizedData.head()

(3, 20)




Unnamed: 0,asteroid,bennu,boats,bolsonaro,brazil,channel,chicago,economics,english,guedes,hamilton,illegal,immigration,jair,migration,mourao,nasa,osiris,paulo,rex
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
1,0,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0,0,0,0,0
2,0,0,0,1,1,0,1,1,0,1,1,0,0,1,0,1,0,0,1,0


In [101]:
corpus

0          ['nasa', 'osiris-rex', 'bennu', 'asteroid']
1    ['english channel', 'migration', 'boats', 'ill...
2    ['brazil', 'jair bolsonaro', 'chicago economic...
Name: keywordStringsCleanAfterFuzz, dtype: object

In [102]:
WordsVocab=CountVectorizedData.columns

In [103]:
WordsVocab.shape

(20,)

In [104]:
def FunctionText2Vec(inpTextData):
    # Converting the text to numeric data
    X = vectorizer.transform(inpTextData)
    CountVecData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    # Creating empty dataframe to hold sentences
    W2Vec_Data=pd.DataFrame()
    
    # Looping through each row for the data
    for i in range(CountVecData.shape[0]):
 
        # initiating a sentence with all zeros
        Sentence = np.zeros(300)
 
        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVecData.iloc[i , :]>=1]:
            #print(word)
            if word in wv.key_to_index.keys():    
                Sentence=Sentence+wv[word]
        # Appending the sentence to the dataframe
        W2Vec_Data=W2Vec_Data.append(pd.DataFrame([Sentence]))
    return(W2Vec_Data)

In [68]:
CountVecData.iloc[33829 , :]

NameError: name 'CountVecData' is not defined

In [69]:
#df_2019_2020['keywordStringsCleanAfterFuzz'].head()

In [105]:
import numpy as np
W2Vec_Data=FunctionText2Vec(df_2019_2020['keywordStringsCleanAfterFuzz'][0:3].astype(str))



In [106]:
W2Vec_Data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.182617,0.047119,0.154297,0.467773,-0.137695,-0.589355,0.196259,-0.612854,0.220215,-0.068726,...,0.154785,0.447266,-0.489746,0.336731,0.023804,-0.051758,-0.420166,-0.286865,0.342285,0.759033
0,-0.154053,0.11377,0.041992,-0.263184,-0.71936,-0.577271,-0.123596,0.30957,0.201172,0.320068,...,0.016357,0.215561,0.845909,0.062622,0.300781,0.254883,0.032059,-0.284668,0.453857,0.143555
0,-0.426941,0.242188,0.20929,1.017578,0.179688,0.282715,-0.306396,-0.980957,-0.179443,0.467773,...,-0.016602,0.634399,0.228271,0.607666,-0.173828,-0.10376,-0.198181,-1.099854,-0.002441,0.262787


In [107]:
df_2019_2020['keywordStringsCleanAfterFuzz'][0:3]

0                  [nasa, osiris-rex, bennu, asteroid]
1    [english channel, migration, boats, illegal im...
2    [brazil, jair bolsonaro, chicago economics, ha...
Name: keywordStringsCleanAfterFuzz, dtype: object

: 

In [97]:
sum(W2Vec_Data.iloc[2])

0.0