In [1]:
import spacy
import json
import tqdm
import pandas as pd

with open('tweets.json') as json_file:
    records = json.load(json_file)
    
nlp_md = spacy.load('es_core_news_md')
nlp_lg = spacy.load('es_core_news_lg')



In [2]:


def add_tag(dictionary, key, value):
    if key not in dictionary:
        dictionary[key] = [value]
    elif value not in dictionary[key]:
        dictionary[key] += [value]
    return dictionary
    
    
def get_tags(tweet, nlp):
    included_tags = ['LOC', 'PER', 'NOUN', 'VERB', 'PROPN', 'HASHTAGS', 'URLS', 'ORG']
    tags = {}
    key_words = []
    doc = nlp(tweet['text'])
    for ent in doc.ents:
        if ent.label_ in included_tags:
            tags = add_tag(tags, ent.label_, ent.text)
            key_words += [ent.text]
    for key in tags:
        tweet[key] = tags[key]
    tweet['key_words'] = list(set(key_words))
    return tweet



In [3]:
tweets_md = [get_tags(records[key], nlp_md) for key in tqdm.tqdm(records)]
tweets_lg = [get_tags(records[key], nlp_lg) for key in tqdm.tqdm(records)]

100%|██████████| 49/49 [00:00<00:00, 109.53it/s]
100%|██████████| 49/49 [00:00<00:00, 112.67it/s]


In [4]:
tweets_md_df = pd.DataFrame(tweets_md)
tweets_lg_df = pd.DataFrame(tweets_lg)


In [5]:
tweets_md_df.head()

Unnamed: 0,created_at,id,text,key_words,PER,ORG,LOC
0,2021-02-20T11:25:02.000Z,1363087261148065798,El rapero que hizo chocar a con sus límites de...,[],,,
1,2021-02-20T11:15:02.000Z,1363084743840059392,"Josefina Huffington Archbold, presidenta de la...","[Providencia, Old Providence, Josefina Huffing...",[Josefina Huffington Archbold],[Old Providence],"[Old Providence, Providencia]"
2,2021-02-20T11:15:00.000Z,1363084734436376578,Los países ricos acumulan las vacunas que falt...,[],,,
3,2021-02-20T11:11:12.000Z,1363083777430482944,Caso Uribe: testigo perdida y pruebas de últim...,"[Caso Uribe, Iván Cepeda]","[Caso Uribe, Iván Cepeda]",,
4,2021-02-20T11:10:02.000Z,1363083486140194819,El presidente Iván Duque inauguró el Módulo de...,"[La Guajira, Gobierno, Maicao, Iván Duque]",[Iván Duque],[Gobierno],"[Maicao, La Guajira]"


In [6]:
tweets_lg_df.head()

Unnamed: 0,created_at,id,text,key_words,PER,ORG,LOC
0,2021-02-20T11:25:02.000Z,1363087261148065798,El rapero que hizo chocar a con sus límites de...,[],,,
1,2021-02-20T11:15:02.000Z,1363084743840059392,"Josefina Huffington Archbold, presidenta de la...","[Providencia, Old Providence, Josefina Huffing...",[Josefina Huffington Archbold],[Old Providence],"[Old Providence, Providencia]"
2,2021-02-20T11:15:00.000Z,1363084734436376578,Los países ricos acumulan las vacunas que falt...,[],,,
3,2021-02-20T11:11:12.000Z,1363083777430482944,Caso Uribe: testigo perdida y pruebas de últim...,"[Caso Uribe, Iván Cepeda]","[Caso Uribe, Iván Cepeda]",,
4,2021-02-20T11:10:02.000Z,1363083486140194819,El presidente Iván Duque inauguró el Módulo de...,"[La Guajira, Gobierno, Maicao, Iván Duque]",[Iván Duque],[Gobierno],"[Maicao, La Guajira]"
