In [1]:
# Librería o conjunto de herramientas para el uso de lenguaje natural
import nltk 

# Modelo para tokenización
nltk.download('punkt')  
nltk.download('punkt_tab') 
# Corpus que contiene stopwords en diferentes idiomas
nltk.download('stopwords') 
nltk.download('wordnet')
nltk.download('pos_tag')


#Herramienta para tokenizar separando por palabras
from nltk.tokenize import word_tokenize 
#Importar el la herramienta con el corpus de stopwords
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Librería que contiene herramientas de lematización
import spacy
# Cargar el modelo de spaCy para inglés
nlp = spacy.load("en_core_web_sm")


# Librería para carga de datos
import pandas as pd

# Herrmienta para contar la frecuencia
from collections import defaultdict

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mario.fernandezr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mario.fernandezr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mario.fernandezr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mario.fernandezr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading pos_tag: Package 'pos_tag' not found in
[nltk_data]     index


In [2]:
ruta_digcomp = "../data/digcomp/digcomp_nivelCompetencia.csv"
digcomp = pd.read_csv(ruta_digcomp, encoding="latin1")

digcomp = digcomp[digcomp['Idioma'] != 'Español']


In [3]:
def tokenizar_texto_wordnet(texto):
    tokens = word_tokenize(texto)
    return [token for token in tokens]

In [4]:
def tokenizar_texto_spacy(texto):
    doc = nlp(texto.lower())
    return list(doc)

In [5]:
def eliminar_stopwords_wordnet(tokens):
    return [token for token in tokens if token.isalpha() and token not in stop_words]

In [6]:
def eliminar_stopwords_spacy(tokens):
    return [token for token in tokens if token.is_alpha and not token.is_stop]

In [7]:
def lematizar_texto_wordnet(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [8]:
def lematizar_texto_spacy(tokens):
    return [token.lemma_.lower() for token in tokens]

In [9]:
tokens_spacy = digcomp["Texto"].apply(tokenizar_texto_spacy)
tokens_filtrados = tokens_spacy.apply(eliminar_stopwords_spacy)
digcomp["Palabras spacy"] = tokens_filtrados.apply(lematizar_texto_spacy)

tokens_wordnet = digcomp["Texto"].apply(tokenizar_texto_wordnet)
tokens_filtrados = tokens_wordnet.apply(eliminar_stopwords_wordnet)
digcomp["Palabras wordnet"] = tokens_filtrados.apply(lematizar_texto_wordnet)

display(digcomp)

Unnamed: 0,ID_area,ID_competencia,Idioma,ID_nivel,ID_nivelCompetencia,Texto,Verbo,Palabras spacy,Palabras wordnet
27,1,1,Inglés,1,1,"identify my information needs, find data, info...",identify,"[identify, information, need, find, datum, inf...","[identify, information, need, find, data, info..."
28,1,1,Inglés,1,2,"find how to access these data, information and...",find,"[find, access, datum, information, content, na...","[find, access, data, information, content, nav..."
29,1,1,Inglés,1,3,identify simple personal search strategies.,identify,"[identify, simple, personal, search, strategy]","[identify, simple, personal, search, strategy]"
30,1,1,Inglés,2,1,identify my information needs.,identify,"[identify, information, need]","[identify, information, need]"
31,1,1,Inglés,2,2,"find data, information and content through a s...",find,"[find, datum, information, content, simple, se...","[find, data, information, content, simple, sea..."
...,...,...,...,...,...,...,...,...,...
724,5,4,Inglés,6,3,choose the most appropriate opportunities for ...,choose,"[choose, appropriate, opportunity, self, devel...","[choose, appropriate, opportunity, keep, date,..."
725,5,4,Inglés,7,1,create solutions to complex problems with limi...,create,"[create, solution, complex, problem, limited, ...","[create, solution, complex, problem, limited, ..."
726,5,4,Inglés,7,2,integrate my knowledge to contribute to profes...,"integrate, guide","[integrate, knowledge, contribute, professiona...","[integrate, knowledge, contribute, professiona..."
727,5,4,Inglés,8,1,create solutions to solve complex problems wit...,create,"[create, solution, solve, complex, problem, in...","[create, solution, solve, complex, problem, ma..."


In [10]:
pd.set_option('display.max_rows', None)

In [11]:

# Explota la columna "Palabras x" para que cada token ocupe una fila.
spacy = digcomp.explode("Palabras spacy")
wordn = digcomp.explode("Palabras wordnet")
 

# Agrupa por ID_area, ID_competencia y palabra, y cuenta la cantidad de ocurrencias de cada palabra
tabla_frecuencias_spacy = (
    spacy
    .groupby(["Palabras spacy"])
    .size()
    .reset_index(name="Frecuencia")
)

tabla_frecuencias_wn = (
    wordn
    .groupby(["Palabras wordnet"])
    .size()
    .reset_index(name="Frecuencia")
)

# Mostrar la tabla resultante
display(tabla_frecuencias_spacy)


Unnamed: 0,Palabras spacy,Frecuencia
0,access,6
1,act,3
2,adapt,18
3,adjust,6
4,analyse,2
5,analysis,5
6,apply,29
7,appraise,1
8,appropiate,1
9,appropriate,50


In [12]:
display(tabla_frecuencias_wn)

Unnamed: 0,Palabras wordnet,Frecuencia
0,I,8
1,access,6
2,act,3
3,adapt,13
4,adapted,5
5,adjust,6
6,among,1
7,analysing,2
8,analysis,5
9,apply,24
