## Feature Extraction

Features this notebook extracts:

1. Average sentence length in words
2. Type-Token Ratio
3. POS tag proportion
4. Proportion of content vs function words
5. Length of sentence in words

In [10]:
# download punkt package for tokenization
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [28]:
# Example text in Spanish
text = """Me llamo Darya y soy estudiante de la Universidad de Columbia Británica. 
        Estudio lingüística computacional y ciencia de datos que incluye el aprendizaje automático y redes neuronales."""

In [29]:
# Average sentence length (in words)
from statistics import mean
sentence_list = nltk.tokenize.sent_tokenize(text)
print(sentence_list)
avg_sent_len = mean([len(sent.split(" ")) for sent in sentence_list])
print(avg_sent_len)

['Me llamo Darya y soy estudiante de la Universidad de Columbia Británica.', 'Estudio lingüística computacional y ciencia de datos que incluye el aprendizaje automático y redes neuronales.']
13.5


In [30]:
# Type-token ratio

# tokenize
tokens = nltk.word_tokenize(text)

# types
types = set(tokens)

# TTR
TTR = len(types)/len(tokens) * 100

print(TTR)

82.75862068965517


In [3]:
# Download latest version of spacy
!pip install -U spacy --user

In [5]:
# Use es_core_news_md pipeline for POS tagging
!python -m spacy download es_core_news_md

In [14]:
import spacy
nlp = spacy.load("es_core_news_md")

In [31]:
# POS tag proportion

from collections import defaultdict

doc = nlp(text)
pos_to_count = defaultdict(int)

total_len = len(tokens)
for token in doc:
    pos = token.pos_
    pos_to_count[pos] += 1

pos_to_prop = defaultdict(float)
for pos, count in pos_to_count.items():
    pos_to_prop[pos] = count/total_len

print(pos_to_prop)

defaultdict(<class 'float'>, {'PRON': 0.06896551724137931, 'VERB': 0.06896551724137931, 'PROPN': 0.1724137931034483, 'CCONJ': 0.10344827586206896, 'AUX': 0.034482758620689655, 'NOUN': 0.20689655172413793, 'ADP': 0.10344827586206896, 'DET': 0.06896551724137931, 'PUNCT': 0.06896551724137931, 'SPACE': 0.034482758620689655, 'ADJ': 0.10344827586206896})


In [49]:
# Proportion of Content vs function words

CONTENT_POS = {'VERB', 'NOUN', 'PROPN', 'ADP', 'ADJ', 'ADV'}
FUNCTION_POS = {'CONJ', 'CCONJ', 'SCONJ', 'AUX', 'DET', 'PRON', 'INTJ', 'NUM', 'PART'}
total_len = len(tokens)

doc = nlp(text)

cat_count = {'CONTENT': 0,
             'FUNCTION': 0}

total = 0
for token in doc:
    pos = token.pos_
    if pos in CONTENT_POS:
        cat_count['CONTENT'] += 1
        total += 1
    elif pos in FUNCTION_POS:
        cat_count['FUNCTION'] += 1
        total += 1
    elif pos == 'SPACE':
        pass #ignore
    else:# punctuation, which we ignore but need to remove from total_len
        total_len -= 1


assert(cat_count['CONTENT'] + cat_count['FUNCTION'] == total_len)
cat_prop = {'CONTENT': 0.0,
            'FUNCTION': 0.0}
for cat, count in cat_count.items():
    cat_prop[cat] = cat_count[cat]/total_len

print(cat_prop)

assert(round(cat_prop['CONTENT'] + cat_prop['FUNCTION'], 2) == 1)

{'CONTENT': 0.7037037037037037, 'FUNCTION': 0.2962962962962963}


In [26]:
# Text length
text_len = len(tokens)
print(text_len)