# Exploratory Data Analysis

In [None]:
import os
import pandas as pd
import langdetect
import spacy
from sklearn.feature_extraction.text import ...

# Parameters

In [None]:
# Input: name of the CSV file to analyse
file_input = "prep.csv"

# Paths: folder that holds the input file
path_interim = os.path.join("data", "interim")

# Load data

In [None]:
# Full path to the input file: <path_interim>/<file_input>
path_data_eda = os.path.join(path_interim, file_input)

# TODO: complete the pandas call that reads `path_data_eda` into a DataFrame
df_eda = pd. ...
df_eda.head()  # preview the first rows

# Check corpus language

In [None]:
# Apply the appropriate function from `langdetect` to every example in the
# `x_text` column of `df_eda`.
# Not every instance may be detected as Spanish, given that the tool is not perfect.
df_eda['lang'] = df_eda['x_text'].apply(...)

In [None]:
# Count the values of `df_eda['lang']` as percentages (relative frequencies)

# Vocabulary frequency

In [None]:
# Helper cell
from unidecode import unidecode
import typing

# Load the Spanish spaCy pipeline `es_core_news_lg`.
# `nlp` must exist before `tokenizer_lemma_es` is defined, because that
# function's default `max_input_len` reads `nlp.max_length` at definition time.
nlp = spacy.load('es_core_news_lg')


def tokenizer_lemma_es(text,
                       max_input_len=nlp.max_length,  # 1000000
                       min_token_len=2,
                       ) -> typing.List[str]:
    """Tokenize Spanish text and return its filtered, transliterated lemmas.

    The text is cut to its first ``max_input_len`` characters and parsed with
    the module-level spaCy pipeline ``nlp``. A token contributes its lemma
    only if it is alphabetic, strictly longer than ``min_token_len``, not a
    stop word, not e-mail-like, not URL-like, not a currency symbol and not
    part of an entity labelled ``PER``, ``LOC`` or ``ORG``. Every kept lemma
    is passed through ``unidecode``.

    Args:
        text: Raw document to tokenize.
        max_input_len: Number of leading characters of ``text`` to parse;
            defaults to ``nlp.max_length``.
        min_token_len: Tokens must be longer than this value to be kept
            (the comparison is strict, so tokens of exactly this length
            are dropped).

    Returns:
        The lemmas of the kept tokens, in document order.
    """
    skipped_entities = ['PER', 'LOC', 'ORG']

    # Truncate the input document to the maximum length accepted by the
    # spaCy model before parsing it.
    clipped_text = text[:max_input_len]

    kept_lemmas = []
    for tok in nlp(clipped_text):
        # Shape filters: alphabetic tokens that are long enough.
        if not tok.is_alpha or len(tok) <= min_token_len:
            continue
        # Lexical filters: stop words, e-mails, URLs and currency symbols.
        if tok.is_stop or tok.like_email or tok.like_url or tok.is_currency:
            continue
        # Named-entity filter: people, locations and organisations.
        if tok.ent_type_ in skipped_entities:
            continue
        kept_lemmas.append(unidecode(tok.lemma_))
    return kept_lemmas


# Sanity check: run the tokenizer on the `x_text` value of the row labelled 0
example = df_eda.loc[0, "x_text"]
ex_lemma = tokenizer_lemma_es(example)

# The `{name=}` f-string form prints the expression followed by its repr
print(f"{example=}")
print(f"{ex_lemma=}")

# Expected output:
# example='Poder crear un usuario y acceder a través de él a la aplicación'
# ex_lemma=['crear', 'usuario', 'acceder', 'aplicacion']


In [None]:
# TODO: replace `XXXVectorizer` and every `...` placeholder following the hints below
cvect_lemma = XXXVectorizer(
    # Use the correct one to get exact term counts
    # Do not remove stopwords
    # Use unigrams only
    # Do not discard any frequent/infrequent term in the vocabulary

    tokenizer=tokenizer_lemma_es,
    stop_words=...,
    ngram_range=...,
    min_df=...,
    max_df=...,
)

# Learn the vocabulary from `x_text` and build the document-term matrix in one step
dtm = cvect_lemma.fit_transform(df_eda['x_text'])

dtm.shape  # note the number of columns

In [None]:
# Helper cell: this dataframe should have terms as columns and docs as rows;
# each cell holds the term frequency in the given doc

df_dtm = pd.DataFrame(
    # `toarray()` yields a plain ndarray; `todense()` would return the
    # discouraged `numpy.matrix` type. The resulting DataFrame is the same.
    data=dtm.toarray(),
    columns=cvect_lemma.get_feature_names_out(),
)

df_dtm.head(2)

In [None]:
# Perform the correct operation on `df_dtm` to obtain the total term frequency
# (one value per term, aggregated over all documents)

se_vocab = ...
print(f"{type(se_vocab)=}")  # should be 1-dim (a pandas.Series)
print(f"{se_vocab.shape=}")  # should match DTM number of columns

In [None]:
# TODO: sort `se_vocab` so that the most frequent terms come first
se_vocab_srt = ...  # order terms by descending frequency

In [None]:
# Total vocabulary size: number of entries in `se_vocab_srt`
len(se_vocab_srt)

In [None]:
# Number of terms with frequency = 1:
# select the entries equal to 1 with a boolean mask, then count them
len(se_vocab_srt[se_vocab_srt==1])

In [None]:
# number of terms with frequency = 2

In [None]:
# number of terms with frequency = < 5

In [None]:
# To check how many terms share each frequency value:
# se_freq_vals_sizes = se_vocab_srt.value_counts()
# se_freq_vals_sizes  # observed frequency -> number of terms with that frequency


In [None]:
# Most frequent terms: first 10 entries of `se_vocab_srt`
# (assumes it is sorted by descending frequency)
se_vocab_srt.head(10)

In [None]:
# Least frequent terms: last 10 entries of `se_vocab_srt`
# (assumes it is sorted by descending frequency)
se_vocab_srt.tail(10)

In [None]:
# Bar chart of the first 30 entries of `se_vocab_srt`
se_vocab_srt.head(30).plot.bar(
    title="Top 30 most frequent terms"
)

In [None]:
# TODO: build a boolean mask over `se_vocab_srt`; the plot title below implies
# keeping only the terms whose frequency is above the threshold (freq > 10)
mask = ...  # filter out terms with freq below threshold

# Among the kept terms, plot the last 30 entries
se_vocab_srt[mask].tail(30).plot.bar(
    title="Bottom 30 least frequent terms (with freq >10)"
)
