In [2]:
import pandas as pd

# Read the downloaded contracts file and print a summary of its columns.
source_csv = '../../data/minors_downloaded.csv'
df = pd.read_csv(source_csv)
df.info()

# The free-text column that the following cells measure and filter on.
descriptions = df['Description']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1505 entries, 0 to 1504
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1505 non-null   int64 
 1   id           1505 non-null   int64 
 2   zip          1505 non-null   object
 3   file name    1505 non-null   object
 4   entry        1505 non-null   int64 
 5   Description  1505 non-null   object
 6   CPV          1505 non-null   object
 7   URI          1505 non-null   object
dtypes: int64(3), object(5)
memory usage: 94.2+ KB


## Data balancing (removing description-length outliers)

In [3]:
# Length (in characters) of each description. The vectorized .str.len()
# replaces a Python-level apply(len) and yields NaN instead of raising
# if a description is missing.
description_lengths = descriptions.str.len()

# Multiplier of the conventional 1.5 * IQR outlier rule.
IQR_MULTIPLIER = 1.5

Q1 = description_lengths.quantile(0.25)
Q3 = description_lengths.quantile(0.75)
IQR = Q3 - Q1

# Lengths outside [lower_threshold, upper_threshold] are treated as outliers.
lower_threshold = Q1 - IQR_MULTIPLIER * IQR
upper_threshold = Q3 + IQR_MULTIPLIER * IQR

print(lower_threshold, upper_threshold)

80.0 456.0


In [4]:
# Keep only the rows whose description length lies within the thresholds
# (both bounds inclusive), then renumber the rows from 0.
within_thresholds = (description_lengths >= lower_threshold) & (description_lengths <= upper_threshold)
filtered_df = df[within_thresholds].reset_index(drop=True)

# info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392 entries, 0 to 1391
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1392 non-null   int64 
 1   id           1392 non-null   int64 
 2   zip          1392 non-null   object
 3   file name    1392 non-null   object
 4   entry        1392 non-null   int64 
 5   Description  1392 non-null   object
 6   CPV          1392 non-null   object
 7   URI          1392 non-null   object
dtypes: int64(3), object(5)
memory usage: 87.1+ KB
None


## Text preprocessing

In [5]:
import nltk
from nltk.corpus import stopwords
import spacy

# Load spaCy's Spanish language model
nlp = spacy.load('es_core_news_sm')

# Download the NLTK resources used by the preprocessing: the stop-word
# lists and the 'punkt' tokenizer data. Each package is requested once.
nltk.download('stopwords')
nltk.download('punkt')

# Define Spanish stop words (a set gives fast membership checks)
spanish_stopwords = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import re
import spacy

# spaCy Spanish pipeline used for lemmatization in preprocess_text.
nlp = spacy.load('es_core_news_sm')

# Day and month names that are removed from every description. Built once
# as a frozenset so the per-token membership check is a hash lookup.
_CALENDAR_WORDS = frozenset([
    'lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado', 'domingo',
    'enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio', 'agosto',
    'septiembre', 'octubre', 'noviembre', 'diciembre',
])


def preprocess_text(text):
    """Normalize a Spanish description into a string of lemmas.

    Each NLTK token goes through the following steps, in order:

    1. numbers of two or more digits, date-like fragments and the words
       'tel' / 'email' are stripped;
    2. day and month names are discarded;
    3. every character that is not a Spanish letter is removed;
    4. the token is lowercased;
    5. empty tokens and NLTK Spanish stop words are discarded.

    The surviving tokens are lemmatized with spaCy.

    Relies on the notebook-level names ``nltk``, ``re``,
    ``spanish_stopwords`` and ``nlp``.

    Args:
        text: Raw description string.

    Returns:
        The lemmas joined by single spaces; an empty string when no token
        survives the cleaning.
    """
    cleaned_tokens = []
    for token in nltk.word_tokenize(text, language='spanish'):
        # Remove unnecessary data
        token = re.sub(r'\b\d{2,}\b|(?:\d{1,2}[\/\-\.]){2,}\d{2,4}|\b(?:tel|email)\b', '', token)
        if token.lower() in _CALENDAR_WORDS:
            continue

        # Remove non-alphabetic data and lowercase
        token = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑüÜ]', '', token).lower()

        # Punctuation and number tokens are empty strings by now. Dropping
        # them keeps runs of spaces out of the text handed to spaCy, which
        # would otherwise come back as whitespace "lemmas" in the result.
        if not token or token in spanish_stopwords:
            continue

        cleaned_tokens.append(token)

    # Lemmatization
    doc = nlp(' '.join(cleaned_tokens))
    return ' '.join(token.lemma_ for token in doc)

In [7]:
# Run the preprocessing over every remaining description and store the
# result next to the original text.
filtered_descriptions = filtered_df['Description']

preprocessed_descriptions = list(map(preprocess_text, filtered_descriptions))
filtered_df['processed_description'] = preprocessed_descriptions

Add a new column for models that take non-preprocessed text (the text is only lowercased, because some models are case-sensitive).

In [8]:
# Lowercased copy of the original description; no other cleaning is applied.
filtered_df['lowercase_description'] = filtered_df['Description'].str.lower()

In [9]:
# Drop unnecessary columns. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the columns are
# already gone.
filtered_df = filtered_df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392 entries, 0 to 1391
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   zip                    1392 non-null   object
 1   file name              1392 non-null   object
 2   entry                  1392 non-null   int64 
 3   Description            1392 non-null   object
 4   CPV                    1392 non-null   object
 5   URI                    1392 non-null   object
 6   processed_description  1392 non-null   object
 7   lowercase_description  1392 non-null   object
dtypes: int64(1), object(7)
memory usage: 87.1+ KB


In [10]:
# Write the filtered, preprocessed DataFrame to disk as CSV.
# NOTE(review): with the default settings the row index is written as an
# extra unnamed first column; consider index=False if no reader needs it.
output_csv = '../../data/minors_with_preprocessed_descriptions.csv'
filtered_df.to_csv(output_csv)