# Smart Urban System - NLP


## Fase 1: Identificación y extracción de datos


In [4]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Put the project's src/ directory on the import path so that the
# `preprocessor` module imported below can be found. The membership test
# keeps repeated runs of this cell from appending the same entry again.
src_path = Path("../src").resolve()
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.append(src_entry)

from preprocessor import SmartUrbanPreprocessor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Build the preprocessor, then load every raw dataset. The result of
# `load_data()` is used as a mapping of source name -> table: it is iterated
# with .items() and each value supports len().
processor = SmartUrbanPreprocessor()
raw_data = processor.load_data()

# Report how many rows each source contributed.
for source_name, source_df in raw_data.items():
    print(f"{source_name}: {len(source_df)} rows")

Initializing NLP models...
Loading Stores Complaints...
Loading Financial Complaints...
Loading University Complaints...
Loading Amazon Reviews...
Loading News Sentiments...
Falling back to ISO-8859-1 encoding for news
Removing non-UTF-8 characters from news text
Removing non-UTF-8 characters from news sentiment
Converted news to UTF-8
stores: 81883 rows
financial: 162421 rows
university: 1005 rows
amazon: 30000 rows
news: 4846 rows


In [3]:
# (rows, columns) of each raw dataset, keyed by source name.
{name: frame.shape for name, frame in raw_data.items()}

{'stores': (81883, 24),
 'financial': (162421, 3),
 'university': (1005, 8),
 'amazon': (30000, 9),
 'news': (4846, 2)}

In [4]:
# Bind each raw dataset to its own variable. The Amazon data lives under the
# key "amazon" in `raw_data` but is referred to as `reviews` from here on.
stores, financial, university, reviews, news = (
    raw_data[key]
    for key in ("stores", "financial", "university", "amazon", "news")
)

# The same tables, keyed by the short names used in the EDA cells.
dfs = {
    "stores": stores,
    "financial": financial,
    "university": university,
    "reviews": reviews,
    "news": news,
}

### 1.1. EDA


In [5]:
# First two rows of every dataset, keyed by name.
{name: frame.head(2) for name, frame in dfs.items()}

{'stores':       ID_EXP FECHA_INGRESO            FECHA_FIN      FECHA DE CIERRE  \
 0  2022_3584    2022-01-03  2022-08-15 00:00:00  2022-08-15 00:00:00   
 1  2022_2645    2022-01-03  2022-03-29 00:00:00  2022-03-29 00:00:00   
 
                TIPO_CONCILIACION ESTADO_PROCESAL  \
 0  Turnada a Concil Person p/seg      Conciliada   
 1  Turnada a Concil Person p/seg   Desistimiento   
 
                             PROVEEDOR                    NOMBRE_COMERCIAL  \
 0  DIVERSIDAD PARA EL HOGAR, SA DE CV  DIVERSIDAD PARA EL HOGAR, SA DE CV   
 1                        PROMEDIO 100                        PROMEDIO 100   
 
                                     GIRO  \
 0                              MUEBLERÍA   
 1  ESCUELA DE EDUCACIÓN PRIMARIA PRIVADA   
 
                                   SECTOR  ... COSTO BIEN SERVICIO  \
 0                               MUEBLERO  ...               16878   
 1  ESCUELA DE EDUCACIÓN PRIMARIA PRIVADA  ...             51111.8   
 
   MONTO RECLAMADO MONT

In [6]:
# Shape and per-column dtypes of every raw dataset, keyed by source name.
{name: (frame.shape, frame.dtypes) for name, frame in raw_data.items()}

{'stores': ((81883, 24),
  ID_EXP                         object
  FECHA_INGRESO          datetime64[ns]
  FECHA_FIN                      object
  FECHA DE CIERRE                object
  TIPO_CONCILIACION              object
  ESTADO_PROCESAL                object
  PROVEEDOR                      object
  NOMBRE_COMERCIAL               object
  GIRO                           object
  SECTOR                         object
  ODECO                          object
  ESTADO_UA                      object
  TIPO_RECLAMACION               object
  MOTIVO_RECLAMACION             object
  COSTO BIEN SERVICIO            object
  MONTO RECLAMADO               float64
  MONTO RECUPERADO               object
  PROCEDIMIENTO                  object
  BIEN O SERV                    object
  MEDIO INGRESO                  object
  TIPO PROD                      object
  MODALIDAD COMPRA               object
  MODALIDAD PAGO                 object
  PROB ESPECIAL                  object
  dtype: object

### 1.2. Unificar


In [8]:
# Merge all sources into a single table.
unified_df = processor.normalize(raw_data)

# Quick look: dimensions, the first five rows (head's default), column dtypes.
print(unified_df.shape)
display(unified_df.head())
print(unified_df.dtypes)

(280155, 5)


Unnamed: 0,id,text,source,language,sentiment_score
0,de_0784695,"Leider, leider nach einmal waschen ausgebliche...",amazon,de,bad
1,de_0759207,zunächst macht der Anker Halter einen soliden ...,amazon,de,bad
2,de_0711785,Siegel sowie Verpackung war beschädigt und war...,amazon,de,bad
3,de_0964430,Habe dieses Produkt NIE erhalten und das Geld ...,amazon,de,bad
4,de_0474538,Die Träger sind schnell abgerissen Reißverschl...,amazon,de,bad


id                 object
text               object
source             object
language           object
sentiment_score    object
dtype: object


In [None]:
# Missing values: number of nulls in each column of the unified table.
# (The percentage that used to be computed here was never displayed or used,
# so the dead computation has been dropped.)
missing = unified_df.isnull().sum()
missing_df = pd.DataFrame({
    'Count': missing,
})
display(missing_df)

# Keep only rows that have text, printing the shape before and after so the
# number of dropped rows is visible.
print(unified_df.shape)
working_df = unified_df[unified_df['text'].notna()]
print(working_df.shape)

print("")

# Unique values: cardinality of every object-dtype column; the distinct values
# themselves are listed only for columns with at most 10 of them.
categorical_cols = working_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    unique_count = working_df[col].nunique()
    print(f"- Column {col}: {unique_count} unique values")
    if unique_count <= 10:
        print(f"\t- Values: {working_df[col].unique()}")


Unnamed: 0,Count
id,0
text,1
source,0
language,0
sentiment_score,0


(280155, 5)
(280154, 5)

- Column id: 192420 unique values
- Column text: 273270 unique values
- Column source: 5 unique values
	- Values: ['amazon' 'stores' 'financial' 'university' 'news']
- Column language: 6 unique values
	- Values: ['de' 'en' 'es' 'fr' 'ja' 'zh']
- Column sentiment_score: 3 unique values
	- Values: ['bad' 'neutral' 'good']


## Fase 2: Limpieza y normalización textual


In [None]:
# Run the corpus-processing pipeline with spell correction switched off.
# The recorded log for this run shows language filtering (en/es), duplicate
# removal, text cleaning and tokenizing/lemmatizing; the progress bar reports
# about 10.5 minutes for ~253k documents, so this is the slow cell.
# NOTE(review): the input is `unified_df`, which still contains the one row
# with null text found in the previous cell (`working_df` is not used here) —
# confirm process_corpus handles that row.
clean_df = processor.process_corpus(unified_df, apply_spell_correction=False)
display(clean_df.head())

Processing corpus with 280155 documents...
Filtering by language (en/es)...
Removed 20000 documents with unsupported languages
Removing duplicates...
Removed 6884 duplicate entries
Cleaning text...
Tokenizing and lemmatizing...


Processing: 100%|██████████| 253271/253271 [10:34<00:00, 399.38it/s]  


Processing complete. Final corpus size: 253271 documents


Unnamed: 0,id,text,source,language,sentiment_score,clean_text,tokens,lemmas,pos_tags,entities,lemmas_text
5000,en_0199937,"These are AWFUL. They are see through, the fab...",amazon,en,bad,these are awful they are see through the fabri...,"[these, are, awful, they, are, see, through, t...","[these, be, awful, they, be, see, through, the...","[DET, AUX, ADJ, PRON, AUX, VERB, ADP, DET, NOU...","[{'text': 'don', 'label': 'PERSON', 'start': 2...",these be awful they be see through the fabric ...
5001,en_0863335,I bought 4 and NONE of them worked. Yes I used...,amazon,en,bad,i bought 4 and none of them worked yes i used ...,"[i, bought, 4, and, none, of, them, worked, ye...","[I, buy, 4, and, none, of, they, work, yes, I,...","[PRON, VERB, NUM, CCONJ, NOUN, ADP, PRON, VERB...","[{'text': '4', 'label': 'CARDINAL', 'start': 9...",I buy 4 and none of they work yes I use new ba...
5002,en_0565010,On first use it didn't heat up and now it does...,amazon,en,bad,on first use it didn't heat up and now it does...,"[on, first, use, it, did, n't, heat, up, and, ...","[on, first, use, it, do, not, heat, up, and, n...","[ADP, ADJ, NOUN, PRON, AUX, PART, VERB, ADP, C...","[{'text': 'first', 'label': 'ORDINAL', 'start'...",on first use it do not heat up and now it do n...
5003,en_0963290,You want an HONEST answer? I just returned fro...,amazon,en,bad,you want an honest answer i just returned from...,"[you, want, an, honest, answer, i, just, retur...","[you, want, an, honest, answer, I, just, retur...","[PRON, VERB, DET, ADJ, NOUN, PRON, ADV, VERB, ...","[{'text': 'first', 'label': 'ORDINAL', 'start'...",you want an honest answer I just return from u...
5004,en_0238156,The glue works fine but the container is impos...,amazon,en,bad,the glue works fine but the container is impos...,"[the, glue, works, fine, but, the, container, ...","[the, glue, work, fine, but, the, container, b...","[DET, NOUN, VERB, ADV, CCONJ, DET, NOUN, AUX, ...",[],the glue work fine but the container be imposs...


### 2.1. Guardado


In [None]:
# Persist the processed corpus as CSV, without the row index.
# Caveat: in the preview above, `tokens`, `lemmas`, `pos_tags` and `entities`
# appear to hold Python lists. CSV stores them as text, so they come back as
# plain strings when this file is read again with pd.read_csv.
output_path = Path("../data/processed_corpus.csv")
clean_df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ..\data\processed_corpus.csv


### 2.2. ¿Está balanceado el corpus?

In [1]:
import pandas as pd

# Reload the processed corpus from the CSV written in section 2.1 (presumably
# so this section can run without repeating the slow cleaning step).
# NOTE(review): the recorded output echoes this line, which is how Python
# prints a warning raised here; the warning text itself is not shown. It may be
# a mixed-dtype warning — confirm, and consider passing explicit dtypes.
clean_df = pd.read_csv("../data/processed_corpus.csv")

  clean_df = pd.read_csv("../data/processed_corpus.csv")


In [6]:
# Number of processed documents per source.
# NOTE(review): the recorded output shows `university` at 11 rows, down from
# 1005 rows in the raw load — confirm that the language filter / duplicate
# removal is expected to discard that many.
print("Source counts:\n", clean_df['source'].value_counts())

Source counts:
 source
financial     162421
stores         76001
amazon         10000
news            4838
university        11
Name: count, dtype: int64


In [None]:
# Number of processed documents per sentiment label.
# In the recorded output 'bad' accounts for 243037 of 253271 rows (~96%), which
# is the imbalance the undersampling cell below addresses.
# NOTE(review): the recorded output also contains a "Total Unified Rows" line
# that this code does not print — the output is stale; re-run the cell.
print("Sentiment counts:\n", clean_df['sentiment_score'].value_counts())


Total Unified Rows: 253271
Sentiment counts:
 sentiment_score
bad        243037
good         5362
neutral      4872
Name: count, dtype: int64


In [8]:
from pathlib import Path

import pandas as pd
from sklearn.utils import resample

# This section starts from the saved CSV (the kernel was restarted and only
# pandas re-imported), so the cell imports everything it uses, including Path,
# instead of depending on the setup cell at the top of the notebook.

# One seed shared by the per-class sampling and the final shuffle.
RANDOM_STATE = 42


def undersample(class_df, n_samples, random_state=RANDOM_STATE):
    """Return `n_samples` rows drawn from `class_df` without replacement.

    Args:
        class_df: rows belonging to a single sentiment class.
        n_samples: number of rows to keep.
        random_state: seed passed to sklearn's `resample`.

    Returns:
        The sampled subset of `class_df`.
    """
    return resample(class_df,
                    replace=False,
                    n_samples=n_samples,
                    random_state=random_state)


# Split by class
bad_df = clean_df[clean_df['sentiment_score'] == 'bad']
good_df = clean_df[clean_df['sentiment_score'] == 'good']
neutral_df = clean_df[clean_df['sentiment_score'] == 'neutral']

# Size of the minority class: every class is cut down to this many rows
min_size = min(len(good_df), len(neutral_df), len(bad_df))
print(f"Undersampling to {min_size} samples per class")

# Undersample each class
bad_downsampled = undersample(bad_df, min_size)
good_downsampled = undersample(good_df, min_size)
neutral_downsampled = undersample(neutral_df, min_size)

# Combine and shuffle (the concat order is fixed so the seeded shuffle is
# reproducible)
balanced_df = pd.concat([bad_downsampled, good_downsampled, neutral_downsampled])
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Verify
print(f"\nTotal Balanced Rows: {len(balanced_df)}")
print("Balanced sentiment counts:\n", balanced_df['sentiment_score'].value_counts())

# Save
balanced_path = Path("../data/processed_corpus_balanced.csv")
balanced_df.to_csv(balanced_path, index=False)
print(f"Saved to {balanced_path}")

Undersampling to 4872 samples per class

Total Balanced Rows: 14616
Balanced sentiment counts:
 sentiment_score
good       4872
neutral    4872
bad        4872
Name: count, dtype: int64
Saved to ..\data\processed_corpus_balanced.csv
