# **Importacion paquetes**
---

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import gender_guesser.detector as gender
from wordcloud import WordCloud
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
from deepface import DeepFace
from textblob import TextBlob
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-07-04 02:10:19.872272: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# **Carga de los datos**
---

In [2]:
data=pd.read_csv('../../../Dataset_Review/Raw/CommentDataset_v2.csv').iloc[:,1:]

# **Extracción del genero**
---

In [3]:
d = gender.Detector(case_sensitive=False)
def solo_un_nombre(name):
    partes = name.split()
    if len(partes) > 1:
        return partes[0]
    else:
        return name

data['first_name'] = data['name'].apply(solo_un_nombre)
data['gender'] = data['first_name'].apply(d.get_gender)

data.loc[data['name'].str.contains('|'.join(['Javier', 'Jesus', 'Jesús', 'João','Txema'])), 'gender'] = 'male'
data.loc[data['name'].str.contains('|'.join(['Glória', 'Mary', 'Rocío', 'Lupe','Carol', 'Luz', 'Mary Paule', 'Leslie','María'])), 'gender'] = 'female'
data['gender']=data['gender'].str.replace('mostly_female','female').str.replace('mostly_male','male').str.replace('andy','unknown')

# **Extraccion de la valoracion**
---

In [4]:
def extraer_numeros(texto):
    numeros = re.findall(r'\d+', texto)
    if numeros:
        return int(numeros[0])
    else:
        return None

data['rating'] = data['valoracion'].apply(extraer_numeros)
data.drop('valoracion', axis=1, inplace=True)

# **Extracción del sentimiento**
---

In [5]:
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

data['sentimiento'] = data['comentario'].astype(str).apply(get_sentiment)

# **Es pregunta**
---

In [6]:
data['pregunta']=data['comentario'].str.contains('¿?').astype(float)

# **Promedio long palabra**
---

In [7]:
def prom_long(x):
    return sum(len(word) for word in TextBlob(x).words) / len(TextBlob(x).words)
data['prom_long_word']=data['comentario'].astype(str).apply(lambda x: prom_long(x))

# **Numero adjetivos**
---

In [8]:
def num_adj(x):
    return sum(1 for word, tag in zip(TextBlob(x).words,TextBlob(x).tags) if tag[1]=='JJ')
data['num_adj'] = data['comentario'].astype(str).apply(num_adj)

# **Limpieza del comentario**
---

In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('spanish'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data['text_clean'] = data['comentario'].astype(str).apply(preprocess_text)
data['tokens'] = data['text_clean'].apply(nltk.word_tokenize)
data['tokens'] = data['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
data['text_clean'] = data['tokens'].apply(lambda x: ' '.join(x))
data.drop('comentario',axis=1,inplace=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mariolamas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Problemas**
---

In [10]:
problemas_comunes = ['ruido', 'limpieza', 'ubicación', 'wifi', 'parking', 'baños']
def contar_problemas(text, problemas):
    contador = {problema: text.count(problema) for problema in problemas}
    return contador

data['problemas'] = data['text_clean'].apply(lambda x: contar_problemas(x, problemas_comunes))

# **Separación problemas**
---

In [11]:
data['prob_ruido']=data['problemas'].apply(lambda x: x['ruido'])
data['prob_limp']=data['problemas'].apply(lambda x: x['limpieza'])
data['prob_ubi']=data['problemas'].apply(lambda x: x['ubicación'])
data['prob_wf']=data['problemas'].apply(lambda x: x['wifi'])
data['prob_park']=data['problemas'].apply(lambda x: x['parking'])
data['prob_bañ']=data['problemas'].apply(lambda x: x['baños'])
data.drop('problemas',axis=1,inplace=True)

# **Número de tokens**
---

In [12]:
data['num_toks']=data['tokens'].apply(len)

# **antiguedad**
---

In [13]:
data['ubicacion']=data['ubicacion'].apply(lambda x: np.nan if 'Lleva' not in x else re.findall(r'\d+',x)[0])

In [14]:
data['ubicacion'].isna().sum()

1854

# **LabelEncoder**
---

In [15]:
cols=['gender']
encoder=OrdinalEncoder()
data[cols]=encoder.fit_transform(data[cols])

# **Rellenamos nulos**
---

In [16]:
cat_cols=data[[col for col in data.columns if col not in ['tokens','imagen','apart_id','text_clean']]].select_dtypes(include='object').columns
num_cols=data[[col for col in data.columns if col not in ['user_id']]].select_dtypes(exclude='object').columns

imputer=SimpleImputer(strategy='median')
cat_imputer=SimpleImputer(strategy='most_frequent')

data[num_cols]=imputer.fit_transform(data[num_cols])
data[cat_cols]=cat_imputer.fit_transform(data[cat_cols])

In [17]:
data.head()

Unnamed: 0,name,imagen,user_id,apart_id,ubicacion,first_name,gender,rating,sentimiento,pregunta,...,num_adj,text_clean,tokens,prob_ruido,prob_limp,prob_ubi,prob_wf,prob_park,prob_bañ,num_toks
0,Barbara Diaz,https://a0.muscache.com/im/pictures/user/e97d0...,-5345305016665044310,https://www.airbnb.es/rooms/46527977?adults=1&...,6,Barbara,0.0,3.0,0.0,1.0,...,10.0,absolutamente bien lugar camas duras lejos cen...,"[absolutamente, bien, lugar, camas, duras, lej...",0.0,1.0,0.0,0.0,0.0,0.0,62.0
1,Txema,https://a0.muscache.com/im/pictures/user/0c5a3...,7554217637905518485,https://www.airbnb.es/rooms/46527977?adults=1&...,8,Txema,1.0,4.0,0.0,1.0,...,1.0,buenas gestión atención volveré usar casa,"[buenas, gestión, atención, volveré, usar, casa]",0.0,0.0,0.0,0.0,0.0,0.0,6.0
2,Aloe,https://a0.muscache.com/im/pictures/user/4df2c...,2753252670262692822,https://www.airbnb.es/rooms/46527977?adults=1&...,9,Aloe,2.0,1.0,0.0,1.0,...,7.0,empezar mundos corre cobren euros tener wifi p...,"[empezar, mundos, corre, cobren, euros, tener,...",0.0,0.0,0.0,1.0,0.0,0.0,35.0
3,Nichola,https://a0.muscache.com/im/pictures/user/b8437...,5933799744255950932,https://www.airbnb.es/rooms/46527977?adults=1&...,6,Nichola,0.0,5.0,0.0,1.0,...,8.0,situado mejor zona benidorm vistas hermosa pla...,"[situado, mejor, zona, benidorm, vistas, hermo...",0.0,0.0,0.0,0.0,0.0,0.0,28.0
4,Elena,https://a0.muscache.com/im/pictures/user/b8f5e...,3658483447196535546,https://www.airbnb.es/rooms/46527977?adults=1&...,6,Elena,0.0,5.0,0.0,1.0,...,2.0,pasamos estancia agradable gustado,"[pasamos, estancia, agradable, gustado]",0.0,0.0,0.0,0.0,0.0,0.0,4.0


# **Guardamos el dataset limpio**
---

In [18]:
data.to_csv('Dataset_Review_cleaned_v1.csv',index=False)