In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
pd.set_option('display.max_columns', 50)

In [2]:
# Read the training set; three of the string columns are loaded as categoricals.
df = pd.read_csv(
    '../data/train.csv',
    dtype={
        'tipodepropiedad': 'category',
        'ciudad': 'category',
        'provincia': 'category',
    },
)

# Parse `fecha` as a datetime and append its year / month / day as new columns.
df['fecha'] = pd.to_datetime(df['fecha'])
df = df.assign(
    anio=df['fecha'].dt.year,
    mes=df['fecha'].dt.month,
    dia=df['fecha'].dt.day,
)

# The lat / lng columns are dropped from the working frame.
df = df.drop(columns=["lat", "lng"])

In [3]:
# Count the missing values in the two free-text columns.
df[["descripcion", "titulo"]].isna().sum()

descripcion    1619
titulo         5387
dtype: int64

In [4]:
# Replace missing descriptions / titles with the empty string so that the
# text-cleaning functions always receive a str.
df[["descripcion", "titulo"]] = df[["descripcion", "titulo"]].fillna("")

In [5]:
import re
import string
def clean_text_round1(text):
    '''Lowercase the text and strip HTML tags, '&' / ';' characters and 'ñ'.

    Steps, in order:
      1. lowercase the text;
      2. delete anything shaped like an HTML tag (``<...>``);
      3. delete literal runs of the form ``[...¿]%;``;
      4. delete every ``&`` and ``;`` (an entity such as ``&aacute;`` is
         therefore left behind as the bare word ``aacute``);
      5. replace ``ñ`` with ``n``.

    Punctuation and words containing digits are NOT removed: those two steps
    are currently commented out below.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    '''
    text = text.lower()
    text = re.sub(r'<[^>]*>', '', text)
    # Raw string: '\[', '\]' and '\%' are regex escapes, not valid Python string
    # escapes, and a non-raw literal triggers an invalid-escape warning.
    text = re.sub(r'\[.*?¿\]\%;', '', text)
    text = text.replace('&', '')
    text = text.replace(';', '')
    #text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('ñ', 'n')
    #text = re.sub(r'\w*\d\w*', ' ', text)
    return text
 
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Replace leftover typographic characters and ``nbsp`` residue with spaces.

    Curly quotes, ellipsis, guillemets, newlines and the ``\\x95`` character are
    each turned into a single space. The literal ``nbsp`` (what remains of an
    ``&nbsp;`` entity once ``&`` and ``;`` have been stripped) is also turned
    into a space: a non-breaking space separates words, so deleting it outright
    would glue the neighbouring words together.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    '''
    text = re.sub('[‘’“”…«»\n\x95]', ' ', text)
    text = text.replace('nbsp', ' ')
    return text

def sacar_acute(text):
    """Drop the substrings 'acute' / 'tilde' and un-accent lowercase vowels.

    Every occurrence of ``acute`` and then of ``tilde`` is deleted, after which
    each of á, é, í, ó, ú is replaced by the corresponding plain vowel.

    Args:
        text: the string to process.

    Returns:
        The processed string.
    """
    stripped = text.replace('acute', '').replace('tilde', '')
    for accented, plain in (('á', 'a'), ('é', 'e'), ('í', 'i'), ('ó', 'o'), ('ú', 'u')):
        stripped = stripped.replace(accented, plain)
    return stripped

def limpiar_texto(text):
    """Run the whole cleaning pipeline on ``text``.

    Applies, in order, ``clean_text_round1``, ``clean_text_round2`` and
    ``sacar_acute``, and returns the resulting string.
    """
    cleaned = clean_text_round1(text)
    cleaned = clean_text_round2(cleaned)
    return sacar_acute(cleaned)

In [6]:
# Clean every description element-wise with the pipeline defined above.
df["descripcion"] = df["descripcion"].map(limpiar_texto)

In [7]:
# Spot-check one cleaned description (row label 218827).
df.loc[218827, "descripcion"]

'estrene departamento 117 m2 (141 m2 totales), exterior, en 3er piso, con elevador, balcon (8 m2) con vista a la calle, estancia sala-comedor, cocina integral abierta con barra de granito, cuarto de lavado integrado, recamara principal con balcon, walk in closet y bano, 2 recamaras junior con closet (una con balcon interior) , bano completo, 2 cajones de estacionamiento (24 m2) independientes y techados. condominio de 14 deptos, elevador, caseta de vigilancia y roof garden comun con asador, tarja y medio bano. ver video en youtube buscando: estrene departamento 117 m2 en col. del valle. http://youtu.be/miugyaihx9u  y .  aceptamos creditos bancarios e infonavit.'

# Preprocess

In [8]:
import sys
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords

In [9]:
download('punkt')  # tokenizer data, only needs to run once
download('stopwords')  # stop-word lists, only needs to run once

# df["descripcion"] has already been passed through limpiar_texto (lowercased,
# accents and ñ stripped), so the stop words get the same normalisation here.
# Without it, accented entries such as 'más' or 'también' could never match a
# token of the cleaned text and would silently survive the stop-word filter.
stop_words = [limpiar_texto(word) for word in stopwords.words('spanish')]

[nltk_data] Downloading package punkt to /Users/tmacia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tmacia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Peek at the first ten stop words and report how many there are in total.
display(stop_words[:10])
print(f'Hay {len(stop_words)} stop words')

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se']

Hay 313 stop words


In [14]:
def preprocess(text):
    """Tokenise ``text`` and keep only alphabetic tokens that are not stop words.

    The text is lowercased, split with NLTK's ``word_tokenize`` and filtered
    against the module-level ``stop_words`` collection.

    Args:
        text: the document to tokenise.

    Returns:
        A list of tokens, in their original order.
    """
    # Build a set once per document: each membership test is then O(1) instead
    # of a linear scan of the stop-word list for every single token.
    excluded = frozenset(stop_words)
    tokens = word_tokenize(text.lower())
    # str.isalpha() discards numbers, punctuation and mixed tokens such as 'm2'.
    return [token for token in tokens if token not in excluded and token.isalpha()]

In [15]:
# Tokenise every description: `corpus` holds one token list per element of `texts`.
texts = df["descripcion"]
corpus = list(map(preprocess, texts))

# Word2Vec

In [17]:
from gensim.models import Word2Vec

In [18]:
import time

# Build the Word2Vec model on the tokenised descriptions and time the call.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` — confirm the installed version.
start_time = time.time()
model = Word2Vec(
        corpus,
        size=150,
        window=10,
        min_count=2,
        workers=10,
        iter=10)


print("--- %s seconds ---" % (time.time() - start_time))

--- 82.08875918388367 seconds ---


In [19]:
start_time = time.time()

# NOTE(review): the Word2Vec(...) call in the previous cell was already given
# `corpus` and iter=10, so the model appears to be trained at this point; this
# call would then run 10 further epochs over the same corpus. Confirm that the
# extra training pass is intended.
model.train(corpus, total_examples=len(corpus), epochs=10)

print("--- %s seconds ---" % (time.time() - start_time))

--- 80.94431018829346 seconds ---


In [20]:
# Nearest neighbours of "garage" in the learned embedding space.
model.wv.most_similar(positive=["garage"])

[('garaje', 0.9489434361457825),
 ('cochera', 0.7315773963928223),
 ('estacionamiento', 0.7037502527236938),
 ('porch', 0.6791614294052124),
 ('garge', 0.595940351486206),
 ('grage', 0.5898770689964294),
 ('cohera', 0.5874559283256531),
 ('chochera', 0.5522443056106567),
 ('cocgera', 0.5011465549468994),
 ('estacionamientopara', 0.49807748198509216)]

In [21]:
# Nearest neighbours of "depto" in the learned embedding space.
model.wv.most_similar(positive=["depto"])

[('departamento', 0.7799702286720276),
 ('depa', 0.7091067433357239),
 ('dpto', 0.6181997060775757),
 ('ph', 0.612342357635498),
 ('deptos', 0.600521981716156),
 ('edif', 0.5557106137275696),
 ('dep', 0.5469179153442383),
 ('habitables', 0.5370544195175171),
 ('penthouse', 0.5134924650192261),
 ('quinto', 0.4916203022003174)]

In [22]:
# Nearest neighbours of "country" in the learned embedding space.
model.wv.most_similar(positive=["country"])

[('raqueta', 0.6460662484169006),
 ('delfos', 0.6314917802810669),
 ('sonoma', 0.6269243955612183),
 ('mundet', 0.6119074821472168),
 ('jockey', 0.6099569797515869),
 ('libanes', 0.6079738140106201),
 ('golf', 0.6058748364448547),
 ('hipico', 0.6046826243400574),
 ('chivas', 0.5915467143058777),
 ('britania', 0.5851802825927734)]

In [24]:
# Nearest neighbours of "privado" in the learned embedding space.
model.wv.most_similar(positive=["privado"])

[('cerrado', 0.6715835332870483),
 ('serrado', 0.5468931198120117),
 ('viviendistico', 0.5302548408508301),
 ('exclusivo', 0.5170251131057739),
 ('vogilancia', 0.5076881051063538),
 ('errado', 0.47704222798347473),
 ('veh', 0.4702994227409363),
 ('tezahuapan', 0.4683929681777954),
 ('morera', 0.464046448469162),
 ('llamare', 0.45714154839515686)]

In [25]:
# Nearest neighbours of "barrio" in the learned embedding space.
model.wv.most_similar(positive=["barrio"])

[('historica', 0.5810374617576599),
 ('callejon', 0.5516899228096008),
 ('poblado', 0.5505751967430115),
 ('pintoresco', 0.5281222462654114),
 ('ahuizotla', 0.5202915072441101),
 ('congregacion', 0.5175260901451111),
 ('antigua', 0.5161295533180237),
 ('localidad', 0.5095129013061523),
 ('famoso', 0.5086329579353333),
 ('catedral', 0.5079529881477356)]

In [40]:
def document_vector(word2vec_model, doc):
    """Average the word vectors of ``doc`` into a single document vector.

    Args:
        word2vec_model: a keyed-vectors object (e.g. ``model.wv``). It must
            expose ``vector_size``, support indexing with a list of words, and
            provide its vocabulary as either ``key_to_index`` (gensim >= 4) or
            ``vocab`` (gensim 3.x).
        doc: iterable of tokens.

    Returns:
        The element-wise mean of the vectors of the in-vocabulary tokens, or a
        float32 zero vector of length ``vector_size`` when no token of ``doc``
        is in the vocabulary.
    """
    # gensim 4 removed `.vocab` in favour of `.key_to_index`; accept either so
    # the function keeps working across the upgrade.
    known_words = getattr(word2vec_model, "key_to_index", None)
    if known_words is None:
        known_words = word2vec_model.vocab

    # Out-of-vocabulary words have no vector and are ignored.
    in_vocab = [word for word in doc if word in known_words]
    if not in_vocab:
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)

    return np.mean(word2vec_model[in_vocab], axis=0)

In [41]:
def fit(corpus, word2vec_model):
    """Turn every tokenised document of ``corpus`` into its document vector.

    Args:
        corpus: iterable of token lists.
        word2vec_model: keyed-vectors object handed to ``document_vector``.

    Returns:
        A list with one vector per document, in corpus order.
    """
    vectors = []
    for tokens in corpus:
        vectors.append(document_vector(word2vec_model, tokens))
    return vectors

In [42]:
# One averaged word vector per description, computed from the trained embeddings.
fitted_corpus = fit(corpus=corpus, word2vec_model=model.wv)

In [51]:
# One row per description, one column per embedding dimension, named
# "descripcion-feature-<i>". The frame reuses df's index so that the later
# df.join(pepe) lines the rows up even if df no longer has a default index.
pepe = pd.DataFrame(np.array(fitted_corpus), index=df.index).add_prefix("descripcion-feature-")

In [55]:
# Attach the embedding features to the original rows (index-aligned left join).
merged = df.join(pepe, how="left")

In [61]:
# Persist the original columns together with the embedding features (row index omitted).
merged.to_csv('../data/words.csv', index=False)

In [65]:
# Persist only the embedding features (row index omitted).
pepe.to_csv('../data/word-features.csv', index=False)

# Reducción de dimensiones con PCA

In [66]:
from sklearn.decomposition import PCA

# Project the embedding features onto 25 principal components; the fixed
# random_state is passed so the decomposition is repeatable.
pca = PCA(n_components=25, whiten = False, random_state = 42)
pepe_pca = pca.fit_transform(pepe)

In [68]:
# Sanity check: expect one row per row of `pepe` and 25 columns.
pepe_pca.shape

(240000, 25)

In [71]:
# Wrap the PCA output in a DataFrame whose columns are named
# "descripcion-feature-<i>". np.asarray keeps the cell safe to re-run once
# pepe_pca is already a DataFrame.
pepe_pca = pd.DataFrame(
    np.asarray(pepe_pca),
    columns=[f"descripcion-feature-{i}" for i in range(pepe_pca.shape[1])],
)
pepe_pca


Unnamed: 0,descripcion-feature-0,descripcion-feature-1,descripcion-feature-2,descripcion-feature-3,descripcion-feature-4,descripcion-feature-5,descripcion-feature-6,descripcion-feature-7,descripcion-feature-8,descripcion-feature-9,descripcion-feature-10,descripcion-feature-11,descripcion-feature-12,descripcion-feature-13,descripcion-feature-14,descripcion-feature-15,descripcion-feature-16,descripcion-feature-17,descripcion-feature-18,descripcion-feature-19,descripcion-feature-20,descripcion-feature-21,descripcion-feature-22,descripcion-feature-23,descripcion-feature-24
0,-11.692229,2.503089,0.519673,1.323910,-1.039441,-1.040019,-2.780203,-0.363929,-1.528654,1.422107,1.756427,-0.467452,-0.282842,-0.118984,0.237997,1.189590,-0.400355,0.528403,-0.267933,0.063858,0.199827,0.605335,-0.159274,-0.182673,0.098831
1,-3.788801,0.484829,1.150321,0.877703,-1.844381,3.245395,0.817782,0.693933,0.380544,-1.052826,-0.288084,-0.633723,-1.391791,-1.388864,-0.082455,-0.280055,-1.206187,-0.591297,1.061093,0.069538,-0.709585,-0.157409,-0.199224,-0.197963,0.198450
2,4.662774,-1.252783,1.053544,-1.691091,-0.615319,-0.172887,0.345634,-1.291938,-0.488919,1.005102,-0.811115,0.696495,-0.339239,0.473141,-0.097444,-0.483784,-0.206538,0.331479,0.599825,-0.153530,-0.170633,0.299281,0.460262,0.225551,-0.143891
3,7.633575,1.505060,0.168435,-1.354925,-0.708616,-1.282041,2.290100,-1.667372,-0.711799,1.035094,0.934204,2.817818,-0.018554,-1.038684,0.691927,-0.995883,-0.294871,-0.309800,-0.370264,-0.600472,-0.006817,-0.389981,0.200636,0.044455,-0.044037
4,5.814485,-0.401020,0.492635,3.736059,-0.560138,3.782986,-0.949182,-1.887627,1.627066,0.777200,0.760332,0.418681,2.172269,2.563909,-0.040549,1.485764,-0.675824,1.515152,-0.493995,0.866324,-0.158946,0.434376,0.985617,-0.238136,-0.876546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239995,5.483692,2.678686,0.174989,-1.128349,0.369795,-0.115040,1.485210,-0.388147,-0.520465,0.374325,1.132236,1.639508,-0.182483,-0.568452,0.148077,-0.432541,0.108576,0.065108,0.195627,-0.429562,-0.389201,-0.159119,0.364816,-0.567372,0.226763
239996,0.449857,-0.201603,-0.043488,-0.748349,-2.761877,-2.842580,-1.104331,-1.900512,1.209107,-0.559228,-0.177589,0.698229,-0.211771,0.048650,-0.505233,0.912989,-0.812685,0.286128,-0.551395,0.331244,-0.091522,-0.195466,0.176563,0.458120,0.987334
239997,-3.796219,-0.624134,1.269743,4.204318,0.731148,-0.523924,1.339439,-0.306905,0.376435,-0.225971,0.161891,0.078221,0.232305,0.557964,-0.101884,0.048601,0.181539,-0.325622,0.283411,0.460983,-0.326412,-0.050925,0.392701,0.018401,0.460700
239998,-1.655307,-0.358587,-1.575439,1.367727,2.334899,1.504091,-1.470662,1.578868,0.217903,2.403297,-1.415408,0.025421,0.075577,-2.157093,1.152074,-0.490554,1.023452,0.037346,-1.436479,0.464723,1.014884,0.320749,-0.596132,0.033628,-0.114206


In [73]:
# Persist the PCA-reduced features (row index omitted).
pepe_pca.to_csv('../data/word-features-reduced.csv', index=False)