In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
pd.set_option('display.max_columns', 50)

In [2]:
# Load the training listings, parse the publication date into year/month/day
# columns, and discard the coordinate columns, all in one chained expression.
df = (
    pd.read_csv(
        '../data/train.csv',
        dtype={'tipodepropiedad':'category','ciudad':'category','provincia':'category'},
    )
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))
    .assign(
        anio=lambda frame: frame['fecha'].dt.year,
        mes=lambda frame: frame['fecha'].dt.month,
        dia=lambda frame: frame['fecha'].dt.day,
    )
    .drop(columns=["lat","lng"])
)

In [3]:
# Count missing values in the two free-text columns before they are filled.
df[["descripcion", "titulo"]].isna().sum()

descripcion    1619
titulo         5387
dtype: int64

In [4]:
# Replace missing free-text values with empty strings so the string-cleaning
# functions can be applied to every row.
df = df.fillna({"descripcion": "", "titulo": ""})

In [5]:
import re
import string
def clean_text_round1(text):
    '''Lowercase the text and strip HTML markup remnants.

    Steps, applied in order:
      1. lowercase everything;
      2. drop anything shaped like an HTML tag (an opening angle bracket up
         to the next closing one);
      3. drop any span that opens with a square bracket and ends with an
         inverted question mark, a closing square bracket, a percent sign
         and a semicolon;
      4. delete every ampersand and semicolon (what is left of HTML
         entities keeps its name, e.g. "amp");
      5. replace "ñ" with "n".

    Punctuation and tokens containing digits are intentionally left alone.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    '''
    text = text.lower()
    # Raw strings keep the regex backslashes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\[.*?¿\]\%;', '', text)
    # Plain single-character substitutions do not need the regex engine.
    text = text.replace('&', '').replace(';', '').replace('ñ', 'n')
    return text
 
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: blank out typographic quotes, ellipsis and
    guillemets, newlines and the 0x95 control character, then delete the
    literal "nbsp" left behind by non-breaking-space entities.'''
    substitutions = (
        ('[‘’“”…«»]', ' '),
        ('\n', ' '),
        ('\x95', ' '),
        ('nbsp', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def sacar_acute(text):
    """Remove the literal fragments "acute" and "tilde", then replace the
    lowercase acute-accented vowels with their plain counterparts.

    Uppercase accented vowels are not touched.
    """
    for fragment in ('acute', 'tilde'):
        text = re.sub(fragment, '', text)

    accent_pairs = (
        ('á', 'a'),
        ('é', 'e'),
        ('í', 'i'),
        ('ó', 'o'),
        ('ú', 'u'),
    )
    for accented, plain in accent_pairs:
        text = re.sub(accented, plain, text)
    return text

def limpiar_texto(text):
    """Run the full cleaning pipeline on one string.

    Applies clean_text_round1, then clean_text_round2, then sacar_acute,
    and returns the result.
    """
    text = clean_text_round1(text)
    text = clean_text_round2(text)
    return sacar_acute(text)

In [6]:
# Clean every description with limpiar_texto; the column is overwritten with the cleaned text.
df["descripcion"] = df["descripcion"].apply(limpiar_texto)

In [7]:
# Spot-check one cleaned description (row label 218827).
df["descripcion"].loc[218827]

'estrene departamento 117 m2 (141 m2 totales), exterior, en 3er piso, con elevador, balcon (8 m2) con vista a la calle, estancia sala-comedor, cocina integral abierta con barra de granito, cuarto de lavado integrado, recamara principal con balcon, walk in closet y bano, 2 recamaras junior con closet (una con balcon interior) , bano completo, 2 cajones de estacionamiento (24 m2) independientes y techados. condominio de 14 deptos, elevador, caseta de vigilancia y roof garden comun con asador, tarja y medio bano. ver video en youtube buscando: estrene departamento 117 m2 en col. del valle. http://youtu.be/miugyaihx9u  y .  aceptamos creditos bancarios e infonavit.'

# Preprocess

In [8]:
import sys
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords

In [9]:
# Fetch the NLTK resources used in this section.
download('punkt') #tokenizer, run once
download('stopwords') #stopwords dictionary, run once
# Spanish stop-word list; kept as a plain list because the next cell slices it for display.
stop_words = stopwords.words('spanish')

[nltk_data] Downloading package punkt to /home/peter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/peter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Show the first ten stop words and report the size of the list.
display(stop_words[:10])
print(f'Hay {len(stop_words)} stop words')

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se']

Hay 313 stop words


In [11]:
def preprocess(text, stopword_set=None):
    """Tokenise a description and keep only alphabetic, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Text to tokenise; it is lowercased first.
    stopword_set : collection of str, optional
        Words to discard. Defaults to the notebook-level ``stop_words``
        list, converted to a frozenset so each membership test is a hash
        lookup instead of a scan over the whole list.

    Returns
    -------
    list of str
        Tokens, in original order, that are not stop words and consist
        solely of alphabetic characters.

    NOTE(review): in this notebook the descriptions have their accents
    stripped (see sacar_acute) before they reach this function, while
    ``stop_words`` comes straight from NLTK. Any accented entries in that
    list would therefore never match; confirm whether the stop-word list
    should be normalised the same way.
    """
    if stopword_set is None:
        stopword_set = frozenset(stop_words)

    tokens = word_tokenize(text.lower())
    # Single pass: same result as filtering stop words first and
    # non-alphabetic tokens second.
    return [
        token
        for token in tokens
        if token not in stopword_set and token.isalpha()
    ]

In [12]:
# Tokenise every cleaned description; `corpus` holds one token list per row of df.
texts = df["descripcion"]

corpus = [preprocess(text) for text in texts]

# Word2Vec

In [13]:
from gensim.models import Word2Vec

In [14]:
import time

# Fit a Word2Vec embedding on the tokenised descriptions and time the run.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; presumably
# this notebook targets gensim < 4. Confirm before upgrading the dependency.
# NOTE(review): no seed is passed, so results may differ between runs.
start_time = time.time()
model = Word2Vec(
        corpus,
        size=150,
        window=10,
        min_count=2,
        workers=10,
        iter=10)


print("--- %s seconds ---" % (time.time() - start_time))

--- 98.12273931503296 seconds ---


In [15]:
start_time = time.time()

# NOTE(review): the constructor in the previous cell already received `corpus`
# with iter=10; per the gensim documentation that builds the vocabulary and
# trains immediately, so this call appears to run 10 further epochs on the
# same data. Confirm the extra training is intended.
model.train(corpus, total_examples=len(corpus), epochs=10)

print("--- %s seconds ---" % (time.time() - start_time))

--- 95.36427116394043 seconds ---


In [16]:
# Sanity check: words closest to "garage" in the learned embedding.
model.wv.most_similar(positive="garage")

[('garaje', 0.9522410035133362),
 ('cochera', 0.7278991341590881),
 ('porch', 0.7048817873001099),
 ('estacionamiento', 0.7040362358093262),
 ('grage', 0.652358889579773),
 ('cohera', 0.5909550189971924),
 ('garge', 0.53059983253479),
 ('chochera', 0.5106399059295654),
 ('estacionamineto', 0.5084848999977112),
 ('estacionamientopara', 0.4927503764629364)]

In [17]:
# Sanity check: words closest to "depto" in the learned embedding.
model.wv.most_similar(positive="depto")

[('departamento', 0.7702507376670837),
 ('depa', 0.7025709748268127),
 ('dpto', 0.6111317873001099),
 ('ph', 0.6067948341369629),
 ('deptos', 0.603897750377655),
 ('dep', 0.5814424157142639),
 ('habitables', 0.5566142201423645),
 ('noveno', 0.5261161923408508),
 ('quinto', 0.519364595413208),
 ('edif', 0.4994807541370392)]

In [18]:
# Sanity check: words closest to "country" in the learned embedding.
model.wv.most_similar(positive="country")

[('mundet', 0.651491641998291),
 ('sonoma', 0.631123423576355),
 ('france', 0.615787148475647),
 ('hipico', 0.6134679317474365),
 ('libanes', 0.6051686406135559),
 ('raqueta', 0.5994173884391785),
 ('juniors', 0.5922484993934631),
 ('golf', 0.5856401920318604),
 ('delfos', 0.5819019079208374),
 ('chivas', 0.5801526308059692)]

In [19]:
# Sanity check: words closest to "privado" in the learned embedding.
model.wv.most_similar(positive="privado")

[('cerrado', 0.6817795634269714),
 ('salvaterra', 0.5889965891838074),
 ('exclusivo', 0.5420188903808594),
 ('errado', 0.5303549766540527),
 ('cerrdo', 0.503443717956543),
 ('bonterra', 0.4968777298927307),
 ('serrado', 0.486223042011261),
 ('cerradode', 0.4860995411872864),
 ('veh', 0.4821377098560333),
 ('cerrrado', 0.4788877069950104)]

In [20]:
# Sanity check: words closest to "barrio" in the learned embedding.
model.wv.most_similar(positive="barrio")

[('casona', 0.5732358694076538),
 ('historica', 0.5605082511901855),
 ('poblado', 0.5450676679611206),
 ('callejon', 0.542628824710846),
 ('antigua', 0.5373273491859436),
 ('xico', 0.5216360092163086),
 ('atzompa', 0.5201309323310852),
 ('pintoresco', 0.5177381634712219),
 ('analco', 0.5157912373542786),
 ('magdalena', 0.5126667022705078)]

In [21]:
def document_vector(word2vec_model, doc):
    """Average the word vectors of a tokenised document.

    Parameters
    ----------
    word2vec_model : gensim KeyedVectors-like object
        Must support ``word2vec_model[list_of_words]`` and expose
        ``vector_size`` plus a vocabulary mapping, either ``key_to_index``
        (gensim >= 4) or ``vocab`` (gensim 3.x).
    doc : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the in-vocabulary tokens, or a float32 zero
        vector of length ``vector_size`` when no token is in the vocabulary.
    """
    # gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
    # look for the new attribute first and fall back to the legacy one so
    # the function works on either version.
    known_words = getattr(word2vec_model, "key_to_index", None)
    if known_words is None:
        known_words = word2vec_model.vocab

    # Out-of-vocabulary words have no vector and are ignored.
    in_vocab = [word for word in doc if word in known_words]

    if not in_vocab:
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)

    return np.mean(word2vec_model[in_vocab], axis=0)

In [22]:
def fit(corpus, word2vec_model):
    """Turn each tokenised document in `corpus` into its document vector.

    Returns a list with one document_vector result per document, in the
    same order as `corpus`.
    """
    vectors = []
    for tokens in corpus:
        vectors.append(document_vector(word2vec_model, tokens))
    return vectors

In [23]:
# Build one averaged vector per description from the trained word vectors.
fitted_corpus = fit(corpus, model.wv)

In [24]:
# One row per description, one column per embedding dimension, with the
# columns labelled descripcion-feature-0, descripcion-feature-1, ...
pepe = pd.DataFrame(np.array(fitted_corpus))
pepe.columns = [f"descripcion-feature-{position}" for position in range(pepe.shape[1])]

In [25]:
# Attach the description features to the listing rows; DataFrame.join aligns the two frames on their index.
merged = df.join(pepe)

In [26]:
# Persist the listings together with their description features.
merged.to_csv('../data/words.csv', index=False)

In [27]:
# Persist the description features on their own.
pepe.to_csv('../data/word-features.csv', index=False)

# Reducción de dimensiones con PCA

In [28]:
from sklearn.decomposition import PCA

# Compress the description features to 25 principal components; random_state is fixed at 42.
pca = PCA(n_components=25, whiten = False, random_state = 42)
pepe_pca = pca.fit_transform(pepe)

In [29]:
# Check the dimensions of the PCA output.
pepe_pca.shape

(240000, 25)

In [30]:
# Wrap the PCA output in a DataFrame and label the columns by position
# (descripcion-feature-0, descripcion-feature-1, ...), then display it.
pepe_pca = pd.DataFrame(pepe_pca)
pepe_pca.columns = [f"descripcion-feature-{position}" for position in range(pepe_pca.shape[1])]
pepe_pca


Unnamed: 0,descripcion-feature-0,descripcion-feature-1,descripcion-feature-2,descripcion-feature-3,descripcion-feature-4,descripcion-feature-5,descripcion-feature-6,descripcion-feature-7,descripcion-feature-8,descripcion-feature-9,descripcion-feature-10,descripcion-feature-11,descripcion-feature-12,descripcion-feature-13,descripcion-feature-14,descripcion-feature-15,descripcion-feature-16,descripcion-feature-17,descripcion-feature-18,descripcion-feature-19,descripcion-feature-20,descripcion-feature-21,descripcion-feature-22,descripcion-feature-23,descripcion-feature-24
0,-11.673375,2.514868,0.596685,1.268745,-1.060728,-1.375381,-2.786111,-0.354453,-1.505592,1.165114,1.784490,-0.629234,-0.304889,0.012660,-0.729861,0.970246,-0.558811,-0.283312,0.174303,0.218451,0.436543,0.577925,-0.276663,-0.541203,0.346583
1,-3.760964,0.545303,1.066880,0.768181,-1.879376,3.310026,0.593549,0.766504,0.241499,-1.118730,-0.257748,-0.676669,-1.178097,-1.433828,-0.212080,-0.197150,-0.994404,0.705909,-1.192330,-0.210889,-0.747146,0.049909,-0.199831,-0.106250,0.115323
2,4.681644,-1.222615,1.091579,-1.686539,-0.633549,-0.096443,0.335661,-1.274177,-0.440656,1.086566,-0.710657,0.669215,-0.498445,0.409227,0.305229,-0.402773,-0.271333,-0.297129,-0.410064,-0.208316,0.035621,0.024372,0.420333,0.464439,-0.062783
3,7.605429,1.585403,0.247523,-1.417196,-0.849209,-1.078543,2.260079,-1.840012,-0.689938,1.191281,1.298142,2.601511,-0.178064,-0.721092,-0.386796,-1.130221,-0.370611,0.337399,0.354439,-0.631868,-0.195097,-0.224578,0.027137,0.165734,-0.125698
4,6.001575,-0.468368,0.257609,3.826220,-0.360680,3.829142,-1.336377,-1.855867,1.629979,0.792477,0.740615,0.268475,1.912819,2.681912,0.007932,1.257526,-0.971464,-1.138787,0.785725,0.705656,-0.199633,-0.014512,0.950608,0.181803,-1.086842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239995,5.498492,2.615131,0.147471,-1.208071,0.315118,0.045363,1.483148,-0.419483,-0.559118,0.369255,1.272302,1.472780,-0.156121,-0.519967,-0.032402,-0.476526,0.117020,-0.213807,-0.085642,-0.494136,-0.371822,-0.209369,0.504520,-0.375793,-0.307144
239996,0.470947,-0.218856,0.117870,-0.734796,-2.755664,-2.934243,-1.008219,-2.042357,1.140303,-0.435419,-0.091280,0.628354,-0.189932,-0.038500,0.290854,0.912352,-0.865432,-0.082796,0.659839,0.272841,0.000526,0.041256,-0.282689,0.831385,1.064461
239997,-3.780492,-0.531843,1.243909,4.192288,0.719304,-0.343034,1.431752,-0.269820,0.457102,-0.171428,0.188242,0.168959,0.228245,0.534087,0.210920,0.099381,0.218910,0.327907,-0.328235,0.338858,-0.530275,0.098773,0.287881,0.170613,0.277895
239998,-1.728627,-0.293374,-1.516743,1.335670,2.431537,1.404823,-1.636908,1.559298,0.156960,2.581804,-1.246778,0.022390,-0.015740,-2.060264,-1.192309,-0.828239,0.682819,0.173036,1.337896,0.597049,0.529717,0.260998,-0.353029,-0.269668,0.108847


In [31]:
# Persist the PCA-reduced description features.
pepe_pca.to_csv('../data/word-features-reduced.csv', index=False)