In [68]:
import pandas as pd
import numpy as np

### SIMPLE RECOMMENDATION ENGINE USING UNIGRAMS AND BIGRAMS FROM THE DESCRIPTION

In [69]:
# Load the training listings. The file's own `id` column is discarded and the
# unnamed first column is renamed to `id` and used as the index instead.
train = (
    pd.read_csv('./data/original/properties_colombia_train.csv', sep=',')
    .drop(columns='id')
    .rename(columns={'Unnamed: 0': 'id'})
    .set_index('id')
)

In [70]:
# Binary label: 1 when the listing price is strictly above the mean price,
# otherwise 0. A missing price compares as False, so those rows get label 0
# and the label column can never contain NaN (no fill is needed for it).
train['target'] = np.where(train['price'] > train['price'].mean(), 1, 0)

# Keep only the text features and the label, and replace missing text with a
# single space. `DataFrame.fillna` with a per-column mapping returns a new
# frame; the previous `train[col].fillna(..., inplace=True)` form operated on
# a temporary column object and does not update `train` under pandas
# Copy-on-Write.
train = train[['title', 'description', 'target']].fillna(
    {'title': ' ', 'description': ' '}
)

In [71]:
# Import the corpus reader under an alias so the name `stopwords` only ever
# refers to the list of Spanish stop words and does not shadow the reader.
from nltk.corpus import stopwords as nltk_stopwords

# List of Spanish stop words used when building the vectorizer.
stopwords = nltk_stopwords.words('spanish')

In [72]:
from sklearn.feature_extraction.text import CountVectorizer, strip_accents_ascii
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# The vectorizer strips accents from every document (strip_accents='ascii')
# before stop words are filtered, so the stop list has to be normalised the
# same way; otherwise accented stop words would never match a token.
# NOTE(review): lowercase=False is kept as in the original, which means
# capitalised stop words are not filtered - confirm this is intended.
normalized_stopwords = sorted({strip_accents_ascii(word) for word in stopwords})

# Bag-of-words over unigrams and bigrams of the description text.
vectorizer = CountVectorizer(
    strip_accents='ascii',
    stop_words=normalized_stopwords,
    lowercase=False,
    ngram_range=(1, 2),
)

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train['description'],
    train['target'],
    test_size=0.2,
    stratify=train['target'],
    random_state=1234,
)

# Fit the vocabulary on the training split only, then reuse it for the
# held-out split.
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train a multinomial Naive Bayes classifier on the count vectors.
clf = MultinomialNB()
clf.fit(X_train_bow, y_train)

# Mean accuracy on the held-out split.
accuracy = clf.score(X_test_bow, y_test)

print("The accuracy of the classifier on the test set is %.3f" % accuracy)



The accuracy of the classifier on the test set is 0.900


In [79]:
# Load the test listings the same way as the training file: discard the
# file's own `id` column, use the unnamed first column (renamed to `id`) as
# the index, and keep only the two text columns (title first, description
# second).
# Missing text is replaced with a single space via a per-column `fillna`
# that returns a new frame; the previous `test[col].fillna(..., inplace=True)`
# form operated on a temporary column object and does not update `test`
# under pandas Copy-on-Write.
test = (
    pd.read_csv('./data/original/properties_colombia_test.csv', sep=',')
    .drop(columns='id')
    .rename(columns={'Unnamed: 0': 'id'})
    .set_index('id')
    .loc[:, ['title', 'description']]
    .fillna({'title': ' ', 'description': ' '})
)

In [80]:
# Spot-check the classifier on one randomly drawn test description.
# The predicted label is 1 when the model considers the listing priced above
# the training mean, and 0 otherwise. The draw is unseeded, so each run picks
# a different listing.
review1 = test['description'].sample().values
# Vectorize the description text itself. Calling str() on the whole array
# would feed the array's repr (brackets, quotes, escaped newlines) to the
# vectorizer instead of the listing text.
prediction = clf.predict(vectorizer.transform([str(text) for text in review1]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))
review1

The sentiment predicted by the classifier is 0


array(['OPORTUNIDAD| CASA LOTE EN VENTA Consta de. Sala comedor| cocina convencional| baño| patio. Opción de construir a gusto| terreno ideal  para uso comercial o bodega.Entorno Relevante. Centro comercial Centro Suba| Cai de Aures| D1| cerca a  vía principal| Parque Gloria Lara| Colegios cercanos.Barrios Colindantes. La Estanzuela| Aures| El Rosal| Lagos de Suba| San Jorge.'],
      dtype=object)

In [81]:
# Spot-check the classifier on a second randomly drawn test description.
# The predicted label is 1 when the model considers the listing priced above
# the training mean, and 0 otherwise. The draw is unseeded, so each run picks
# a different listing.
review2 = test['description'].sample().values
# Vectorize the description text itself. Calling str() on the whole array
# would feed the array's repr (brackets, quotes, escaped newlines) to the
# vectorizer instead of the listing text.
prediction = clf.predict(vectorizer.transform([str(text) for text in review2]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))
review2

The sentiment predicted by the classifier is 1


array(['Hermosa Casa en Venta, diseñada y ubicada estratégicamente dentro de un lote de 1.303mt2, construcción de estilo rustico de un piso  mas guardilla que tiene salida a terraza superior, área construida de 150mt2. Tiene tres alcobas, la principal con baño, vestier y salida a una de las tres  terrazas que tiene esta acogedora casa. Cocina integral abierta, sala y comedor con con salida a otra terraza y zona de asados. Amplia e iluminada su diseño permite conexión constante con la naturaleza por sus amplios ventanales con vista a las montañas y a sus zonas verdes con arboles frutales, lote tiene pozo de agua, garaje hasta para 4 vehículos y deposito. El condominio tiene club house, cancha de tenis, microfútbol y cancha múltiple, piscina, jacuzzi, senderos ecológicos, 6 lagos y bosque nativo. Tan solo a 4 minutos de la 14 de alfaguara y 15min de Cali. !Ven y conoce tu próximo hogar!'],
      dtype=object)

In [86]:
# Number of rows in the test frame, i.e. how many predictions are expected.
len(test)

65850

In [94]:
# Score every test description in a single batch instead of calling
# transform/predict once per row. The column is selected by name rather than
# by position, and each value is passed through str() exactly as before.
test_descriptions = [str(text) for text in test['description']]
predictions = clf.predict(vectorizer.transform(test_descriptions)).tolist()

# Sanity check: one prediction per test row.
len(predictions)

65850

In [97]:
# Collect the predicted labels in a single `target` column and write them to
# CSV without the positional index.
df = pd.DataFrame({'target': predictions})
df.to_csv(
    './data/predictions/predictions_nlp_recommendation.csv',
    index=False,
)

### ADVANCED NLP WITH SPACY