In [1]:
import pandas as pd
import numpy as np
#import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('spanish')
from functions import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Import MultinomialNB
from sklearn.naive_bayes import MultinomialNB

### SIMPLE RECOMMENDATION ENGINE WITH COUNT VECTORIZER
### 2 NGRAMS FOR DESCRIPTION

First we import the original database (we should have saved a copy before all cleaning, sorry about that)

In [2]:
train = pd.read_csv('../datasets/henry_lab_disk/properties_colombia_train.csv', sep = ',')
train.drop('id', axis=1, inplace=True)
train.rename(columns={'Unnamed: 0':'id'}, inplace=True)
train.set_index('id', inplace=True)

We do the usual cleaning we do in train and test set

In [3]:
train['target'] = np.where(train['price'] > train['price'].mean(), 1, 0)
train = train[['title', 'description', 'target']]
train['target'].fillna(0, inplace=True)
train['description'].fillna(' ', inplace=True)
train['title'].fillna(' ', inplace=True)
train = trim_all_columns(train)
train['title'] = normalize_column(train, 'title')
train['description'] = normalize_column(train, 'description')
train['title'] = train['title'].str.lower().str.strip()
train['description'] = train['description'].str.lower().str.strip()
pattern = '|'.join(['\n','\r', '\t' ,'\xa0','\u200b',','])
train['title'] = clean_values(train['title'], pattern, value=' ')
train['description'] = clean_values(train['description'], pattern, value=' ')
pattern2 = '|'.join(['_', '[(|)]', '-',':',';'])
train['title'] = clean_values(train['title'], pattern2, regex = True, value=' ')
train['description'] = clean_values(train['description'], pattern2, regex = True, value=' ')
train['title'] = clean_values(train['title'], r"\<.*?\>", regex = True, value=' ')
train['description'] = clean_values(train['description'], pattern2, regex = True, value=' ')
train['title'] = clean_values(train['title'], r"\{.*?\}", regex = True, value=' ')
train['description'] = clean_values(train['description'], pattern2, regex = True, value=' ')
train['title'] = train['title'].str.replace(' +',' ', regex=True)
train['description'] = train['description'].str.replace(' +',' ', regex = True)
train['description'] = train['description'].str.replace('br / ','',regex = False)
train['description'] = train['description'].str.replace('/b','',regex = False)
train['description'] = train['description'].str.replace(' br ','',regex = False)
train['description'] = train['description'].str.replace(' b ','',regex = False)
train['description'] = train['description'].str.replace('&aacute ','a',regex = False)
train['description'] = train['description'].str.replace('&eacute ','e',regex = False)
train['description'] = train['description'].str.replace('&iacute ','i',regex = False)
train['description'] = train['description'].str.replace('&oacute ','o',regex = False)
train['description'] = train['description'].str.replace('&uacute ','u',regex = False)
train['description'] = train['description'].str.replace('&ntilde ','ñ',regex = False)
train['description'] = train['description'].str.replace('ref#\d+','',regex = True)
train['description'] = train['description'].str.replace('!!!','',regex = False)

Let's get hands on, and CountVectorize everything

https://towardsdatascience.com/basics-of-countvectorizer-e26677900f9c

In [4]:
# Create CountVectorizer object
vectorizer = CountVectorizer(strip_accents='ascii', stop_words=stopwords, lowercase=False, ngram_range=(1,2))

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train['description'], 
train['target'], test_size=0.2,stratify=train['target'], random_state = 1234)

# Generate training Bow vectors
X_train_bow = vectorizer.fit_transform(X_train)
# Generate test BoW vectors
X_test_bow = vectorizer.transform(X_test)

# Create MultinomialNB object
clf = MultinomialNB()
# Train clf
clf.fit(X_train_bow, y_train)
# Compute accuracy on test set
accuracy = clf.score(X_test_bow, y_test)

print("The accuracy of the classifier on the test set is %.3f" % accuracy)



The accuracy of the classifier on the test set is 0.894


In [5]:
test = pd.read_csv('../datasets/henry_lab_disk/properties_colombia_test.csv', sep = ',')
test.drop('id', axis=1, inplace=True)
test.rename(columns={'Unnamed: 0':'id'}, inplace=True)
test.set_index('id', inplace=True)
test = test[['title', 'description']]
test['description'].fillna(' ', inplace=True)
test['title'].fillna(' ', inplace=True)

In [6]:
# Predict the sentiment of a expensive sale
review1 = test['description'].sample().values
prediction = clf.predict(vectorizer.transform([str(review1)]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))
review1

The sentiment predicted by the classifier is 1


array(['Excelente oportunidad para comercio zonal, metropolitano, servicios profesionales, y dotacionales, en inmueble con amplios espacios y excelente ubicación en la Plazoleta de la Rebeca donde confluyen las principales arterias de la ciudad como la calle 26 y carreras 10 y 13.'],
      dtype=object)

In [7]:
# Predict the sentiment of a expensive sale
review2 = test['description'].sample().values
prediction = clf.predict(vectorizer.transform([str(review2)]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))
review2

The sentiment predicted by the classifier is 1


array(['<b>PR 11745. SE ARRIENDA APARTAMENTO EN SECTOR DE LA LOMA DE LAS BRUJAS, ENVIGADO</b><br><br>PR 11745. Apartamento en unidad cerrada sector las brujas, ambiente campestre, tranquilo, de poco flujo vehicular. Para estrenar. Cuenta con piso en porcelanato y madera, sala y comedor independiente, estar de tv, 3 alcobas con ba&ntilde;o en la principal, cocina integral tipo americano, alcoba y ba&ntilde;o de servicio, balc&oacute;n, terraza, y parqueaderos independientes cubiertos. Piscina, gym, sauna, parque infantil, cancha de squash, zonas verdes, vigilancia 24 horas, citofonia y circuito cerrado de tv.<br /><br><br> Características adicionales: <br>  <br><br> Ref#633005.'],
      dtype=object)

In [8]:
test.shape[0]

65850

In [9]:
predictions = []
for i in range(test.shape[0]):
    prediction = clf.predict(vectorizer.transform([str(test.iloc[i,1])]))[0]
    predictions.append(prediction)

len(predictions)

65850

In [10]:
df = pd.DataFrame(predictions, columns=['target'])
df.to_csv('./data/predictions/predictions_nlp_recommendation_v2.csv', index=False)