In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime as dt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

# Silence every warning for the rest of the notebook
warnings.filterwarnings('ignore')

# Use matplotlib's 'default' style for the charts
plt.style.use('default') 
#plt.rcParams['figure.figsize'] = (20, 10)

# Configure the seaborn theme ('ticks' style, font scale 0.9)
sns.set_theme(style='ticks', palette=None, font_scale=.9) 

# Avoid scientific notation in pandas output: floats are shown with
# thousands separators and two decimals
pd.options.display.float_format = '{:20,.2f}'.format

# N5 - Utilizando los textos de las reviews, realizar consultas por texto empleando técnicas de NLP (TF-IDF), de modo que la query “bad smell” devuelva el “business_id” que haya recibido una crítica acerca de una experiencia de malos olores en el local.

In [2]:
review_df_business_text = pd.read_csv('/content/drive/MyDrive/TP_Datos_1C2022/review.csv', usecols=['business_id', 'text'])

In [3]:
review_df_business_text.head()

Unnamed: 0,business_id,text
0,XQfwVwDr-v0ZS3_CbbE5Xw,"If you decide to eat here, just be aware it is..."
1,e4Vwtrqf-wpJfwesgvdgxQ,Cute interior and owner (?) gave us tour of up...
2,cPepkJeRMtHapc_b2Oe_dw,I was really between 3 and 4 stars for this on...
3,bMratNjTG5ZFEA6hVyr-xQ,First time there and it was excellent!!! It fe...
4,bbEXAEFr4RYHLlZ-HFssTA,"Great burgers,fries and salad! Burgers have a..."


In [4]:
review_df_business_text.text.hasnans

False

In [5]:
# Candidate vocabulary size: one tenth of the number of review texts.
# The result is a float because of the multiplication by 0.1.
max_features = .1 * len(review_df_business_text['text'])
max_features

383810.5

Calculé ese valor para usar como `max_features` el 10% de la cantidad de textos.

In [6]:
# Cap the vocabulary at 10% of the number of reviews. `max_features` is computed
# in the previous cell as a float and TfidfVectorizer expects an integer, so it is
# truncated here instead of hard-coding the number copied from that cell's output.
tf_idf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=int(max_features),
)

In [7]:
tf_idf_matrix = tf_idf_vectorizer.fit_transform(review_df_business_text.text)

In [8]:
# Brute-force nearest-neighbour search over the TF-IDF rows using cosine distance.
nbrs_idf = NearestNeighbors(algorithm='brute', metric='cosine')
nbrs_idf.fit(X=tf_idf_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [9]:
# Vectorize the free-text query with the fitted TF-IDF vocabulary and
# retrieve its 3 nearest reviews together with their cosine distances.
query = tf_idf_vectorizer.transform(["bad smell"])
distances, indices = nbrs_idf.kneighbors(X=query, n_neighbors=3, return_distance=True)
print(distances[0])

[0.32317411 0.33595894 0.35273495]


In [10]:
indices[0]

array([2902266, 2357908, 1746647])

In [11]:
# kneighbors() returns row positions within the fitted matrix, so the reviews are
# looked up by position (.iloc) rather than by index label; this stays correct even
# if the DataFrame index is not the default 0..n-1 range.
for ind in indices[0]:
    print(review_df_business_text.text.iloc[ind])

I did not care for the smell of this place. While waiting the smell got so bad my wife and I walked out. The smell was a mix between bad seafood and a weird combination of spices. Not normal seafood smells.
Smell the carpet is disgusting.It is impossible to eat nice by that smell. Very bad attention too.
Went there recently and the entire restaurant stinks. The smell was so bad we couldn't even enjoy the food or experience. I would not go back until they do something  about that smell.


In [12]:
review_df_business_text.iloc[indices[0][0]]

business_id                               0StbKjld9f8GbeDRy0fS0A
text           I did not care for the smell of this place. Wh...
Name: 2902266, dtype: object

Respuesta:

In [13]:
review_df_business_text.business_id.iloc[indices[0][0]]

'0StbKjld9f8GbeDRy0fS0A'

# C4 - Si tomamos una muestra de negocios con más de 10 reviews y todas sus reviews como un texto concatenado, en términos de distancia de compresión normalizada para esos textos, ¿cuáles son los dos negocios más parecidos? Muestre sus textos y su distancia.

In [2]:
business_df = pd.read_csv('/content/drive/MyDrive/TP_Datos_1C2022/business.csv', usecols=['business_id', 'review_count', 'name'])

In [3]:
business_con_mas_de_10_df = business_df[business_df['review_count'] > 10]

In [36]:
business_con_mas_de_10_sample_df = business_con_mas_de_10_df.sample(frac=0.008, replace=False, random_state=1)

In [22]:
review_df = pd.read_csv('/content/drive/MyDrive/TP_Datos_1C2022/review.csv', usecols=['business_id', 'text'])

In [37]:
# Attach every review text to the sampled businesses. 'business_id' is the only
# column both frames share, so it is the join key; 'review_count' is dropped afterwards.
business_review_text_df = (
    business_con_mas_de_10_sample_df
    .merge(review_df, how='inner', on='business_id')
    .drop(columns=['review_count'])
)

In [32]:
business_review_text_df.head()

Unnamed: 0,business_id,name,text
0,4DLIC5fF_QXg61Hwv2BPeQ,Mile End Delicatessen,Such a great addition to downtown. Popped in f...
1,4DLIC5fF_QXg61Hwv2BPeQ,Mile End Delicatessen,Classic NYC style Jewish deli in the Fairlane ...
2,4DLIC5fF_QXg61Hwv2BPeQ,Mile End Delicatessen,If downtown is going to become a pit of bachel...
3,4DLIC5fF_QXg61Hwv2BPeQ,Mile End Delicatessen,The best breakfast we had in Nashville! The po...
4,4DLIC5fF_QXg61Hwv2BPeQ,Mile End Delicatessen,"We love Montreal style bagels, which are crust..."


In [25]:
def joinear(serie):
    """Concatenate the strings of ``serie`` into one string, separated by single spaces."""
    separador = ' '
    return separador.join(serie)

In [38]:
# One concatenated text per business: group the reviews by business_id and join them.
textos_concatenados = (
    business_review_text_df
    .groupby('business_id')['text']
    .agg(joinear)
    .tolist()
)

In [27]:
import gzip

def compress_size(bytefile):
    """Return the number of bytes ``bytefile`` occupies after gzip compression."""
    comprimido = gzip.compress(bytefile)
    return len(comprimido)

def ncd(obj1, obj2):
    """Normalized compression distance between two byte strings, measured with gzip.

    Computes ``(C(obj1 + obj2) - min(C(obj1), C(obj2))) / max(C(obj1), C(obj2))``
    where ``C(x)`` is the length in bytes of ``gzip.compress(x)``.

    Parameters
    ----------
    obj1, obj2 : bytes
        The two objects to compare (callers encode their text before calling).

    Returns
    -------
    float
        The distance; lower means the two inputs compress better together.
    """
    # All three sizes are measured with gzip directly. The module-level name
    # `compress_size` is rebound to a zstandard-based version further down in this
    # notebook, and relying on it here would mix two compressors in one formula.
    len1 = len(gzip.compress(obj1))
    len2 = len(gzip.compress(obj2))
    concat_len = len(gzip.compress(obj1 + obj2))
    return (concat_len - min(len1, len2)) / max(len1, len2)

In [39]:
from tqdm import tqdm

# Encode and gzip every text exactly once up front. Doing it inside the pair loop
# repeats the same work for each of the n-1 pairs a text takes part in.
textos_bytes = [texto.encode() for texto in textos_concatenados]
tamanios = [len(gzip.compress(contenido)) for contenido in textos_bytes]

# distancias_totales[i] holds the NCD from every other text j (in increasing j,
# skipping j == i) to text i. The formula is the same one ncd() implements.
distancias_totales = []
for i in tqdm(range(len(textos_bytes))):
    distancias = []
    for j in range(len(textos_bytes)):
        if i == j:
            continue
        # Same operand order as ncd(texto_j, texto_i): text j followed by text i.
        concat_len = len(gzip.compress(textos_bytes[j] + textos_bytes[i]))
        menor, mayor = sorted((tamanios[i], tamanios[j]))
        distancias.append((concat_len - menor) / mayor)
    distancias_totales.append(distancias)

100%|██████████| 511/511 [46:45<00:00,  5.49s/it]


In [40]:
# For every text, record (its index, index of its nearest neighbour, distance).
aux = []
for ind, elemento in enumerate(distancias_totales):
    posicion = int(np.argmin(elemento))
    # Row `ind` of distancias_totales omits the distance of the text to itself, so
    # positions at or after `ind` are shifted one place to the left. Undo the shift
    # to get an index that is valid in textos_concatenados.
    vecino = posicion if posicion < ind else posicion + 1
    aux.append((ind, vecino, elemento[posicion]))

In [41]:
from operator import itemgetter
min(aux,key=itemgetter(2))

(420, 294, 0.8590177815410669)

In [42]:
# Take the indices of the closest pair from `aux` instead of hard-coding numbers
# copied from a previous output, so the cell stays valid if the sample changes.
ind_1, ind_2, _distancia_minima = min(aux, key=lambda terna: terna[2])
texto_1 = textos_concatenados[ind_1]
texto_2 = textos_concatenados[ind_2]

In [43]:
# Map each concatenated text back to its business_id.
# Grouping by business_id and joining the reviews with ' ' rebuilds the same texts
# that were concatenated earlier, so an exact equality lookup identifies the business.
# Testing whether single reviews are substrings of the text is not reliable: a short
# review can also appear inside another business's concatenated text.
texto_por_negocio = business_review_text_df.groupby('business_id')['text'].agg(' '.join)

# business[0] corresponds to texto_1 and business[1] to texto_2.
business = [
    texto_por_negocio[texto_por_negocio == texto].index[0]
    for texto in (texto_1, texto_2)
]
contador = len(business)

In [44]:
business

['oI_0uod6jeq8fJJu8ySFSA', 'YqCQBJgUZHFFHRX1VgU3Ag']

In [45]:
display(
    business_review_text_df[business_review_text_df['business_id'] == business[0]].head(),
    business_review_text_df[business_review_text_df['business_id'] == business[1]].head()
)

Unnamed: 0,business_id,name,text
4005,oI_0uod6jeq8fJJu8ySFSA,Panda House,Came here on my lunch break.. read reviews abo...
4006,oI_0uod6jeq8fJJu8ySFSA,Panda House,I was raised in NY and Panda House is by far m...
4007,oI_0uod6jeq8fJJu8ySFSA,Panda House,The first thing I noticed when I walked in was...
4008,oI_0uod6jeq8fJJu8ySFSA,Panda House,Take out....\nOrdered Hot And Sour soup.. rece...
4009,oI_0uod6jeq8fJJu8ySFSA,Panda House,Horrible. I had to throw away all my food. Not...


Unnamed: 0,business_id,name,text
7447,YqCQBJgUZHFFHRX1VgU3Ag,Kowloon Restaurant,Very happy to see that they opened their doors...
7448,YqCQBJgUZHFFHRX1VgU3Ag,Kowloon Restaurant,Everything we ordered was really great. Best C...
7449,YqCQBJgUZHFFHRX1VgU3Ag,Kowloon Restaurant,They are open again! \nWe have tried many Chin...
7450,YqCQBJgUZHFFHRX1VgU3Ag,Kowloon Restaurant,They just opened for about a week when I did m...
7451,YqCQBJgUZHFFHRX1VgU3Ag,Kowloon Restaurant,I never got to try this place before the fire ...


Respuesta:

Los dos negocios mas parecidos son: 
*   Kowloon Restaurant
*   Panda House

Sus textos son:

In [46]:
texto_1

'Came here on my lunch break.. read reviews about it on yelp and decided to give it a try.. glad I did.. the food was made to order so it was nice and hot.. staff very friendly and it only took 10 mins.. I order the general tso and noodles and it was amazing plus it came with an egg roll! I was raised in NY and Panda House is by far my favorite NY-style Chinese fast food (you can dine-in too) restaurant in Nashville. I have never been disappointed in the quality of the food, the portions, or the service. They are one of the very few Chinese restaurants in the area who know how to make fried wontons so that they are both crispy on the outside and tender on the inside. The food is never too greasy or heavy with oil or sauce. The first thing I noticed when I walked in was how CLEAN this place looked and smelled! The staff was super fast and friendly ... and the food was fresh, hot and delicious!!!!!! What a pleasant surprise! Take out....\nOrdered Hot And Sour soup.. received WonTon soup!

In [47]:
texto_2

"Very happy to see that they opened their doors again after they remodeled.  We ordered take out, they are only offering take out and delivery currently.  The wait time I was given (and everyone that called while I wasn't waiting ) was 25 min. 45 min later my food was ready.  I live about 5 min away and when I got home all of the food had to reheated. However the food was very good. It wasn't the typical oil drenched American Chinese take out. It wasn't as delicious as food I have had in Chinatown but as far as local spots, this is back to being my go to place. Everything we ordered was really great. Best Chinese food we've found in the area so far. They are open again! \nWe have tried many Chinese restaurants in the area since they closed, but nothing ever came close to Kowloon.  We have ordered twice since they  re-opened, and have enjoyed it both times. They just opened for about a week when I did my Lunar New Year  dinner takeout so they had some hiccups.  I'm sure they will sort i

La distancia es: 

`0.8590177815410669`

#C5

In [3]:
review_text_df = pd.read_csv('/content/drive/MyDrive/TP_Datos_1C2022/review.csv', usecols=['text'])

In [4]:
review_text_df.head()

Unnamed: 0,text
0,"If you decide to eat here, just be aware it is..."
1,Cute interior and owner (?) gave us tour of up...
2,I was really between 3 and 4 stars for this on...
3,First time there and it was excellent!!! It fe...
4,"Great burgers,fries and salad! Burgers have a..."


In [5]:
review_text_sample_df = review_text_df.sample(frac=0.8, replace=False, random_state=1)

In [6]:
textos_concatenados = review_text_sample_df.text.str.cat(sep=' ')

In [7]:
!pip install zstandard
import zstandard

def compress_size(bytefile):
    """Return the size in bytes of ``bytefile`` after compressing it with zstandard."""
    # NOTE(review): this rebinds the `compress_size` name that an earlier cell of this
    # notebook defines with gzip; cells run after this one get the zstandard version.
    comprimido = zstandard.compress(bytefile)
    return len(comprimido)



In [8]:
%%timeit
zstandard.compress(textos_concatenados.encode())

1 loop, best of 5: 17.8 s per loop


In [9]:
compresion = zstandard.compress(textos_concatenados.encode())

In [10]:
%%timeit
zstandard.decompress(compresion)

1 loop, best of 5: 3.58 s per loop


In [11]:
# Compression ratio: original length divided by compressed length.
# `compresion` already holds zstandard.compress(textos_concatenados.encode()), so it
# is reused here instead of compressing the whole corpus a second time.
# NOTE(review): the numerator counts characters, not encoded bytes; the two differ
# if the text contains non-ASCII characters — confirm this is the intended ratio.
len(textos_concatenados) / len(compresion)

2.731232498887861

In [21]:
len(compresion) / len(textos_concatenados)

0.36613506920673833

In [19]:
from scipy.stats import entropy
def calcular_entropia(texto):
    """Shannon entropy, in bits per character, of the characters in ``texto``.

    Parameters
    ----------
    texto : str
        Text whose character frequencies are measured.

    Returns
    -------
    float
        Base-2 entropy of the empirical character distribution; ``0.0`` for an
        empty text.
    """
    # Local import so the cell does not depend on a separate imports cell.
    from collections import Counter

    longitud = len(texto)
    if longitud == 0:
        # No characters means no distribution to measure; report zero bits
        # instead of handing an empty list to scipy.
        return 0.0

    # Counter does the per-character tally that was previously a manual dict loop.
    frecuencias = Counter(texto)
    probas = [frecuencia / longitud for frecuencia in frecuencias.values()]

    return entropy(probas, base=2)

In [22]:
entropia = calcular_entropia(textos_concatenados) #son bits

In [24]:
entropia / 8

0.5557727191368754

In [28]:
8 / entropia

1.7992966649262978

###¿De cuanto es el ratio de compresión?

El ratio de compresion es: 

$2.7312324988878614$.

###¿Cuánto tarda en comprimir y descomprimir (por separado)? 

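Según las mediciones de `%%timeit` (mejor de 5 ejecuciones), comprimir tarda aproximadamente 17.8 segundos y descomprimir aproximadamente 3.58 segundos.

En Colab, las celdas completas tardaron más o menos 2 minutos (compresión) y 32 segundos (descompresión), porque `%%timeit` repite la operación varias veces. En cualquier caso, cada iteración del proceso de descompresión tarda menos que la de compresión.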

###¿Cuánto ocupa cada carácter en promedio una vez comprimido?

Una vez comprimido, cada caracter en promedio ocupa:

$0.36613506920673833$ bytes.

###Si tomamos la entropía base dos para los caracteres ¿cuánto da? ¿cuantos bytes por caracter son esos?

La entropia es:

$4.446181753095003$ bits

En terminos de bytes por caracter (asumiendo que 8 bits son un byte):

$0.5557727191368754$ bytes.

###Si utilizaramos un compresor aritmético por caracter, aproximadamente ¿cuál sería el ratio de compresión en el caso más optimista?

En el mejor de los casos, un compresor aritmético por carácter codifica cada carácter (que originalmente ocupa 8 bits) con una cantidad de bits igual a la entropía de Shannon, por lo que el ratio de compresión sería (en el caso más optimista):

$\frac{\text{len}(texto) \cdot 8}{\text{len}(texto) \cdot \text{entropia}} = \frac{8}{\text{entropia}} = 1.7992966649262978$

### ¿Cuál algoritmo de compresión de los dos sería mejor?

Entre `zstandard` y el compresor aritmético por carácter, el mejor compresor es `zstandard`, porque tiene un ratio de compresión mayor ($2.73$ contra $1.80$).