# Procesamiento de los datos para validación

In [1]:
import pandas as pd

## 1. Carga de los datos

In [None]:
# Read the full review table from the parquet file, report how many rows it
# holds, and preview the first rows.
yelp_reviews = pd.read_parquet("../datasets/yelp_reviews.parquet")
print("Total reviews:", len(yelp_reviews))
yelp_reviews.head()

Total reviews: 4629668


Unnamed: 0,review_id,user_id,business_id,stars,text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,Rated 3.0 stars out of 5. If you decide to eat...
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,Rated 3.0 stars out of 5. Family diner. Had th...
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,"Rated 5.0 stars out of 5. Wow! Yummy, differe..."
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,Rated 4.0 stars out of 5. Cute interior and ow...
4,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,Rated 1.0 stars out of 5. I am a long term fre...


## 2. Selección de los usuarios de test

In [3]:
# Keep only positive reviews, i.e. those rated 4 stars or more
is_positive = yelp_reviews["stars"].ge(4)
positive_reviews = yelp_reviews.loc[is_positive]
positive_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,text
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,"Rated 5.0 stars out of 5. Wow! Yummy, differe..."
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,Rated 4.0 stars out of 5. Cute interior and ow...
5,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5.0,Rated 5.0 stars out of 5. Amazingly amazing wi...
7,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,Rated 4.0 stars out of 5. Locals recommended M...
8,XW_LfMv0fV21l9c6xQd_lw,9OAtfnWag-ajVxRbUTGIyg,lj-E32x9_FA7GmUrBGBEWg,4.0,Rated 4.0 stars out of 5. Love going here for ...


In [4]:
# Users with 10 or more positive reviews
eligible_users = (
    positive_reviews
    .groupby("user_id")
    .size()
    .loc[lambda review_counts: review_counts >= 10]
    .index
)

# Seeded sample of 10000 eligible users; the result is a Series whose index
# and values are both the user_id.
test_users = eligible_users.to_series().sample(n=10000, random_state=32)
test_users.head()

user_id
B0FMcfBgh3Qe8EV_35EvaA    B0FMcfBgh3Qe8EV_35EvaA
je_prwoc8_k_25EK9NsOtw    je_prwoc8_k_25EK9NsOtw
UKi083DTL40b8Eyh5jh3OQ    UKi083DTL40b8Eyh5jh3OQ
VnM17gnNyW9srO_Edl2EIA    VnM17gnNyW9srO_Edl2EIA
wrFEiYUhHi1toO7kJeqJ3g    wrFEiYUhHi1toO7kJeqJ3g
Name: user_id, dtype: object

### 2.1. Subconjunto de entrenamiento

In [5]:
# Training subset: every review (any star rating) written by a user who was
# NOT sampled into the test set.
written_by_test_user = yelp_reviews["user_id"].isin(test_users)
train_reviews = yelp_reviews.loc[~written_by_test_user]
print("Total training reviews:", len(train_reviews))
train_reviews.head()

Total training reviews: 4279039


Unnamed: 0,review_id,user_id,business_id,stars,text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,Rated 3.0 stars out of 5. If you decide to eat...
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,Rated 3.0 stars out of 5. Family diner. Had th...
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,"Rated 5.0 stars out of 5. Wow! Yummy, differe..."
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,Rated 4.0 stars out of 5. Cute interior and ow...
4,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,Rated 1.0 stars out of 5. I am a long term fre...


### 2.2. Subconjunto de validación

In [6]:
# Test subset: only the positive reviews written by the sampled test users.
test_reviews = positive_reviews[positive_reviews["user_id"].isin(test_users)]
# The label previously read "Total training reviews:", copied from the
# training cell, even though the count shown is for the test subset.
print("Total test reviews:", test_reviews.shape[0])
test_reviews.head()

Total training reviews: 251611


Unnamed: 0,review_id,user_id,business_id,stars,text
9,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,Rated 4.0 stars out of 5. Good food--loved the...
10,UBp0zWyH60Hmw6Fsasei7w,4Uh27DgGzsp6PqrH913giQ,otQS34_MymijPTdNBoBdCw,4.0,Rated 4.0 stars out of 5. The bun makes the So...
17,lUUhg8ltDsUZ9h0xnwY4Dg,RreNy--tOmXMl1en0wiBOg,cPepkJeRMtHapc_b2Oe_dw,4.0,Rated 4.0 stars out of 5. I was really between...
38,EoN2xyKvTTn9B-z1grhYxw,DBYhpb5hrAYgQjQaMhNYyQ,oJ4ik-4PZe6gexxW-tSmsw,4.0,Rated 4.0 stars out of 5. I love the Brewpub f...
66,onlgwy5qGDEzddsrnIvtWg,pYXeL0RCqus2IfhthYCOyA,W7NxQw8UYFR0HLPrI08tvw,4.0,Rated 4.0 stars out of 5. Don't know what it i...


## 3. Selección del restaurante a predecir por usuario

In [7]:
# For each test user, hold out one positive review (seeded, so reproducible)
# and collect the held-out review ids.
# Selecting [["review_id"]] right after the groupby keeps the grouping column
# out of the frames handed to `apply`, which silences the pandas deprecation
# warning about apply operating on grouping columns. The row picked per group
# is unchanged: `sample` depends only on the group's length and the seed.
reviews_to_predict = (
    test_reviews
    .groupby("user_id", group_keys=False)[["review_id"]]
    .apply(lambda group: group.sample(1, random_state=32))["review_id"]
    .tolist()
)
len(reviews_to_predict)

  .apply(lambda df: df.sample(1, random_state=32))["review_id"]


10000

### 3.1. Reseña con restaurante a predecir

In [8]:
# Rows of the test subset whose review was held out; their business_id is the
# value to predict.
is_held_out = test_reviews["review_id"].isin(reviews_to_predict)
business_to_predict = test_reviews.loc[is_held_out]
print("Total business to predict:", len(business_to_predict))
business_to_predict.head()

Total business to predict: 10000


Unnamed: 0,review_id,user_id,business_id,stars,text
88,LnKr0hwejzl71QmoQyTRDQ,7RU_xK1tEGlUvXfe0GvtEg,hAmuto6UndVroyd_DaD-TA,5.0,Rated 5.0 stars out of 5. Not sure why it took...
367,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5.0,Rated 5.0 stars out of 5. After a long hiatus ...
560,QEuRWU5UR6_z8hSs4pjjEA,jG-t2tqFuZLdqRHNn_y9bQ,7UxNDF6ZpabC3O0-Qbg3Xw,5.0,Rated 5.0 stars out of 5. My favorite place to...
581,7JKo522z7Fx6VfU27am4Rg,sHfY5a4-HPa9dhSSBvQK6Q,jQBPO3rYkNwIaOdQS5ktgQ,5.0,Rated 5.0 stars out of 5. Got a gift certifica...
615,YJU56J3q7mWp89xZ4H51tQ,JPhPZcdUXSSeplvPfW7Auw,Nd_3fSvYDCjM8YJdBx4Y9w,4.0,Rated 4.0 stars out of 5. The Honey Apple gril...


### 3.2. Reseñas para generar inputs

In [9]:
# Remaining test reviews (everything except the held-out ones); these are the
# texts used to build the query inputs.
is_held_out = test_reviews["review_id"].isin(reviews_to_predict)
reviews_to_query = test_reviews.loc[~is_held_out]
print("Total reviews to make queries:", len(reviews_to_query))
reviews_to_query.head()

Total reviews to make queries: 241611


Unnamed: 0,review_id,user_id,business_id,stars,text
9,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,Rated 4.0 stars out of 5. Good food--loved the...
10,UBp0zWyH60Hmw6Fsasei7w,4Uh27DgGzsp6PqrH913giQ,otQS34_MymijPTdNBoBdCw,4.0,Rated 4.0 stars out of 5. The bun makes the So...
17,lUUhg8ltDsUZ9h0xnwY4Dg,RreNy--tOmXMl1en0wiBOg,cPepkJeRMtHapc_b2Oe_dw,4.0,Rated 4.0 stars out of 5. I was really between...
38,EoN2xyKvTTn9B-z1grhYxw,DBYhpb5hrAYgQjQaMhNYyQ,oJ4ik-4PZe6gexxW-tSmsw,4.0,Rated 4.0 stars out of 5. I love the Brewpub f...
66,onlgwy5qGDEzddsrnIvtWg,pYXeL0RCqus2IfhthYCOyA,W7NxQw8UYFR0HLPrI08tvw,4.0,Rated 4.0 stars out of 5. Don't know what it i...


## 4. Dataset final

In [10]:
# Gather each test user's query review texts into a single list per user.
# The aggregate is stored under its own name instead of overwriting
# `reviews_to_query`: with the overwrite, re-running this cell grouped the
# already-aggregated frame and silently produced lists of lists.
query_texts_by_user = (
    reviews_to_query
    .groupby("user_id")["text"]
    .apply(list)
    .reset_index()
)

# Attach the list of query texts to each held-out (user_id, business_id) row.
# The left join keeps every held-out row even if a user has no query texts.
validation_df = pd.merge(
    business_to_predict.loc[:, ["user_id", "business_id"]],
    query_texts_by_user.loc[:, ["user_id", "text"]],
    how="left",
    on="user_id",
)

print("Total validation samples:", validation_df.shape[0])
validation_df.head()

Total validation samples: 10000


Unnamed: 0,user_id,business_id,text
0,7RU_xK1tEGlUvXfe0GvtEg,hAmuto6UndVroyd_DaD-TA,[Rated 5.0 stars out of 5. Love Marcos Pizza. ...
1,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,[Rated 5.0 stars out of 5. That bitter sweet d...
2,jG-t2tqFuZLdqRHNn_y9bQ,7UxNDF6ZpabC3O0-Qbg3Xw,[Rated 5.0 stars out of 5. Atmosphere was plea...
3,sHfY5a4-HPa9dhSSBvQK6Q,jQBPO3rYkNwIaOdQS5ktgQ,[Rated 5.0 stars out of 5. I can't get enough ...
4,JPhPZcdUXSSeplvPfW7Auw,Nd_3fSvYDCjM8YJdBx4Y9w,"[Rated 4.0 stars out of 5. Cozy atmosphere, fr..."


In [11]:
# Write both outputs to CSV in the working directory, without the index column:
# the validation samples and the training reviews (test users excluded).
validation_df.to_csv(path_or_buf="yelp_validation.csv", index=False)
train_reviews.to_csv(path_or_buf="yelp_reviews_for_validation.csv", index=False)