In [21]:
import pandas as pd
import numpy as np

from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Load the main dataset from a path relative to the notebook's working directory.
main_dataset_path = 'final-dataset/dataset_main.csv'
main_df = pd.read_csv(main_dataset_path)

In [23]:
# The five wide review columns that get unpivoted into a single `review` column.
review_column = [
    'review 1',
    'review 2',
    'review 3',
    'review 4',
    'review 5',
]

# Wide -> long: one row per (id, review slot); `review_number` records the source column.
melted_reviews = main_df.melt(
    id_vars=['id', 'types'],
    value_vars=review_column,
    var_name='review_number',
    value_name='review',
)

# dropna() with no subset removes a row when ANY column is missing, so rows lacking
# `id` or `types` are dropped along with empty review slots.
# NOTE(review): presumably only empty reviews are meant to be removed - confirm.
review_df = melted_reviews.dropna()

In [24]:
# Fraction of the review total used as the number of distinct users to generate.
unique_ratio = 0.2

# Size of the long-format review table and the user count derived from it
# (int() truncates toward zero).
total_review_count = review_df.shape[0]
unique_user_count = int(unique_ratio * total_review_count)

# Average reviews per user; close to 1 / unique_ratio, but not exact because the
# user count above is truncated to an integer.
mean_reviews_per_user = total_review_count / unique_user_count

In [25]:
# Seed NumPy's global generator so the counts drawn here (and any later np.random
# call executed after this cell) come out the same on every fresh top-to-bottom run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Draw a raw review count for each user, Poisson-distributed around the target mean.
review_counts = np.random.poisson(mean_reviews_per_user, unique_user_count)

# Fail loudly rather than dividing by zero and propagating inf/NaN counts.
drawn_total = review_counts.sum()
if drawn_total == 0:
    raise ValueError('Poisson draw produced no reviews; cannot rescale counts to the review total')

# Rescale the draw towards the real review total; flooring keeps the counts integral
# but can leave the sum short of the target.
adjustment_factor = total_review_count / drawn_total
adjusted_review_counts = np.floor(review_counts * adjustment_factor).astype(int)

# Hand the reviews lost to flooring to randomly chosen users (with replacement).
deficit = total_review_count - adjusted_review_counts.sum()
if deficit > 0:
    additional_indices = np.random.choice(unique_user_count, size=int(deficit), replace=True)
    # np.add.at is unbuffered: a user index drawn k times is incremented k times,
    # exactly like incrementing once per drawn index in a loop.
    np.add.at(adjusted_review_counts, additional_indices, 1)

In [26]:
# Labels user_1 .. user_<unique_user_count>.
user_ids = [f'user_{number}' for number in range(1, unique_user_count + 1)]

# Repeat each label as many times as that user's review count, then shuffle in place
# so the labels land on reviews in random order.
assigned_users = np.repeat(user_ids, adjusted_review_counts)
np.random.shuffle(assigned_users)

# Keep only as many labels as there are review rows before attaching them positionally.
user_id_values = assigned_users[:total_review_count]
review_df['user_id'] = user_id_values

In [27]:
# Number of reviews attached to each user id that appears in the table.
user_distribution = review_df.loc[:, 'user_id'].value_counts()

# Print summary statistics (count, mean, std, quartiles, max) of the per-user counts.
distribution_summary = user_distribution.describe()
print(distribution_summary)

count    6103.000000
mean        5.062264
std         2.352023
min         1.000000
25%         3.000000
50%         5.000000
75%         7.000000
max        16.000000
Name: count, dtype: float64


In [28]:
# TextBlob polarity for every review text.
raw_polarity = review_df['review'].map(lambda text: TextBlob(text).sentiment.polarity)

# Min-max scale the polarity using the minimum and maximum observed in this dataset,
# then store the scaled values as the `sentiment-score` column.
scaler = MinMaxScaler()
review_df['sentiment-score'] = scaler.fit_transform(raw_polarity.to_frame('sentiment-score'))

In [29]:
# Display the long-format review table, now carrying the `user_id` and
# `sentiment-score` columns added in the cells above.
review_df

Unnamed: 0,id,types,review_number,review,user_id,sentiment-score
0,ChIJYcGr7GSb0S0RckePBrCWikw,"hotel, lodging",review 1,"It has quite small room, and the hallway is qu...",user_2573,0.506250
1,ChIJZbWX6Aia0S0R0tM3h1RZ1h8,"indonesian_restaurant, restaurant, food",review 1,"Surprisingly, a really good warung that’s hidd...",user_2062,0.606250
2,ChIJYyHbhgia0S0RzdjNXLmcf54,"tourist_attraction, restaurant, food",review 1,"Only had a fleeting visit here, came by coach,...",user_2348,0.491667
3,ChIJ6zf9LJCb0S0RFv3BdLl61ZY,"coffee_shop, cafe, food, store",review 1,"One word, underrated! How come place like this...",user_1448,0.672338
4,ChIJxaITmQia0S0RyrbukE8vsJU,"tourist_attraction, place_of_worship",review 1,"This temple is located in Singaraja, located i...",user_4426,0.569762
...,...,...,...,...,...,...
30890,ChIJf6d0f5Nb0i0Rq_tEZeZB_Nc,tourist_attraction,review 5,"🚶 I walked down to the beach, but it’s quite s...",user_3426,0.582200
30891,ChIJk41ld9Fb0i0RStb1NFpDAno,"restaurant, bar, food",review 5,Tropical Temptation is a beach club gem! 💎 The...,user_816,0.808929
30892,ChIJO30FJJdd0i0RCOyG8WcyJBk,tourist_attraction,review 5,it was really hot when we went here but its a ...,user_26,0.641667
30893,ChIJvazdGXtb0i0RA-4XopPth7A,"bar, restaurant, food",review 5,"I went to this beach a few days ago, the food ...",user_2840,0.823344


In [30]:
# Write the review table to disk; index=False leaves the DataFrame index out of the file.
reviews_output_path = 'final-dataset/dataset_reviews.csv'
review_df.to_csv(reviews_output_path, index=False)