In [1]:
import pandas as pd
import numpy as np

from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the place-level dataset; each row is expected to carry an id, its types
# and up to five review columns (see the melt step that follows).
main_dataset_path = 'final-dataset/main_dataset.csv'
main_df = pd.read_csv(main_dataset_path)

In [3]:
# Wide review columns that get unpivoted into one row per (place, review).
review_column = [
    'review 1',
    'review 2',
    'review 3',
    'review 4',
    'review 5',
]

# Reshape to long format, keeping the place id and types on every row.
# NOTE(review): dropna() has no `subset`, so a row is also discarded when its
# id or types is missing, not only when the review text is missing. Confirm
# this is intended.
review_df = (
    main_df
    .melt(
        id_vars=['id', 'types'],
        value_vars=review_column,
        var_name='review_number',
        value_name='review',
    )
    .dropna()
)

In [4]:
# Share of synthetic users relative to the number of reviews.
unique_ratio = 0.5

# Every row left in review_df needs exactly one author.
total_review_count = review_df.shape[0]
unique_user_count = int(unique_ratio * total_review_count)

# Average number of reviews per user; used as the Poisson rate when the
# per-user review counts are drawn.
mean_reviews_per_user = total_review_count / unique_user_count

In [5]:
# Seed NumPy's global RNG so the synthetic user assignment is reproducible on
# Restart & Run All. The np.random.shuffle call in the next cell draws from
# the same global generator, so seeding here pins that step as well.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Draw a provisional number of reviews for every synthetic user.
review_counts = np.random.poisson(mean_reviews_per_user, unique_user_count)

# Rescale so the counts add up to the real number of reviews. Flooring can only
# undershoot that target, never overshoot it.
adjustment_factor = total_review_count / review_counts.sum()
adjusted_review_counts = np.floor(review_counts * adjustment_factor).astype(int)

# Hand the reviews lost to flooring back to randomly chosen users.
deficit = total_review_count - adjusted_review_counts.sum()
if deficit > 0:
    additional_indices = np.random.choice(unique_user_count, deficit, replace=True)
    # np.add.at accumulates repeated indices, which a plain fancy-indexed
    # `+= 1` would not; this matches incrementing one index at a time.
    np.add.at(adjusted_review_counts, additional_indices, 1)

# NOTE: a user whose count is still 0 here receives no review at all, so the
# number of users that actually show up in the data can be smaller than
# unique_user_count.

In [6]:
# One label per synthetic user: user_1 ... user_<unique_user_count>.
user_ids = [f'user_{number}' for number in range(1, unique_user_count + 1)]

# Repeat each label as many times as that user has reviews, then shuffle in
# place so the authors are spread randomly over the review rows.
assigned_users = np.repeat(user_ids, adjusted_review_counts)
np.random.shuffle(assigned_users)

# Positional assignment: the i-th shuffled label goes to the i-th review row.
review_df['user_id'] = assigned_users[:total_review_count]

In [7]:
# Reviews per user, then summary statistics of that distribution as a sanity
# check on the synthetic assignment.
user_distribution = review_df['user_id'].value_counts()
distribution_summary = user_distribution.describe()
print(distribution_summary)

count    11114.000000
mean         2.414972
std          1.387561
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         12.000000
Name: count, dtype: float64


In [8]:
def _polarity_of(text):
    """Return the TextBlob sentiment polarity of a single review text."""
    return TextBlob(text).sentiment.polarity


# Raw polarity for every review.
review_df['sentiment-score'] = review_df['review'].map(_polarity_of)

# Rescale the scores with a default MinMaxScaler; the bounds are learned from
# the observed minimum and maximum of this dataset, not from fixed limits.
scaler = MinMaxScaler()
review_df['sentiment-score'] = scaler.fit_transform(review_df[['sentiment-score']])

In [9]:
# Preview the long-format reviews together with the assigned user_id and the
# rescaled sentiment-score columns added above.
review_df

Unnamed: 0,id,types,review_number,review,user_id,sentiment-score
0,ChIJYcGr7GSb0S0RckePBrCWikw,"hotel, lodging",review 1,"It has quite small room, and the hallway is qu...",user_5227,0.506250
1,ChIJZbWX6Aia0S0R0tM3h1RZ1h8,"indonesian_restaurant, restaurant, food",review 1,"Surprisingly, a really good warung that’s hidd...",user_892,0.606250
2,ChIJYyHbhgia0S0RzdjNXLmcf54,"tourist_attraction, restaurant, food",review 1,"Only had a fleeting visit here, came by coach,...",user_3832,0.491667
3,ChIJ6zf9LJCb0S0RFv3BdLl61ZY,"coffee_shop, cafe, food, store",review 1,"One word, underrated! How come place like this...",user_7720,0.672338
4,ChIJ63FmGgaa0S0RWD5dfwhjGHQ,"indonesian_restaurant, restaurant, food",review 1,"We came here for dinner, food was good, i like...",user_2293,0.593056
...,...,...,...,...,...,...
26835,ChIJoW-FX3tb0i0RswZrEuxU3dg,lodging,review 5,A nice Villa with a view to the Pandawa beach....,user_78,0.800000
26836,ChIJHcVJyNhd0i0Rtif8ZHzZBr0,tourist_attraction,review 5,Goes there at around 3PM and it looks so good ...,user_6797,0.875000
26837,ChIJQ_u4aFhd0i0RcsyvWJ9Sk50,"restaurant, food",review 5,"beautiful place, delicious food, the atmospher...",user_254,0.745833
26838,ChIJk41ld9Fb0i0RStb1NFpDAno,"restaurant, bar, food",review 5,Tropical Temptation is a beach club gem! 💎 The...,user_4987,0.808929


In [10]:
# Persist the enriched reviews; the DataFrame index is not written because it
# only holds leftover row positions from the melt/dropna step.
review_dataset_path = 'final-dataset/review_dataset.csv'
review_df.to_csv(review_dataset_path, index=False)