In [432]:
import numpy as np
import pandas as pd
from numpy.f2py.auxfuncs import throw_error
from sklearn.metrics import mean_squared_error

# Load the raw items, reviews and users tables from the local `data/` directory.
items, reviews, users = map(
    pd.read_csv,
    ('data/items.csv', 'data/reviews.csv', 'data/users.csv'),
)

In [433]:
# Inner joins: keep only reviews whose author is present in `users`,
# then only those whose place is present in `items`.
users_reviews = pd.merge(reviews, users, on='profile_url', how='inner')
all_dfs = pd.merge(users_reviews, items, on='detail_id', how='inner')

In [434]:
# Attach the place name to every review (left join keeps every review row).
merged_reviews = pd.merge(reviews, items[['detail_id', 'name']], on='detail_id', how='left')

# One row per user: the list of place names they reviewed and the matching marks.
user_places_with_ratings = (
    merged_reviews
    .groupby('profile_url')
    .agg(places_rated=('name', list), ratings=('mark', list))
    .reset_index()
)

# Flatten the per-user lists back into long (user, place, rating) form.
flat_places, flat_ratings = [], []
for user_places, user_marks in zip(user_places_with_ratings['places_rated'],
                                   user_places_with_ratings['ratings']):
    flat_places.extend(user_places)
    flat_ratings.extend(user_marks)

ratings_df = pd.DataFrame({
    'profile_url': user_places_with_ratings['profile_url'].repeat(
        user_places_with_ratings['places_rated'].map(len)
    ),
    'place': flat_places,
    'rating': flat_ratings,
})

# User x place matrix. Places are keyed by *name*, so several ratings of the
# same name by one user are combined by pivot_table's default aggregation
# (mean); unrated cells are filled with 0.
user_ratings = ratings_df.pivot_table(
    index='profile_url',
    columns='place',
    values='rating',
    fill_value=0,
)
user_ratings

place,Югос,15 Kitchen + Bar,19 Bar & Atmosphere,32.05,35mm Cinema Hall,5 Оборотов,5642 Высота,8 Дом культуры (клуб) имени И.В.Русакова,800C Contemporary Steak,Abbey Players Pub,...,Ярославский Вокзал,Яузский бульвар,бар Разведка,гастрономическая улица strEAT,метро площадь ильича,музей-квартира Н.С. Голованова,эZo Georgian Cuizine,​Фирменный Магазин Косметики Свобода,​​​​​​​TGI FRIDAYS™,№13 Ресторан
profile_url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/Profile/-Dinkaaa-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/Profile/05121978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/Profile/05margarita16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/Profile/06carolinab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/Profile/070165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/Profile/zlobinaen777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/Profile/zoia1957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/Profile/zoritoAl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/Profile/zoyad782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [385]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import networkx as nx

# `user_ratings` is expected to be indexed by `profile_url` already (it is the
# pivot_table output), so it is used as-is rather than mutated in place.

# Pairwise cosine similarity between the users' rating vectors (rows).
similarity_matrix = cosine_similarity(user_ratings.fillna(0))
similarity_df = pd.DataFrame(similarity_matrix, index=user_ratings.index, columns=user_ratings.index)

# Minimum similarity for two users to be linked in the graph.
threshold = 0.75

# Link every pair of distinct users whose similarity exceeds the threshold.
# Cosine similarity is symmetric and the graph is undirected, so scanning the
# strict upper triangle (k=1 also skips the diagonal, i.e. self-pairs) finds
# each edge exactly once. The scan is done in NumPy instead of a Python
# double loop with a `.loc` lookup per pair.
user_sim_graph = nx.Graph()
rows, cols = np.nonzero(np.triu(similarity_matrix > threshold, k=1))
user_labels = user_ratings.index.to_numpy()
user_sim_graph.add_weighted_edges_from(
    zip(user_labels[rows], user_labels[cols], similarity_matrix[rows, cols])
)

In [400]:
# Enrich every review with its place's columns and its author's columns.
# Left joins keep every review row even when no item/user matches.
df = (
    reviews
    .merge(items, on='detail_id', how='left')
    .merge(users, on='profile_url', how='left')
)

In [387]:
# Display the merged reviews/items/users frame (last expression of the cell).
df

Unnamed: 0,mark,date,profile_url,detail_id,name,latitude,longitude,photos,rating,reviews_count,...,ART_AND_CULTURE_REVIEWER,NATURE_AND_PARK_REVIEWER,RESTAURANT_EXPERT_REVIEWER,BED_AND_BREAKFAST_INN_REVIEWER,LUXURY_HOTEL_REVIEWER,BEACH_REVIEWER,FINE_DINING_REVIEWER,COFFEE_AND_TEA_REVIEWER,BAR_AND_PUB_REVIEWER,DESSERT_AND_BAKERY_REVIEWER
0,5,2024-08-14 00:00:00,/Profile/yalmaree,668919,PANORAMA360,55.750030,37.537860,20.0,4.5,2291.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,5,2023-09-14 00:00:00,/Profile/AlexeevIgor,300367,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,5,2023-06-27 00:00:00,/Profile/_Q5845IB,300367,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,2023-06-06 00:00:00,/Profile/834alexeyo,300367,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,5,2023-05-29 00:00:00,/Profile/253marinap,300367,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63048,3,2016-04-27 00:00:00,/Profile/_W3128MJ,6111511,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
63049,3,2016-02-12 00:00:00,/Profile/Paprikaw,6111511,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
63050,4,2015-10-27 00:00:00,/Profile/Alphastud,6111511,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
63051,4,2015-01-30 00:00:00,/Profile/525alexanderp,6111511,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0


In [444]:
def compute_weighted_score(user, place, top_k=None, reviews=None, graph=None):
    """Estimate a user's mark for a place from the marks of similar users.

    The score is the similarity-weighted mean of the marks that the user's
    neighbours in the similarity graph gave to ``place``. Only neighbours are
    consulted, so the user's own mark is not used as long as the graph has no
    self-loops.

    Parameters
    ----------
    user : hashable
        Key of the user in the similarity graph (a ``profile_url``).
    place : scalar
        ``detail_id`` of the place to score.
    top_k : int, optional
        Use only the ``top_k`` most similar neighbours that reviewed the
        place. ``None`` (default) uses all of them.
    reviews : pandas.DataFrame, optional
        Frame with ``profile_url``, ``detail_id`` and ``mark`` columns.
        Defaults to the notebook-level ``df``.
    graph : mapping, optional
        Similarity graph: ``graph[user]`` maps each neighbour to a dict with
        a ``'weight'`` entry (a ``networkx.Graph`` or a plain dict of dicts).
        Defaults to the notebook-level ``user_sim_graph``.

    Returns
    -------
    float or int
        The weighted mean mark, or the sentinel ``-1`` when the user is not
        in the graph, no neighbour reviewed the place, or the weights sum
        to zero.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative")
    if reviews is None:
        reviews = df
    if graph is None:
        graph = user_sim_graph

    if user not in graph:
        return -1

    # Filter the reviews by place once per call instead of scanning the whole
    # frame twice per neighbour. When a user reviewed the place several
    # times, the first mark is kept.
    at_place = reviews.loc[reviews['detail_id'] == place, ['profile_url', 'mark']]
    at_place = at_place.drop_duplicates(subset='profile_url', keep='first')
    mark_by_user = dict(zip(at_place['profile_url'], at_place['mark']))

    scored_neighbors = [
        (neighbor, attrs['weight'], mark_by_user[neighbor])
        for neighbor, attrs in graph[user].items()
        if neighbor in mark_by_user
    ]

    # Most similar neighbours first, so that `top_k` keeps the strongest ones.
    scored_neighbors.sort(key=lambda item: item[1], reverse=True)
    if top_k is not None:
        scored_neighbors = scored_neighbors[:top_k]

    if not scored_neighbors:
        return -1

    weighted_sum = sum(weight * mark for _, weight, mark in scored_neighbors)
    total_weight = sum(weight for _, weight, _ in scored_neighbors)

    if total_weight == 0:
        return -1

    return weighted_sum / total_weight

In [445]:
# Work on a copy so the merged frame `df` stays untouched.
df_review = df.copy()

# Neighbour-based score for every (user, place) review row; -1 marks "no score".
df_review['weighted_score'] = [
    compute_weighted_score(profile_url, detail_id)
    for profile_url, detail_id in zip(df_review['profile_url'], df_review['detail_id'])
]

# The identifier columns are dropped once the score has been computed.
df_review = df_review.drop(columns=['detail_id', 'profile_url'])

In [438]:
# Scores that were actually computed (sentinel -1 excluded) and are below 4.6.
list(
    df_review.loc[
        (df_review['weighted_score'] != -1) & (df_review['weighted_score'] < 4.6),
        'weighted_score',
    ]
)

[4.0,
 3.0,
 3.0,
 4.0,
 2.0,
 4.492381972808401,
 4.0,
 3.0,
 3.984763945616803,
 4.0,
 2.0,
 3.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 2.0,
 4.0,
 2.0,
 4.49038105676658,
 4.0,
 3.0,
 2.0,
 2.0,
 2.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 1.0,
 3.0,
 4.0,
 3.0,
 2.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.974183935873987,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 4.0,
 3.0,
 3.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0,
 3.0,
 4.0,
 4.0,
 4.0,
 3.0,
 4.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 4.0,
 4.0,
 3.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.507618027191598,
 3.0,
 4.0,
 4.0,
 3.0,
 4.0,
 4.0,
 3.0,
 2.0,
 2.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 4.0,
 3.0,
 3.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 4.0,
 4.0,
 4.0,
 

In [439]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
df_review = (
    df_review
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Class balance of the target and the total number of rows.
print(df_review['mark'].value_counts(), len(df_review), sep='\n')

mark
5    32889
4    18358
3     8259
2     2369
1     1178
Name: count, dtype: int64
63053


In [446]:
# Display the review frame, which now carries the `weighted_score` column.
df_review

Unnamed: 0,mark,date,name,latitude,longitude,photos,rating,reviews_count,WEBSITE,PHONE,...,NATURE_AND_PARK_REVIEWER,RESTAURANT_EXPERT_REVIEWER,BED_AND_BREAKFAST_INN_REVIEWER,LUXURY_HOTEL_REVIEWER,BEACH_REVIEWER,FINE_DINING_REVIEWER,COFFEE_AND_TEA_REVIEWER,BAR_AND_PUB_REVIEWER,DESSERT_AND_BAKERY_REVIEWER,weighted_score
0,5,2024-08-14 00:00:00,PANORAMA360,55.750030,37.537860,20.0,4.5,2291.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0
1,5,2023-09-14 00:00:00,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,1.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,-1.0
2,5,2023-06-27 00:00:00,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,5,2023-06-06 00:00:00,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,-1.0
4,5,2023-05-29 00:00:00,Собо́р Васи́лия Блаже́нного,55.753930,37.620796,20.0,4.5,12438.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63048,3,2016-04-27 00:00:00,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0
63049,3,2016-02-12 00:00:00,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,1.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0
63050,4,2015-10-27 00:00:00,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,-1.0
63051,4,2015-01-30 00:00:00,Ваби Саби,55.741913,37.653770,6.0,3.0,37.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,-1.0


In [447]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Candidate values of the Lasso `alpha` parameter explored by the grid search.
param_grid = dict(alpha=[0.01, 0.1, 1, 10, 100])

In [448]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# NOTE(review): X_train / X_test / y_train / y_test are not defined in the
# cells visible here - confirm they are created earlier in the notebook.

# Standardise features using statistics learned on the training split only.
# NOTE(review): the scaler is fitted on the whole training set before
# cross-validation, so each CV fold sees scaling statistics that include its
# validation part - confirm this is acceptable.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# GridSearchCV maximises the score, hence the negated MSE.
scorer = 'neg_mean_squared_error'

lasso = Lasso(max_iter=10000)

# 5-fold search over `param_grid`, using all available cores.
lasso_grid_search = GridSearchCV(
    lasso,
    param_grid,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=2,
)
lasso_grid_search.fit(X_train_scaled, y_train)

best_model = lasso_grid_search.best_estimator_

print("Best Lasso Parameters:", lasso_grid_search.best_params_)
print("Best Lasso Score (Negative MSE):", lasso_grid_search.best_score_)

# Evaluate the refitted best model on the held-out split.
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error:", mse)

# Pair each feature with its coefficient; rows with a zero coefficient are
# left out of the printed table.
selected_features = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficient": best_model.coef_,
})
nonzero_coefficients = selected_features["Coefficient"] != 0
print("Selected Features and Coefficients:")
print(selected_features[nonzero_coefficients])


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Lasso Parameters: {'alpha': 0.01}
Best Lasso Score (Negative MSE): -0.7984075315065945
Test Mean Squared Error: 0.7894354151666992
Selected Features and Coefficients:
                                            Feature  Coefficient
0                                            rating     0.263699
1                                     reviews_count     0.016049
6                                       tags_Театры     0.005403
11          tags_Исторические достопримечательности    -0.000398
12                          tags_Памятники и статуи    -0.014265
15                              tags_Музеи искусств     0.003251
20  tags_Культурные объекты и достопримечательности    -0.010882
26                                       atmosphere    -0.019135
34                                 cuisines_Фастфуд    -0.010417
35                                cuisines_Японская    -0.000992
42                             cuisines_Современная  