In [82]:
import numpy as np
import pandas as pd

# Load the raw tables. Columns that the later cells never reference
# (free text, contact details, coordinates, review dates) are dropped
# right after each read so every frame is built in a single expression.
items = pd.read_csv('data/items.csv').drop(
    columns=['name', 'photos', 'WEBSITE', 'PHONE', 'EMAIL', 'schedule', 'latitude', 'longitude']
)
reviews = pd.read_csv('data/reviews.csv').drop(columns=['date'])
users = pd.read_csv('data/users.csv')

In [83]:
# Build one row per review: left-join the item attributes, then the reviewer
# attributes, removing each join key as soon as it has served its purpose.
df = (
    reviews
    .merge(items, how='left', on='detail_id')
    .drop(columns=['detail_id'])
    .merge(users, how='left', on='profile_url')
    .drop(columns=['profile_url'])
)

In [84]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report how often each 'mark' value occurs, followed by the total row count.
print(df['mark'].value_counts())
print(df.shape[0])

mark
5    32889
4    18358
3     8259
2     2369
1     1178
Name: count, dtype: int64
63053


In [85]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Candidate values for the Lasso `alpha` hyper-parameter explored by the grid search.
param_grid = dict(alpha=[0.01, 0.1, 1, 5, 10, 20, 50, 100])

In [86]:
def feature_selection_scorer(estimator, X, y, sparsity_weight=0.01):
    """Score a fitted linear model as negative MSE plus a sparsity bonus.

    The function has the ``scorer(estimator, X, y)`` shape so it can be
    passed as ``scoring=`` to a scikit-learn search; higher is better.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)`` and a ``coef_`` array.
    X : array-like with a ``shape`` attribute
        Feature matrix; ``X.shape[1]`` is taken as the total feature count.
    y : array-like
        True targets. May be 1-D or a single-column 2-D object (for example
        an ``(n, 1)`` array or a one-column DataFrame).
    sparsity_weight : float, default 0.01
        Reward added for every coefficient that is exactly zero.

    Returns
    -------
    float
        ``-MSE + sparsity_weight * (number of zeroed coefficients)``.
    """
    # Flatten both sides so an (n, 1) target cannot broadcast against the
    # (n,) predictions into an (n, n) matrix and silently corrupt the MSE.
    y_true = np.asarray(y).ravel()
    y_pred_ = np.asarray(estimator.predict(X)).ravel()
    mse_ = np.mean((y_true - y_pred_) ** 2)

    # Number of features the model actually kept (non-zero coefficients).
    features = np.count_nonzero(estimator.coef_)

    return -mse_ + sparsity_weight * (X.shape[1] - features)

In [87]:
# NOTE(review): X_train, X_test, y_train and y_test are not defined in any
# cell visible here — confirm they exist when the notebook is run top to
# bottom on a fresh kernel.

# 5-fold exhaustive search over `param_grid`, ranking candidates with the
# custom scorer (prediction error traded off against the number of features kept).
lasso = Lasso(max_iter=10000)
lasso_grid_search = GridSearchCV(
    lasso,
    param_grid,
    scoring=feature_selection_scorer,
    n_jobs=-1,
    cv=5,
    verbose=2,
)
lasso_grid_search.fit(X_train, y_train)

# The winning estimator, as exposed by the search object.
best_model = lasso_grid_search.best_estimator_

print("Best Lasso Parameters:", lasso_grid_search.best_params_)
print("Best Lasso Score (Custom):", lasso_grid_search.best_score_)

# How many coefficients the winning model left non-zero.
non_zero_features = (best_model.coef_ != 0).sum()
print(f"Number of Selected Features: {non_zero_features} out of {X_train.shape[1]}")

# Plain mean squared error on the held-out split (no sparsity bonus here).
y_pred = best_model.predict(X_test)
mse = np.mean((y_test - y_pred) ** 2)
print("Test Mean Squared Error:", mse)

# Pair every feature name with its coefficient and show only the survivors.
selected_features = pd.DataFrame(
    {"Feature": X_train.columns, "Coefficient": best_model.coef_}
)
print("Selected Features and Coefficients:")
print(selected_features.loc[selected_features["Coefficient"] != 0])

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Lasso Parameters: {'alpha': 10}
Best Lasso Score (Custom): -0.09572003395149917
Number of Selected Features: 4 out of 82
Test Mean Squared Error: 0.8669350982198994
Selected Features and Coefficients:
              Feature  Coefficient
1       reviews_count     0.000032
2   description_score    -0.000006
31          max_price    -0.000003
32            ranking    -0.000125
