In [1]:
import pandas as pd
import numpy as np
# Visualización
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
 
from datetime import timedelta
# Modelos
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, BaggingRegressor
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score, \
roc_curve, roc_auc_score, ConfusionMatrixDisplay, multilabel_confusion_matrix, r2_score, mean_squared_error, mean_absolute_error, median_absolute_error
from imblearn.over_sampling import SMOTE

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [13]:
df = pd.read_csv('../data/processed/clean.csv')

In [14]:
df['Content Rating'].value_counts()

Content Rating
Everyone           7420
Teen               1084
Mature 17+          461
Everyone 10+        397
Adults only 18+       3
Unrated               1
Name: count, dtype: int64

In [15]:
## crear la columna 'days_since_update' y borrar la columna Last Updated
days_since_update = df['Last Updated'].map(lambda x: datetime.date.today() - datetime.datetime.strptime(x, "%b-%y").date())
days_since_update = days_since_update.dt.days
df['days_since_update'] = days_since_update
df.drop(columns='Last Updated', inplace=True)

## escalado:
### App : crear un data frame y borrar la columna
### Category : OHE
### Reviews : MinMax
### Installs : MinMax
### Type : OHE
### Content Rating : OHE
### Genres : OHE
### Genres2 : OHE

In [16]:
## crear data frame de la columna APP y borrar la columna
app_names = df[['App']]
df.drop(columns='App', inplace=True)

In [17]:
##aplicar OHE
from sklearn.preprocessing import OneHotEncoder
def apply_onehot_encoder(train:pd.DataFrame, columns_to_encode:list, test:pd.DataFrame=None):
    
    # Resetear índices para evitar desalineación
    train = train.reset_index(drop=True)
    
    # Crear el OneHotEncoder
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    # Ajustar y transformar las columnas seleccionadas
    transformed_data = encoder.fit_transform(train[columns_to_encode])

    # Crear un DataFrame con las columnas transformadas
    transformed_df = pd.DataFrame(transformed_data, columns=encoder.get_feature_names_out(columns_to_encode))
    
    # Concatenar con el DataFrame original excluyendo las columnas transformadas
    df_concatenated = pd.concat([train.drop(columns_to_encode, axis=1), transformed_df], axis=1)

    # Si se proporciona un segundo DataFrame, aplicar la misma transformación
    if test is not None:
        test = test.reset_index(drop=True)
        transformed_data_to_transform = encoder.transform(test[columns_to_encode])
        transformed_df_to_transform = pd.DataFrame(transformed_data_to_transform, columns=encoder.get_feature_names_out(columns_to_encode))
        df_to_transform_concatenated = pd.concat([test.drop(columns_to_encode, axis=1), transformed_df_to_transform], axis=1)
        return df_concatenated, df_to_transform_concatenated

    return df_concatenated

In [18]:
transformed_df = apply_onehot_encoder(df, ['Type', 'Category', 'Content Rating', 'Genres', 'Genres2'])

In [19]:
transformed_df

Unnamed: 0,Rating,Reviews,Installs,Price,days_since_update,Type_Free,Type_Paid,Category_art_and_design,Category_auto_and_vehicles,Category_beauty,Category_books_and_reference,Category_business,Category_comics,Category_communication,Category_dating,Category_education,Category_entertainment,Category_events,Category_family,Category_finance,Category_food_and_drink,Category_game,Category_health_and_fitness,Category_house_and_home,Category_libraries_and_demo,Category_lifestyle,Category_maps_and_navigation,Category_medical,Category_news_and_magazines,Category_parenting,Category_personalization,Category_photography,Category_productivity,Category_shopping,Category_social,Category_sports,Category_tools,Category_travel_and_local,Category_video_players,Category_weather,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated,Genres_Action,Genres_Adventure,Genres_Arcade,Genres_Art & Design,Genres_Auto & Vehicles,Genres_Beauty,Genres_Board,Genres_Books & Reference,Genres_Business,Genres_Card,Genres_Casino,Genres_Casual,Genres_Comics,Genres_Communication,Genres_Dating,Genres_Education,Genres_Educational,Genres_Entertainment,Genres_Events,Genres_Finance,Genres_Food & Drink,Genres_Health & Fitness,Genres_House & Home,Genres_Libraries & Demo,Genres_Lifestyle,Genres_Maps & Navigation,Genres_Medical,Genres_Music,Genres_Music & Audio,Genres_News & Magazines,Genres_Parenting,Genres_Personalization,Genres_Photography,Genres_Productivity,Genres_Puzzle,Genres_Racing,Genres_Role Playing,Genres_Shopping,Genres_Simulation,Genres_Social,Genres_Sports,Genres_Strategy,Genres_Tools,Genres_Travel & Local,Genres_Trivia,Genres_Video Players & Editors,Genres_Weather,Genres_Word,Genres2_Action & Adventure,Genres2_Brain Games,Genres2_Creativity,Genres2_Education,Genres2_Music & Video,Genres2_Pretend Play,Genres2_nan
0,4.1,159,10000,0.0,2247,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3.9,967,500000,0.0,2247,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4.7,87510,5000000,0.0,2035,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4.5,215644,50000000,0.0,2096,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4.3,967,100000,0.0,2096,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9361,4.0,7,500,0.0,2461,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9362,4.5,38,5000,0.0,2431,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9363,5.0,4,100,0.0,2066,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9364,4.5,114,1000,0.0,3343,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
## aplicar minmax
scaler = MinMaxScaler()
columnas_min_max = ['Reviews', 'Installs']
transformed_df[columnas_min_max] = scaler.fit_transform(transformed_df[columnas_min_max])

In [None]:
df_final = transformed_df.to_csv('../data/final/final.csv', index=False)

## Split los datos entre train y test

In [21]:
from sklearn.model_selection import train_test_split

X = transformed_df.drop(columns='Rating')
y = transformed_df['Rating']

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=12)



## Base line CV

In [45]:
modelos = {
    "Logistic Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "RandomForestClassifier": RandomForestRegressor(),
    "XGBoost": XGBRegressor(),
    "CatBoost": CatBoostRegressor(verbose=False)
    
}

# Define las métricas a usar
metricas = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']

resultados_dict = {}

for nombre_modelo, modelo in modelos.items():
    cv_resultados = cross_validate(modelo, x_train, y_train, cv=5, scoring=metricas)
    
    for metrica in metricas:
        clave = f"{nombre_modelo}_{metrica}"
        resultados_dict[clave] = cv_resultados[f"test_{metrica}"].mean()

# Convertir el diccionario de resultados en DataFrame
resultados_df = pd.DataFrame([resultados_dict])

In [47]:
resultados_df.T

Unnamed: 0,0
Logistic Regression_r2,0.04873
Logistic Regression_neg_mean_squared_error,-0.251998
Logistic Regression_neg_mean_absolute_error,-0.346961
Decision Tree_r2,-0.51997
Decision Tree_neg_mean_squared_error,-0.402128
Decision Tree_neg_mean_absolute_error,-0.387075
RandomForestClassifier_r2,0.102433
RandomForestClassifier_neg_mean_squared_error,-0.237724
RandomForestClassifier_neg_mean_absolute_error,-0.312498
XGBoost_r2,0.103553


## Grid search

In [36]:
param_grid = {
    'depth': [4, 6, 8, 10, 12],
    'learning_rate': [0.01, 0.03, 0.05, 0.07,  0.1],
    'iterations': [100, 200, 300, 500, 100]
}

# Initialize CatBoostRegressor
catboost = CatBoostRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(catboost, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=True)

# Perform grid search
grid_search.fit(X, y)

# Print the best parameters found
print("Best parameters:", grid_search.best_params_)

# Print the best mean squared error
print("Best mean squared error:", -grid_search.best_score_)

Fitting 5 folds for each of 125 candidates, totalling 625 fits
0:	learn: 0.5424366	total: 1.51ms	remaining: 149ms
1:	learn: 0.5419956	total: 2.68ms	remaining: 131ms
2:	learn: 0.5415754	total: 3.85ms	remaining: 125ms
3:	learn: 0.5411391	total: 5.05ms	remaining: 121ms
4:	learn: 0.5407268	total: 6.32ms	remaining: 120ms
5:	learn: 0.5403494	total: 7.66ms	remaining: 120ms
6:	learn: 0.5399174	total: 8.95ms	remaining: 119ms
7:	learn: 0.5395207	total: 10.4ms	remaining: 119ms
8:	learn: 0.5391029	total: 11.6ms	remaining: 118ms
9:	learn: 0.5387456	total: 13ms	remaining: 117ms
10:	learn: 0.5384097	total: 14.5ms	remaining: 117ms
11:	learn: 0.5380078	total: 16ms	remaining: 118ms
12:	learn: 0.5376641	total: 17.4ms	remaining: 117ms
13:	learn: 0.5373332	total: 18.6ms	remaining: 114ms
14:	learn: 0.5370198	total: 19.8ms	remaining: 112ms
15:	learn: 0.5366931	total: 20.9ms	remaining: 110ms
16:	learn: 0.5363652	total: 22.1ms	remaining: 108ms
17:	learn: 0.5360620	total: 23.2ms	remaining: 106ms
18:	learn: 0.53

In [34]:
x_test.shape

(1874, 100)

## aplicar el modelo con los mejores parametros

In [58]:
model =  CatBoostRegressor (depth=10, iterations=500, learning_rate= 0.03,l2_leaf_reg=7, verbose=False )
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
median_ae = median_absolute_error(y_test, y_pred)
print("R-squared (R²):", r2)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Median Absolute Error:", median_ae)


R-squared (R²): 0.22506058791049366
Mean Absolute Error (MAE): 0.3098282869210403
Mean Squared Error (MSE): 0.2064669238217391
Root Mean Squared Error (RMSE): 0.45438631561892256
Median Absolute Error: 0.213204105556283


R^2 Score: 0.21391357818408718
