In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [109]:
df = pd.read_csv('book_match_model.csv', sep=';') 

In [110]:
df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Genre Principal,Genres Secondaires,Image-URL-L,Moyenne_Adolescent 12-17,Moyenne_Adulte 26-35,Moyenne_Adulte 36-50,Moyenne_Enfant 1-12,Moyenne_Jeune adulte 18-25,Moyenne_Senior 50+
0,000617695X,The Great & Secret Show,Barker Clive,1990-01-01,HarperCollins Publishers,Horror,"Fantasy, Fiction, Fantasy, Dark Fantasy, Thril...",http://images.amazon.com/images/P/000617695X.0...,,6.00,,,,
1,0007144350,Collins Complete Works of Oscar Wilde (Collins...,Oscar Wilde,2003-01-01,HarperCollins Publishers,Classics,"Fiction, Poetry, Plays, Short Stories, Literat...",http://images.amazon.com/images/P/0007144350.0...,,,10.000000,,,
2,0020199856,LOVE OF THE LAST TYCOON,F. Scott Fitzgerald,1995-01-01,Scribner,Classics,"Fiction, Literature, Literature, American, Nov...",http://images.amazon.com/images/P/0020199856.0...,,8.00,,,,
3,0020360754,Heart Songs and Other Stories,Annie Proulx,1995-01-01,Scribner,Short Stories,"Fiction, Literary Fiction, Literature, America...",http://images.amazon.com/images/P/0020360754.0...,,8.50,5.000000,,,
4,0020421907,Seaward,Susan Cooper,1987-01-01,Simon Pulse,Fantasy,"Young Adult, Fiction, Childrens, Adventure, Ch...",http://images.amazon.com/images/P/0020421907.0...,,8.00,,,9.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2442,8479019395,Amarse Con Los Ojos Abiertos / To Love With Ey...,Jorge Bucay,2003-01-01,Alfaguara/Santillana USA,Psychology,"Fiction, Self Help, Relationships, Philosophy,...",http://images.amazon.com/images/P/8479019395.0...,,5.50,,,,
2443,8483101610,"Soldados De Salamina (Coleccion Andanzas, 433)",Javier Cercas,2001-01-01,Tusquets,Fiction,"Cultural, Spain, European Literature, Spanish ...",http://images.amazon.com/images/P/8483101610.0...,,9.00,8.666667,,6.0,
2444,8484504794,El Misterio De Salem's Lot/Salem's Lot,Stephen King,2003-01-01,"Plaza & Janes Editores, S.A.",Horror,"Fiction, Paranormal, Vampires, Fantasy, Thrill...",http://images.amazon.com/images/P/8484504794.0...,,,,,9.0,
2445,8495618605,El Codigo Da Vinci / The Da Vinci Code,Dan Brown,2003-01-01,Ediciones Urano,Fiction,"Mystery, Thriller, Thriller, Mystery Thriller,...",http://images.amazon.com/images/P/8495618605.0...,6.0,5.75,7.666667,,9.0,


In [111]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor


In [112]:
df = df.copy()

# Nettoyage des genres
df['Genres Secondaires'] = df['Genres Secondaires'].fillna('')
df['Genre Principal'] = df['Genre Principal'].fillna('')

# Combinaison des genres principaux et secondaires
df['genre_combined'] = (df['Genre Principal'] + '|' + df['Genres Secondaires']).str.lower().str.split('[,|]')

In [113]:
# Encodage des genres avec MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre_combined'])

In [114]:
def get_age_group(age):
    if age <= 12:
        return 'Moyenne_Enfant 1-12'
    elif age <= 17:
        return 'Moyenne_Adolescent 12-17'
    elif age <= 25:
        return 'Moyenne_Jeune adulte 18-25'
    elif age <= 35:
        return 'Moyenne_Adulte 26-35'
    elif age <= 50:
        return 'Moyenne_Adulte 36-50'
    else:
        return 'Moyenne_Senior 50+'

In [115]:
def get_model_for_age_group(age_group_column):
    # On filtre les lignes qui ont une note dans cette tranche d’âge
    mask = ~df[age_group_column].isna()
    X = genre_encoded[mask]
    y = df.loc[mask, age_group_column].values

    model = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=5))
    model.fit(X, y)
    return model


In [116]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def get_model_for_age_group(age_group_column):
    # On filtre les lignes qui ont une note dans cette tranche d’âge
    mask = ~df[age_group_column].isna()
    X = genre_encoded[mask]
    y = df.loc[mask, age_group_column].values

    # Définir les paramètres à tester
    param_grid = {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],  # Essayer différentes valeurs de n_neighbors
        'kneighborsregressor__weights': ['uniform', 'distance'],  # Essayer avec et sans pondération des voisins
        'kneighborsregressor__metric': ['euclidean', 'manhattan']  # Tester différentes distances
    }

    # Utilisation de GridSearchCV pour optimiser les hyperparamètres
    model = make_pipeline(StandardScaler(), KNeighborsRegressor())
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)

    # Retourner le modèle optimal
    return grid_search.best_estimator_

def recommander_livres_knn_supervise(user_age, user_genres_principaux, user_genres_secondaires=[], top_k=5):
    # Déterminer la colonne cible correspondant à l'âge de l'utilisateur
    age_group_col = get_age_group(user_age)

    # Vérifier qu'il y a suffisamment de données
    if df[age_group_col].dropna().shape[0] < 5:
        raise ValueError(f"Pas assez de données pour la tranche d'âge {age_group_col}.")

    # Entraîner le modèle pour cette tranche avec optimisation d'hyperparamètres
    model = get_model_for_age_group(age_group_col)

    # Préparer les genres utilisateur
    user_genres = [g.lower() for g in user_genres_principaux + user_genres_secondaires]
    user_vector = mlb.transform([user_genres])
    user_input = user_vector.reshape(1, -1)

    # Prédire la note pour tous les livres de la tranche d’âge
    mask_valid = ~df[age_group_col].isna()
    X_all = genre_encoded[mask_valid]
    df_valid = df[mask_valid].copy()
    df_valid['predicted_rating'] = model.predict(X_all)

    # Filtrer les livres qui ont au moins un genre en commun
    df_valid = df_valid[df_valid['genre_combined'].apply(lambda g: any(gen in g for gen in user_genres))]

    # Trier par prédiction
    top_books = df_valid.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return top_books[['Book-Title', 'Book-Author', 'Publisher', age_group_col, 'predicted_rating']]


In [117]:
user_age = 30
user_genres = ['Fiction']
user_sous_genres = ['Thriller']

recommandations = recommander_livres_knn_supervise(user_age, user_genres, user_sous_genres)
recommandations

Unnamed: 0,Book-Title,Book-Author,Publisher,Moyenne_Adulte 26-35,predicted_rating
183,Betsy and Joe (Betsy & Tacy),Maud Hart Lovelace,HarperTrophy,10.0,9.172222
193,Emily of Deep Valley (Deep Valley Book),Maud Hart Lovelace,HarperTrophy,10.0,9.172222
310,Mansfield Park (Penguin Popular Classics),Jane Austen,Penguin Books Ltd,8.5,9.1
279,Sense and Sensibility,Jane Austen,Penguin USA (Paper),8.5,9.1
750,Mansfield Park (Modern Library Classics),Jane Austen,Modern Library,8.0,9.1


In [127]:
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np

# Colonnes de notes à évaluer
age_group_columns = [
    'Moyenne_Enfant 1-12',
    'Moyenne_Adolescent 12-17',
    'Moyenne_Jeune adulte 18-25',
    'Moyenne_Adulte 26-35',
    'Moyenne_Adulte 36-50',
    'Moyenne_Senior 50+'
]

results = []

# Paramètres KNN à tester
param_grid = {
    'kneighborsregressor__n_neighbors': [3, 5, 7],
    'kneighborsregressor__weights': ['uniform', 'distance'],
    'kneighborsregressor__metric': ['euclidean', 'manhattan']
}

for col in age_group_columns:
    mask = ~df[col].isna()
    if mask.sum() < 5:
        continue

    X = genre_encoded[mask]
    y = df.loc[mask, col].values

    # GridSearch avec pipeline (scaling + KNN)
    pipeline = make_pipeline(StandardScaler(), KNeighborsRegressor())
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X, y)

    # Meilleur modèle
    best_model = grid_search.best_estimator_

    # Prédiction avec validation croisée
    y_pred = cross_val_predict(best_model, X, y, cv=5)
    y_pred = np.clip(y_pred, 0, 10)

    # Scores
    r2 = r2_score(y, y_pred)
    rmse = mean_squared_error(y, y_pred, squared=False)
    mae = mean_absolute_error(y, y_pred)

    results.append({
        'Tranche d’âge': col,
        'R²': round(r2, 3),
        'RMSE': round(rmse, 3),
        'MAE': round(mae, 3),
        'Nb livres': len(y),
        'Best params': grid_search.best_params_
    })

# Affichage
scores_knn_df = pd.DataFrame(results)
display(scores_knn_df)




Unnamed: 0,Tranche d’âge,R²,RMSE,MAE,Nb livres,Best params
0,Moyenne_Enfant 1-12,-0.058,2.002,1.486,65,"{'kneighborsregressor__metric': 'manhattan', '..."
1,Moyenne_Adolescent 12-17,-0.252,1.803,1.504,187,"{'kneighborsregressor__metric': 'euclidean', '..."
2,Moyenne_Jeune adulte 18-25,-0.149,1.597,1.268,978,"{'kneighborsregressor__metric': 'manhattan', '..."
3,Moyenne_Adulte 26-35,-0.134,1.587,1.221,1535,"{'kneighborsregressor__metric': 'manhattan', '..."
4,Moyenne_Adulte 36-50,-0.142,1.582,1.199,1365,"{'kneighborsregressor__metric': 'manhattan', '..."
5,Moyenne_Senior 50+,-0.189,1.648,1.314,743,"{'kneighborsregressor__metric': 'euclidean', '..."


MODELE LINEAIRE 

In [118]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def get_linear_model_for_age_group(age_group_column):
    # Filtrer les livres notés dans cette tranche d'âge
    mask = ~df[age_group_column].isna()
    X = genre_encoded[mask]
    y = df.loc[mask, age_group_column].values

    # Pipeline standardisation + régression linéaire
    model = make_pipeline(StandardScaler(), LinearRegression())
    model.fit(X, y)

    return model


In [119]:
def recommander_livres_regression(user_age, user_genres_principaux, user_genres_secondaires=[], top_k=5):
    # 1. Déterminer la colonne cible (note moyenne) pour la tranche d’âge
    age_group_col = get_age_group(user_age)

    # 2. Vérifier qu’on a assez de données
    if df[age_group_col].dropna().shape[0] < 5:
        raise ValueError(f" Pas assez de données pour la tranche d’âge : {age_group_col}")

    # 3. Entraîner le modèle pour cette tranche d’âge
    model = get_linear_model_for_age_group(age_group_col)

    # 4. Préparer les genres de l’utilisateur
    user_genres = [g.lower() for g in user_genres_principaux + user_genres_secondaires]
    user_vector = mlb.transform([user_genres])
    user_input = user_vector.reshape(1, -1)

    # 5. Prédire les notes pour tous les livres de cette tranche d’âge
    mask_valid = ~df[age_group_col].isna()
    X_all = genre_encoded[mask_valid]
    df_valid = df[mask_valid].copy()
    df_valid['predicted_rating'] = model.predict(X_all)

    # 6. Filtrer les livres qui partagent au moins un genre avec l’utilisateur
    df_valid = df_valid[df_valid['genre_combined'].apply(lambda g: any(gen in g for gen in user_genres))]

    # 7. Trier par note prédite et retourner le top
    top_books = df_valid.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return top_books[['Book-Title', 'Book-Author', 'Publisher', age_group_col, 'predicted_rating']]


In [120]:
recommandations = recommander_livres_regression(
    user_age=30,
    user_genres_principaux=['Fiction'],
    user_genres_secondaires=['Thriller'],
    top_k=5
)

recommandations



Unnamed: 0,Book-Title,Book-Author,Publisher,Moyenne_Adulte 26-35,predicted_rating
1684,The Defense: A Novel (Vintage International),Vladimir Vladimirovich Nabokov,Vintage Books USA,9.0,12.070432
42,The Cheese Monkeys : A Novel in Two Semesters,Chip Kidd,Perennial,8.153846,11.146604
1919,Judy Moody Saves the World (Judy Moody),Megan McDonald,Candlewick Press (MA),10.0,10.933713
1787,The FALL OF A SPARROW: A NOVEL,Robert Hellenga,Scribner,9.0,10.595334
1918,Judy Moody Gets Famous! (Judy Moody),Megan McDonald,Candlewick Press (MA),10.0,10.509397


In [121]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Fonction pour entraîner le modèle Ridge pour une tranche d'âge
def get_linear_model_for_age_group(age_group_column):
    mask = ~df[age_group_column].isna()
    X = genre_encoded[mask]
    y = df.loc[mask, age_group_column].values

    # Ridge Regression avec standardisation
    model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
    model.fit(X, y)

    return model

# Fonction de recommandation
def recommander_livres_regression(user_age, user_genres_principaux, user_genres_secondaires=[], top_k=5):
    # 1. Déterminer la colonne de notes cible
    age_group_col = get_age_group(user_age)

    # 2. Vérifier qu’il y a assez de données pour cette tranche d’âge
    if df[age_group_col].dropna().shape[0] < 5:
        raise ValueError(f"Pas assez de données pour la tranche d’âge : {age_group_col}")

    # 3. Entraîner le modèle Ridge
    model = get_linear_model_for_age_group(age_group_col)

    # 4. Préparer les genres de l’utilisateur
    user_genres = [g.lower() for g in user_genres_principaux + user_genres_secondaires]
    user_vector = mlb.transform([user_genres])
    user_input = user_vector.reshape(1, -1)

    # 5. Prédire les notes pour tous les livres valides
    mask_valid = ~df[age_group_col].isna()
    X_all = genre_encoded[mask_valid]
    df_valid = df[mask_valid].copy()

    # Prédiction + clipping entre 0 et 10
    df_valid['predicted_rating'] = model.predict(X_all)
    df_valid['predicted_rating'] = df_valid['predicted_rating'].clip(0, 10)

    # 6. Filtrer les livres qui partagent au moins un genre avec l'utilisateur
    df_valid = df_valid[df_valid['genre_combined'].apply(lambda g: any(gen in g for gen in user_genres))]

    # 7. Trier et retourner les meilleurs livres
    top_books = df_valid.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return top_books[['Book-Title', 'Book-Author', 'Publisher', age_group_col, 'predicted_rating']]


In [122]:
recommandations = recommander_livres_regression(
    user_age=30,
    user_genres_principaux=['Fiction'],
    user_genres_secondaires=['Thriller'],
    top_k=5
)

recommandations


Unnamed: 0,Book-Title,Book-Author,Publisher,Moyenne_Adulte 26-35,predicted_rating
1442,Lila: An Inquiry Into Morals,Robert M. Pirsig,Bantam Books,9.0,10.0
1919,Judy Moody Saves the World (Judy Moody),Megan McDonald,Candlewick Press (MA),10.0,10.0
559,Miracle on the 17th Green : A Novel about Life...,James Patterson,Back Bay Books,10.0,10.0
2343,The Lighthouse at the End of the World,Jules Verne,Fredonia Books (NL),10.0,9.998467
1107,Chorus Skating,Alan Dean Foster,Aspect,10.0,9.998447


In [125]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Tranches d'âge
age_group_columns = [
    'Moyenne_Enfant 1-12',
    'Moyenne_Adolescent 12-17',
    'Moyenne_Jeune adulte 18-25',
    'Moyenne_Adulte 26-35',
    'Moyenne_Adulte 36-50',
    'Moyenne_Senior 50+'
]

results = []

# Boucle sur chaque tranche
for col in age_group_columns:
    mask = ~df[col].isna()
    if mask.sum() < 5:
        continue  # Trop peu de données

    X = genre_encoded[mask]
    y = df.loc[mask, col].values

    # Modèle Ridge avec standardisation
    model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))

    # Validation croisée
    y_pred = cross_val_predict(model, X, y, cv=5)
    y_pred = np.clip(y_pred, 0, 10)

    # Scores
    r2 = r2_score(y, y_pred)
    rmse = mean_squared_error(y, y_pred, squared=False)
    mae = mean_absolute_error(y, y_pred)

    results.append({
        "Tranche d'âge": col,
        "R²": round(r2, 3),
        "RMSE": round(rmse, 3),
        "MAE": round(mae, 3),
        "Nb livres": mask.sum()
    })

# Résultats en DataFrame
scores_df = pd.DataFrame(results)
display(scores_df)




Unnamed: 0,Tranche d'âge,R²,RMSE,MAE,Nb livres
0,Moyenne_Enfant 1-12,-0.637,2.491,1.784,65
1,Moyenne_Adolescent 12-17,-2.54,3.033,2.358,187
2,Moyenne_Jeune adulte 18-25,-1.443,2.329,1.744,978
3,Moyenne_Adulte 26-35,-0.959,2.086,1.584,1535
4,Moyenne_Adulte 36-50,-0.865,2.021,1.534,1365
5,Moyenne_Senior 50+,-1.981,2.61,1.943,743
