## Jupyter Notebook para desarrollar la Task 3

Sistema de recomendación de 5 productos para cada sesión nueva.

Lectura de los DataFrames:

In [58]:
from pathlib import Path    
import pandas as pd

RAW_DATA_PATH = Path('../../data/raw')

train_df = pd.read_csv(         RAW_DATA_PATH / 'train.csv')
test_df = pd.read_csv(          RAW_DATA_PATH / 'test.csv')
products_df = pd.read_pickle(   RAW_DATA_PATH / 'products.pkl')
users_df = pd.read_csv(         RAW_DATA_PATH / 'users_data.csv')

In [59]:
import pandas as pd
import json

# Extraer datos del string (JSON) de la columna 'values'
def extract_data_from_string(df, column_name):
    df['country'] = df[column_name].str.extract(r"'country': \[(.*?)\]").map(lambda x: int(x.split(', ')[0]))
    df['R'] = df[column_name].str.extract(r"'R': \[(.*?)\]").map(lambda x: int(x.split(', ')[0]))
    df['F'] = df[column_name].str.extract(r"'F': \[(.*?)\]").map(lambda x: int(x.split(', ')[0]))
    df['M'] = df[column_name].str.extract(r"'M': \[(.*?)\]").map(lambda x: float(x.split(', ')[0]))
    
    return df.drop(columns=[column_name])

users_df = extract_data_from_string(users_df, 'values')

In [60]:
products_df['cod_section'] = products_df['cod_section'].fillna(0)
train_df['pagetype'] = train_df['pagetype'].fillna(0)
test_df['pagetype'] = test_df['pagetype'].fillna(0)
train_df['user_id'] = train_df['user_id'].fillna(0)
test_df['user_id'] = test_df['user_id'].fillna(0)

In [65]:
users_df.loc[:,'country'] = users_df['country'].astype('category')

products_df.loc[:,'cod_section'] = products_df['cod_section'].astype('int').astype('category')
products_df.loc[:,'color_id'] = products_df['color_id'].astype('category')
products_df.loc[:,'family'] = products_df['family'].astype('category')
products_df.loc[:,'discount'] = products_df['discount'].astype('int')

train_df.loc[:,'user_id'] = train_df['user_id'].astype('int')
train_df.loc[:,'add_to_cart'] = train_df['add_to_cart'].astype('int')
train_df.loc[:,'country'] = train_df['country'].astype('category')
train_df.loc[:,'device_type'] = train_df['device_type'].astype('category')
train_df.loc[:,'pagetype'] = train_df['pagetype'].astype('category')

test_df.loc[:,'user_id'] = test_df['user_id'].astype('int')
test_df.loc[:,'country'] = test_df['country'].astype('category')
test_df.loc[:,'device_type'] = test_df['device_type'].astype('category')
test_df.loc[:,'pagetype'] = test_df['pagetype'].astype('int').astype('category')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Basic Statistics
def print_df_info(df_name, df):
    print(f"\n=== {df_name} Analysis ===")
    print("\nShape:", df.shape)
    print("\nColumns:", df.columns.tolist())
    print("\nData Types:\n", df.dtypes)
    print("\nMissing Values:\n", df.isnull().sum())
    print("\nSample:\n", df.head())
    print("\nDescriptive Stats:\n", df.describe())

for df_name, df in [('users_df', users_df), 
                    ('products_df', products_df), 
                    ('train_df', train_df)]:
    print_df_info(df_name, df)

# 2. Users Analysis
plt.figure(figsize=(15, 5))

plt.subplot(131)
users_df['country'].value_counts().plot(kind='bar')
plt.title('Users by Country')
plt.xticks(rotation=45)

plt.subplot(132)
users_df['R'].hist()
plt.title('Recency Distribution')

plt.subplot(133)
users_df['M'].hist()
plt.title('Monetary Value Distribution')
plt.tight_layout()
plt.show()

# 3. Products Analysis
plt.figure(figsize=(15, 5))

plt.subplot(131)
products_df['cod_section'].value_counts().plot(kind='bar')
plt.title('Products by Section')
plt.xticks(rotation=45)

plt.subplot(132)
products_df['discount'].hist()
plt.title('Discount Distribution')

# 4. Interactions Analysis
plt.figure(figsize=(15, 5))

plt.subplot(131)
train_df['add_to_cart'].value_counts(normalize=True).plot(kind='bar')
plt.title('Add to Cart Distribution')

plt.subplot(132)
train_df['device_type'].value_counts().plot(kind='bar')
plt.title('Device Types')

plt.subplot(133)
train_df['pagetype'].value_counts().plot(kind='bar')
plt.title('Page Types')
plt.tight_layout()
plt.show()

# 5. Key Metrics
print("\n=== Key Metrics ===")
print(f"Total Users: {len(users_df)}")
print(f"Total Products: {len(products_df)}")
print(f"Total Interactions: {len(train_df)}")
print(f"Add to Cart Rate: {train_df['add_to_cart'].mean():.2%}")

# User engagement
user_interactions = train_df.groupby('user_id').size()
print("\nInteractions per User:")
print(user_interactions.describe())

# Product popularity
product_popularity = train_df.groupby('partnumber').agg({
    'add_to_cart': ['count', 'mean']
}).round(3)
print("\nProduct Popularity:")
print(product_popularity.describe())

In [68]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# 1. Merge datasets
def prepare_data(train_df, users_df, products_df):
    # Merge all dataframes
    full_df = train_df.merge(users_df, on='user_id', how='left')\
                      .merge(products_df, on='partnumber', how='left')
    
    # Create user-item matrix
    interaction_matrix = pd.pivot_table(
        full_df,
        values='add_to_cart',
        index='user_id',
        columns='partnumber',
        fill_value=0
    )
    
    return full_df, interaction_matrix

# 2. Create and train model
def train_recommendation_model(interaction_matrix):
    # Convert to sparse matrix
    sparse_matrix = csr_matrix(interaction_matrix.values)
    
    # Initialize model
    model = NearestNeighbors(n_neighbors=10, metric='cosine')
    model.fit(sparse_matrix)
    
    return model, sparse_matrix

# 3. Get recommendations
def get_recommendations(model, sparse_matrix, user_idx, n_items=5):
    # Get similar items
    distances, indices = model.kneighbors(
        sparse_matrix[user_idx].reshape(1, -1),
        n_neighbors=n_items+1
    )
    
    return indices[0][1:]  # Exclude the user itself

In [69]:
# Main execution
# Prepare data
full_df, interaction_matrix = prepare_data(train_df, users_df, products_df)

# Train model
model, sparse_matrix = train_recommendation_model(interaction_matrix)

# Example recommendation for first user
user_idx = 0
recommendations = get_recommendations(model, sparse_matrix, user_idx)
print(f"Recommendations for user {interaction_matrix.index[user_idx]}:")
print(recommendations)

# Cold start strategy
popular_products = train_df[train_df['add_to_cart']==1]['partnumber'].value_counts().head(5)
print("\nPopular products (cold start):")
print(popular_products)

  num_cells = num_rows * num_columns


ValueError: negative dimensions are not allowed