In [None]:
import pandas as pd

# Read the four CSV files into module-level DataFrames (same order as listed).
articles, customers, sample_submission, transactions_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'articles.csv',
        'customers.csv',
        'sample_submission.csv',
        'transactions_train.csv',
    )
)

# Preview every frame: a heading, the first rows, then the column index.
for heading, frame in (
    ("Articles Dataset:", articles),
    ("\nCustomers Dataset:", customers),
    ("\nSample Submission Dataset:", sample_submission),
    ("\nTransactions Train Dataset:", transactions_train),
):
    print(heading)
    print(frame.head())
    print(frame.columns)


#### Curating the dataset

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

def engineer_customer_features(customer_id, transactions_df, articles_df, customers_df, reference_date=None):
    """
    Extract rich features for a customer based on their transaction history

    Args:
        customer_id (str): The customer's unique identifier
        transactions_df (pd.DataFrame): Transaction history data; the columns
            'customer_id', 't_dat', 'article_id' and 'price' are read
        articles_df (pd.DataFrame): Article metadata; 'article_id' plus the
            product type/group, colour group, garment group and index group
            name columns are read
        customers_df (pd.DataFrame): Customer metadata keyed by 'customer_id'
        reference_date (datetime or str, optional): Date from which
            'days_since_last_purchase' is measured. Anything accepted by
            pd.to_datetime works; defaults to 2020-09-22.

    Returns:
        dict: Structured feature dictionary with the keys 'customer_id',
            'customer_meta', 'purchase_history', 'product_preferences',
            'temporal_patterns' and 'style_preferences'. For a customer with
            no transactions 'purchase_history' is an empty list and the three
            preference/pattern sections are empty dicts.
    """
    # Normalise the reference date (None -> end of dataset; strings, dates and
    # Timestamps are all converted to a pandas Timestamp)
    reference_date = pd.to_datetime('2020-09-22' if reference_date is None else reference_date)

    # Get customer metadata; unknown customers get explicit None placeholders
    meta_rows = customers_df[customers_df['customer_id'] == customer_id]
    if meta_rows.empty:
        customer_meta = {
            'age': None,
            'club_member_status': None,
            'fashion_news_frequency': None
        }
    else:
        customer_meta = meta_rows.iloc[0].to_dict()

    # Filter transactions for this customer (copy so the caller's frame is untouched)
    customer_txns = transactions_df[transactions_df['customer_id'] == customer_id].copy()

    # Handle empty transaction history
    if customer_txns.empty:
        return {
            'customer_id': customer_id,
            'customer_meta': customer_meta,
            'purchase_history': [],
            'product_preferences': {},
            'temporal_patterns': {},
            'style_preferences': {}
        }

    # Ensure datetime format, then order chronologically. A stable sort keeps
    # same-day purchases in their original row order, so 'recent_article_ids'
    # is deterministic.
    customer_txns['t_dat'] = pd.to_datetime(customer_txns['t_dat'])
    customer_txns = customer_txns.sort_values('t_dat', kind='mergesort')

    # Enrich transactions with article data
    enriched_txns = pd.merge(
        customer_txns,
        articles_df[['article_id', 'product_type_name', 'product_group_name',
                     'colour_group_name', 'garment_group_name', 'index_group_name']],
        on='article_id',
        how='left'
    )

    def top_values(column, limit):
        """Most frequent values of an article attribute; missing metadata counts as 'Unknown'."""
        return Counter(enriched_txns[column].fillna('Unknown')).most_common(limit)

    def value_shares(values):
        """Map each distinct value to its share of all purchases."""
        counts = Counter(values)
        total = sum(counts.values())
        return {value: count / total for value, count in counts.items()}

    last_purchase = customer_txns['t_dat'].max()

    # Calculate purchase history features
    purchase_history = {
        'total_purchases': len(customer_txns),
        'first_purchase_date': customer_txns['t_dat'].min().strftime('%Y-%m-%d'),
        'last_purchase_date': last_purchase.strftime('%Y-%m-%d'),
        'avg_price': float(customer_txns['price'].mean()),
        'recent_article_ids': customer_txns.tail(10)['article_id'].tolist()
    }

    # Calculate product preference features
    product_preferences = {
        'product_type': top_values('product_type_name', 5),
        'product_group': top_values('product_group_name', 3),
        'index_group': top_values('index_group_name', 3)
    }

    # Purchase frequency: mean gap in whole days between consecutive distinct
    # purchase timestamps. Computed with Series.diff so it does not depend on
    # whether Series.unique() returns numpy datetime64 values (which have no
    # `.days` attribute) or pandas Timestamps.
    purchase_gaps = customer_txns['t_dat'].drop_duplicates().diff().dropna()
    if len(purchase_gaps) > 0:
        avg_days_between_purchases = float(purchase_gaps.dt.days.sum()) / len(purchase_gaps)
    else:
        avg_days_between_purchases = 0

    # Calculate temporal pattern features (measured against reference_date)
    temporal_patterns = {
        'day_distribution': value_shares(enriched_txns['t_dat'].dt.dayofweek),
        'month_distribution': value_shares(enriched_txns['t_dat'].dt.month),
        'avg_days_between_purchases': avg_days_between_purchases,
        'days_since_last_purchase': (reference_date - last_purchase).days
    }

    # Calculate style preference features
    style_preferences = {
        'color': top_values('colour_group_name', 5),
        'garment_group': top_values('garment_group_name', 3)
    }

    # Assemble all features
    return {
        'customer_id': customer_id,
        'customer_meta': customer_meta,
        'purchase_history': purchase_history,
        'product_preferences': product_preferences,
        'temporal_patterns': temporal_patterns,
        'style_preferences': style_preferences
    }

In [None]:
def create_training_dataset(transactions_df, articles_df, customers_df, max_samples=10000, train_cutoff='2020-08-15'):
    """
    Create a training dataset for the recommendation model

    Purchases strictly before `train_cutoff` describe the customer (model
    input); the distinct articles bought in the 7 days starting at the cutoff
    form the target. Only customers with transactions in both periods are used.

    Args:
        transactions_df (pd.DataFrame): Transaction history data. It is only
            read; the frame passed in is not modified.
        articles_df (pd.DataFrame): Article metadata
        customers_df (pd.DataFrame): Customer metadata
        max_samples (int): Maximum number of customer samples to use
        train_cutoff (str): Date to split training and target purchases

    Returns:
        list: List of training examples, each a dict with the keys "input"
            (prompt text) and "output" ("customer_id, article_id1, ...")
    """
    import random

    # Convert cutoff to datetime; the target window is the 7 days from the cutoff
    cutoff_date = pd.to_datetime(train_cutoff)
    target_end_date = cutoff_date + pd.Timedelta(days=7)

    # Parse the dates into a separate Series instead of overwriting the
    # caller's 't_dat' column, then split into training and target periods.
    purchase_dates = pd.to_datetime(transactions_df['t_dat'])
    train_txns = transactions_df[purchase_dates < cutoff_date]
    target_txns = transactions_df[(purchase_dates >= cutoff_date) &
                                  (purchase_dates < target_end_date)]

    # Find customers who have both training and target transactions. Sorting
    # gives a deterministic order: a bare set of strings iterates in
    # hash-dependent order, which would make the seeded sample below differ
    # from run to run.
    train_customers = set(train_txns['customer_id'].dropna())
    target_customers = set(target_txns['customer_id'].dropna())
    eligible_customers = sorted(train_customers & target_customers)

    # If we have too many customers, sample a subset. A private generator
    # keeps the draw reproducible without reseeding the global `random` state.
    if len(eligible_customers) > max_samples:
        eligible_customers = random.Random(42).sample(eligible_customers, max_samples)

    print(f"Creating training data for {len(eligible_customers)} customers...")

    # Restrict every frame to the selected customers and group once, so the
    # per-customer work below does not rescan the complete tables.
    train_by_customer = train_txns[train_txns['customer_id'].isin(eligible_customers)].groupby('customer_id')
    targets_by_customer = (
        target_txns[target_txns['customer_id'].isin(eligible_customers)]
        .groupby('customer_id')['article_id']
        .unique()
    )
    eligible_meta = customers_df[customers_df['customer_id'].isin(eligible_customers)]

    # Create training examples
    training_examples = []

    for i, customer_id in enumerate(eligible_customers):
        if i % 100 == 0:
            print(f"Processing customer {i}/{len(eligible_customers)}")

        # Get features from training period - the cutoff is the reference
        # date, so recency is measured relative to the prediction point
        features = engineer_customer_features(
            customer_id,
            train_by_customer.get_group(customer_id),
            articles_df,
            eligible_meta,
            reference_date=cutoff_date
        )

        # Distinct target articles, in order of first appearance
        target_articles = targets_by_customer[customer_id].tolist()

        # Create input prompt
        input_prompt = format_prompt_from_features(features)

        # Create target output (customer_id, article_id1, article_id2, ...)
        target_output = f"{customer_id}, {', '.join(map(str, target_articles))}"

        training_examples.append({
            "input": input_prompt,
            "output": target_output
        })

    return training_examples

def format_prompt_from_features(features):
    """
    Format customer features into a text prompt for the model

    Missing metadata values (None or NaN) are rendered as 'Unknown', and the
    empty feature sections returned for a customer without purchase history
    are accepted: counts fall back to 0 and dates to 'Unknown'.

    Args:
        features (dict): Customer feature dictionary with the keys
            'customer_id', 'customer_meta', 'purchase_history',
            'product_preferences', 'temporal_patterns' and 'style_preferences'

    Returns:
        str: Formatted prompt
    """
    def shown(value):
        """Render a metadata value, mapping None/NaN to 'Unknown'."""
        return 'Unknown' if value is None or pd.isna(value) else value

    def ranked(pairs, limit):
        """Join the first `limit` (name, count) pairs as 'name (count purchases)'."""
        return ', '.join(f"{name} ({count} purchases)" for name, count in pairs[:limit])

    # Extract metadata. A plain .get(key, 'Unknown') is not enough because the
    # keys are normally present with a None/NaN value.
    customer_meta = features.get('customer_meta') or {}
    age = shown(customer_meta.get('age'))
    membership = shown(customer_meta.get('club_member_status'))
    fashion_news = shown(customer_meta.get('fashion_news_frequency'))

    # Purchase history (an empty list/dict means "no purchases")
    purchase_history = features.get('purchase_history') or {}
    total_purchases = purchase_history.get('total_purchases', 0)
    last_purchase = purchase_history.get('last_purchase_date', 'Unknown')
    avg_price = purchase_history.get('avg_price', 0.0)
    recent_articles = purchase_history.get('recent_article_ids', [])[-5:]  # Last 5 purchases

    # Product and style preferences
    product_preferences = features.get('product_preferences') or {}
    style_preferences = features.get('style_preferences') or {}
    product_types = ranked(product_preferences.get('product_type', []), 3)
    product_groups = ranked(product_preferences.get('product_group', []), 2)
    garment_groups = ranked(style_preferences.get('garment_group', []), 2)
    colors = ranked(style_preferences.get('color', []), 3)

    # Temporal patterns. The frequency is 30 divided by the average gap in
    # days (gap clamped to at least 1); no measurable gap gives 0.
    temporal_patterns = features.get('temporal_patterns') or {}
    days_since_last = temporal_patterns.get('days_since_last_purchase', 'Unknown')
    avg_gap = temporal_patterns.get('avg_days_between_purchases', 0)
    purchase_frequency = 30 / max(1, avg_gap) if avg_gap > 0 else 0

    recent_text = ', '.join(map(str, recent_articles))

    # Format the prompt
    prompt = f"""Customer {features['customer_id']} details:
- Age: {age}
- Membership Status: {membership}
- Fashion News Frequency: {fashion_news}

Purchase History:
- Total Purchases: {total_purchases}
- Last Purchase Date: {last_purchase} ({days_since_last} days ago)
- Purchase Frequency: {purchase_frequency:.2f} items per month
- Average Price Point: {avg_price:.4f}
- Recent Purchases: {recent_text}

Product Preferences:
- Favorite Product Types: {product_types}
- Favorite Product Groups: {product_groups}
- Favorite Garment Groups: {garment_groups}

Style Preferences:
- Color Preferences: {colors}

Based on this customer's profile and purchase history, recommend the specific article IDs they are most likely to purchase in the next 7 days.

Recommendations:"""

    return prompt

In [None]:
def build_and_save_training_data(transactions_df, articles_df, customers_df, output_path='train_data.csv', samples=5000):
    """
    Generate the training examples, persist them as CSV and hand them back.

    Args:
        transactions_df (pd.DataFrame): Transaction history data
        articles_df (pd.DataFrame): Article metadata
        customers_df (pd.DataFrame): Customer metadata
        output_path (str): Destination CSV file
        samples (int): Upper bound on the number of customers, forwarded to
            create_training_dataset as max_samples

    Returns:
        pd.DataFrame: One row per training example, as written to output_path
    """
    # Build the examples and wrap them in a frame in one go
    train_df = pd.DataFrame(
        create_training_dataset(transactions_df, articles_df, customers_df, max_samples=samples)
    )

    # Persist without the positional index
    train_df.to_csv(output_path, index=False)

    # Report where the data went and how much was produced
    for message in (
        f"Training data saved to {output_path}",
        f"Total examples: {len(train_df)}",
    ):
        print(message)

    return train_df

In [None]:
def analyze_training_data(train_df, transactions_df, articles_df):
    """
    Analyze the quality of the generated training data

    Prints prompt-length and target-size statistics plus the ten most common
    product types among the distinct target articles, and returns the same
    numbers.

    Args:
        train_df (pd.DataFrame): The training data with 'input' and 'output'
            columns; each output is "customer_id, article_id1, ..."
        transactions_df (pd.DataFrame): Original transaction data (accepted
            for interface compatibility; not read here)
        articles_df (pd.DataFrame): Article metadata with 'article_id' and
            'product_type_name'

    Returns:
        dict: Analysis statistics with the keys 'example_count',
            'prompt_length', 'article_counts' and 'top_product_types'
    """
    print(f"Total training examples: {len(train_df)}")

    # Check input prompt length
    prompt_lengths = train_df['input'].str.len()
    print(f"Prompt length statistics:")
    print(f"  Mean: {prompt_lengths.mean():.2f} characters")
    print(f"  Min: {prompt_lengths.min()} characters")
    print(f"  Max: {prompt_lengths.max()} characters")

    # Parse every target output once. The first comma-separated field is the
    # customer_id; the remaining fields are counted, and those that are purely
    # numeric are collected as article ids.
    article_counts = []
    all_target_articles = []
    for output in train_df['output']:
        article_fields = output.split(',')[1:]
        article_counts.append(len(article_fields))
        all_target_articles.extend(
            int(field.strip()) for field in article_fields if field.strip().isdigit()
        )

    # Guard the aggregates so an empty example list reports zeros instead of raising
    mean_articles = np.mean(article_counts) if article_counts else 0.0
    min_articles = min(article_counts, default=0)
    max_articles = max(article_counts, default=0)

    print(f"Target article count statistics:")
    print(f"  Mean: {mean_articles:.2f} articles per customer")
    print(f"  Min: {min_articles} articles")
    print(f"  Max: {max_articles} articles")

    # Get article metadata for targets (one row per matching catalogue article)
    target_articles_df = articles_df[articles_df['article_id'].isin(all_target_articles)]
    distinct_target_articles = len(target_articles_df)

    # Product type distribution. The counts come from target_articles_df, so
    # the percentage uses that frame's size as denominator; dividing by the
    # number of target mentions (which repeats articles bought by several
    # customers) would understate every share.
    product_type_dist = target_articles_df['product_type_name'].value_counts().head(10)
    print("\nTop 10 product types in target recommendations:")
    for ptype, count in product_type_dist.items():
        share = count / distinct_target_articles * 100
        print(f"  {ptype}: {count} ({share:.2f}%)")

    # Return analysis results
    return {
        'example_count': len(train_df),
        'prompt_length': {
            'mean': prompt_lengths.mean(),
            'min': prompt_lengths.min(),
            'max': prompt_lengths.max()
        },
        'article_counts': {
            'mean': mean_articles,
            'min': min_articles,
            'max': max_articles
        },
        'top_product_types': product_type_dist.to_dict()
    }

In [None]:
def main():
    """Run the pipeline end to end: load the CSVs, build and analyze the
    examples, write an 80/20 train/validation split and print one example.

    Returns:
        tuple: (DataFrame of all generated examples, analysis statistics dict)
    """
    print("Loading datasets...")
    articles = pd.read_csv("articles.csv")
    customers = pd.read_csv("customers.csv")
    transactions = pd.read_csv("transactions_train.csv")

    print("Preparing training dataset...")
    # A reduced sample keeps the first test runs short
    examples_df = build_and_save_training_data(
        transactions, articles, customers,
        samples=1000,
    )

    print("\nAnalyzing training data quality...")
    stats = analyze_training_data(examples_df, transactions, articles)

    # Hold out 20% of the examples for validation (fixed random_state)
    from sklearn.model_selection import train_test_split
    fit_part, holdout_part = train_test_split(examples_df, test_size=0.2, random_state=42)

    # Write both parts next to the full dataset
    fit_part.to_csv("train_data_train.csv", index=False)
    holdout_part.to_csv("train_data_val.csv", index=False)

    print(f"\nSaved {len(fit_part)} training examples and {len(holdout_part)} validation examples")

    # Print the first example: prompt clipped to 500 characters, target in full
    print("\nExample training input:")
    print(examples_df['input'].iloc[0][:500] + "...")
    print("\nExample training output:")
    print(examples_df['output'].iloc[0])

    return examples_df, stats

# Run the pipeline when this code executes as the main module; main() returns
# the generated examples DataFrame and the analysis dict, kept here as
# module-level names.
if __name__ == "__main__":
    train_df, analysis = main()

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the facebook/opt-1.3b tokenizer and causal-LM weights in float16 and
# move the model to the CUDA device.
# NOTE(review): a later cell rebinds `model_name`, `tokenizer` and `model` to
# Qwen/Qwen-7B-Chat - confirm whether this cell is still needed.
model_name = "facebook/opt-1.3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_config, get_peft_model, TaskType

# ------------------------------
# 1. Load the Training Dataset
# ------------------------------
# The two files named here are the splits that main() writes with to_csv;
# they are exposed as the "train" and "validation" splits of `dataset`.
data_files = {"train": "train_data_train.csv", "validation": "train_data_val.csv"}
dataset = load_dataset("csv", data_files=data_files)
# Print the column names of the train split as a quick schema check.
print("Training dataset columns:", dataset["train"].column_names)

Training dataset columns: ['input', 'output']


In [3]:
import os
# Debugging switch; the training cell assigns the same variable again under
# its "debugging and GPU configuration" section.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [19]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import get_peft_config, get_peft_model, TaskType

# Set environment variables for debugging and GPU configuration
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
torch.cuda.empty_cache()

# Only query device properties when a GPU is actually present
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")

# ------------------------------
# 1. Load Model and Tokenizer (Qwen 7B Chat)
# ------------------------------
model_name = "Qwen/Qwen-7B-Chat"  # Model repository ID on Hugging Face
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# The padding collator used for training needs a pad token. When the
# tokenizer defines none, fall back to its end-of-document id (exposed by the
# Qwen tokenizer as `eod_id`) or, failing that, to the EOS id.
if tokenizer.pad_token_id is None:
    fallback_pad_id = getattr(tokenizer, "eod_id", None)
    if fallback_pad_id is None:
        fallback_pad_id = tokenizer.eos_token_id
    tokenizer.pad_token_id = fallback_pad_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto"
)

# Gradient checkpointing: the generation KV cache is incompatible with it, and
# because the base weights are frozen under LoRA the embedding outputs must
# require grad or no gradient reaches the adapters through the checkpoints.
model.config.use_cache = False
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
print("Model and tokenizer loaded successfully.")

# ------------------------------
# 2. Load the Training Dataset
# ------------------------------
data_files = {"train": "train_data_train.csv", "validation": "train_data_val.csv"}
dataset = load_dataset("csv", data_files=data_files)
print("Training dataset columns:", dataset["train"].column_names)

# Check for empty inputs/outputs (None or empty string) in a single pass
print("Checking for empty entries...")
empty_inputs = []
empty_outputs = []
for row_index, row in enumerate(dataset["train"]):
    if not row["input"]:
        empty_inputs.append(row_index)
    if not row["output"]:
        empty_outputs.append(row_index)
print(f"Empty inputs: {empty_inputs}")
print(f"Empty outputs: {empty_outputs}")

# ------------------------------
# 3. Set Up LoRA Fine-Tuning (PEFT)
# ------------------------------
# LoRA can only attach to module names that exist in the loaded model.
# q_proj/k_proj/v_proj/o_proj are the usual attention projection names, but
# the first-generation Qwen architecture names its layers c_attn/c_proj/w1/w2.
# Use whichever of the preferred names the model really has, else the Qwen set.
preferred_lora_targets = ["q_proj", "v_proj", "k_proj", "o_proj"]
qwen_v1_lora_targets = ["c_attn", "c_proj", "w1", "w2"]
model_module_names = {name.rsplit(".", 1)[-1] for name, _ in model.named_modules()}
lora_target_modules = (
    [name for name in preferred_lora_targets if name in model_module_names]
    or qwen_v1_lora_targets
)
print(f"LoRA target modules: {lora_target_modules}")

peft_config = get_peft_config({
    "peft_type": "LORA",
    "task_type": TaskType.CAUSAL_LM,
    "inference_mode": False,
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "target_modules": lora_target_modules,
})
# autocast_adapter_dtype is a boolean switch (cast half-precision adapter
# weights to float32), not a dtype argument.
model = get_peft_model(model, peft_config, autocast_adapter_dtype=True)
model.print_trainable_parameters()

# ------------------------------
# 4. Preprocessing Function
# ------------------------------
def preprocess_function(examples):
    """Tokenize a batch of prompt/target pairs for causal-LM fine-tuning.

    Each example becomes "Input: <input>\\nOutput: <output>". The labels are a
    copy of the token ids in which the leading prompt tokens are replaced by
    -100.

    Args:
        examples: Batch mapping with parallel "input" and "output" lists.

    Returns:
        The tokenizer output for the combined texts with an added "labels" list.
    """
    # NOTE(review): nothing here appends an end-of-sequence token to the
    # target, and truncation at 384 tokens cuts from the end of the combined
    # text (the target) - confirm both are intended.
    pairs = list(zip(examples["input"], examples["output"]))
    prompts = [f"Input: {inp}\nOutput:" for inp, _ in pairs]
    combined_texts = [prompt + " " + target for prompt, (_, target) in zip(prompts, pairs)]

    # Token count of every prompt on its own (special tokens included)
    prompt_token_counts = [
        len(tokenizer(prompt, add_special_tokens=True)["input_ids"])
        for prompt in prompts
    ]

    # Tokenize the combined texts unpadded; padding is left to the collator
    tokenized = tokenizer(
        combined_texts,
        max_length=384,
        truncation=True,
        padding=False,
        return_tensors=None
    )

    # Labels: -100 over the prompt span (capped at the sequence length),
    # followed by the remaining token ids unchanged
    labels = []
    for token_ids, prompt_tokens in zip(tokenized["input_ids"], prompt_token_counts):
        masked_span = min(prompt_tokens, len(token_ids))
        labels.append([-100] * masked_span + list(token_ids[masked_span:]))

    tokenized["labels"] = labels
    return tokenized

# Process datasets
def _tokenize_split(split, progress_label):
    """Run preprocess_function over one split in batches, dropping its raw columns."""
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
        desc=progress_label,
    )

tokenized_train = _tokenize_split(dataset["train"], "Processing training dataset")
tokenized_val = _tokenize_split(dataset["validation"], "Processing validation dataset")

# Collator that pads each batch and returns PyTorch tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True, return_tensors="pt")

# ------------------------------
# 5. Training Arguments and Trainer Setup
# ------------------------------
training_options = {
    # where checkpoints go and how many are kept
    "output_dir": "./qwen7b_finetuned",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    # batch sizes and schedule
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    # optimisation
    "learning_rate": 2e-5,
    "warmup_ratio": 0.05,
    "weight_decay": 0.01,
    "max_grad_norm": 0.5,
    "fp16": True,
    # evaluation, logging and best-checkpoint selection
    "evaluation_strategy": "epoch",
    "logging_steps": 10,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
}
training_args = TrainingArguments(**training_options)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ------------------------------
# 6. Fine-Tuning
# ------------------------------
print("Starting training...")
trainer.train()

# Write the final model to its own directory
print("Saving model...")
trainer.save_model("./qwen7b_finetuned_final")
print("Training complete!")


GPU available: True
GPU name: NVIDIA GeForce RTX 4090
GPU memory: 25.756696576 GB


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- cpp_kernels.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- qwen_generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- cpp_kernels.py
- qwen_generation_utils.py
. Make sure to double-check they do not contain any ad

KeyboardInterrupt: 

In [15]:
# Show up to five training rows as a format check (input clipped to 100 characters)
train_rows = dataset['train']
for row_number in range(min(5, len(train_rows))):
    row = train_rows[row_number]
    print(f"Example {row_number}:")
    print(f"Input: {row['input'][:100]}...")
    print(f"Output: {row['output']}")
    print()

Example 0:
Input: Customer 24fe302f04d039c18035f650f3702aab8ef038fa3a8be71a03168e67eb176eb0 details:
- Age: 48.0
- Mem...
Output: 24fe302f04d039c18035f650f3702aab8ef038fa3a8be71a03168e67eb176eb0, 762096003, 762096005

Example 1:
Input: Customer 361294c454f95944fafc18dc22e45a3093a41c636a77ad92b12bec7de43249f9 details:
- Age: 24.0
- Mem...
Output: 361294c454f95944fafc18dc22e45a3093a41c636a77ad92b12bec7de43249f9, 754238023

Example 2:
Input: Customer 2ae89a0dd849301a82bbc69a646e4df18c31f60bad72e8ef2e9caffe611f46cb details:
- Age: 25.0
- Mem...
Output: 2ae89a0dd849301a82bbc69a646e4df18c31f60bad72e8ef2e9caffe611f46cb, 713699005

Example 3:
Input: Customer 7e0a41e39052a62e30cbe0d370518516a3919e307ea6fc8cfad7b24bdd6488cb details:
- Age: 55.0
- Mem...
Output: 7e0a41e39052a62e30cbe0d370518516a3919e307ea6fc8cfad7b24bdd6488cb, 757303021, 859125005, 713997031, 850251002

Example 4:
Input: Customer 6c051182fe486826cdd331070e5af84f481293d7e61af6c86397dfa04284b534 details:
- Age: 25.0
- Mem...
Output