In [None]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import json
import re
from pathlib import Path
from typing import Dict, List, Tuple, Optional

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
# Tunable settings shared by the text model and the recommenders.
TFIDF_MIN_DF = 3            # default `min_df` passed to TfidfVectorizer in build_tfidf
TFIDF_MAX_FEATURES = None   # default `max_features` passed to TfidfVectorizer in build_tfidf
NGRAM_RANGE = (1, 2)        # default `ngram_range` passed to TfidfVectorizer in build_tfidf
TEXT_TOP_N = 50             # neighbours kept per job (text model) and successors kept per job (Markov map)
TOP_K = 10                  # expected number of recommended jobs per session in the submission

This notebook addresses two problems:

#### I. Job prediction 

- Hybrid recommender system combining sequential and textual signals:
1. Markov chain to model job-to-job transitions
2. TF-IDF + cosine similarity for cold start and content fallback
Final ranking based on transition probability boosted by text similarity

#### II. Action Prediction
- Logistic Regression classifier, predicts next user action (e.g., apply vs view)
- Uses the trailing number of recent "view" interactions as the input feature, which captures a short-term engagement-intensity signal

# I. Job Prediction

## 1. Loading Data

In [None]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this directory exists; consider making it configurable.
repo_dir = Path("/Users/enfants/Code/Job Offer Recommendation")
# All input files (CSV sessions and the job-listings JSON) are read from here.
data_dir = repo_dir / "Data"

In [None]:
# Session inputs and targets. Later cells read the columns `session_id`,
# `job_ids`, `actions` (inputs) and `job_id`, `action` (targets).
X_train = pd.read_csv(data_dir / "x_train.csv")
y_train = pd.read_csv(data_dir / "y_train.csv")
X_test  = pd.read_csv(data_dir / "x_test.csv")

In [None]:
# Quick size check of the train and test inputs: (rows, columns).
print(X_train.shape)
print(X_test.shape)

In [None]:
def load_jobs_json(path):
    """
    Read a job-listings JSON file into a two-column DataFrame.

    The file must contain a single JSON object mapping job ids to raw text:
    {
        "0": "TEXT...",
        "1": "TEXT...",
        ...
    }

    Returns a DataFrame with columns:
    - job_id (int), converted from the JSON keys
    - job_text_raw (str), the JSON values, in file order

    Raises ValueError when the top-level JSON value is not an object.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if not isinstance(payload, dict):
        raise ValueError("Expected JSON to be a dict of {job_id: text}")

    frame = pd.DataFrame({
        "job_id": list(payload),
        "job_text_raw": list(payload.values()),
    })
    # JSON object keys are always strings; downstream code expects integer ids.
    frame["job_id"] = frame["job_id"].astype(int)
    return frame


In [None]:
# One row per job listing: integer `job_id` plus the raw listing text.
jobs_df = load_jobs_json(data_dir / "job_listings.json")

print(jobs_df.shape)
jobs_df.head()


## 2. Preprocessing

### 2.1 Text Preprocessing

In [None]:
def clean_text(text: str) -> str:
    """
    Normalise a job description for TF-IDF.

    Lowercases the text, replaces every character that is not a word
    character, whitespace, '-', '/' or '+' with a space, and collapses runs of
    whitespace into single spaces with no leading/trailing space.
    Non-string input (e.g. NaN or None) yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        squeezed = re.sub(r"\s+", " ", lowered)
        # keep letters/digits/_ and a few separators that carry meaning (c++, ci/cd, full-time)
        filtered = re.sub(r"[^\w\s\-\/\+]", " ", squeezed)
        # the substitution above can create new runs of spaces
        return re.sub(r"\s+", " ", filtered).strip()
    return ""

In [None]:
# Add the cleaned text, then keep only the id and the cleaned text
# (the raw text column is dropped from jobs_df from here on).
jobs_df["job_text"] = jobs_df["job_text_raw"].map(clean_text)
jobs_df = jobs_df[["job_id", "job_text"]]

print(jobs_df.shape)
jobs_df.head()

### 2.2 Job Sequence Processing

In [None]:
def parse_list_column(df, col):
    """
    Turn a column of stringified Python literals (e.g. "[1, 2, 3]") into the
    objects they describe.

    Cells that are already non-string values are left untouched, so running
    the function twice on the same column is harmless. The column is replaced
    in place on `df`, and the same `df` object is returned.
    """
    def _decode(cell):
        # literal_eval only evaluates Python literals, never arbitrary code
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    df[col] = df[col].apply(_decode)
    return df


# Parse the stringified list columns of the train and test inputs into real lists.
X_train = parse_list_column(X_train, "job_ids")
X_train = parse_list_column(X_train, "actions")

X_test  = parse_list_column(X_test, "job_ids")
X_test  = parse_list_column(X_test, "actions")


# Attach the targets (`job_id`, `action`) to each training session.
df = X_train.merge(y_train, on="session_id")


# Rebuild the full sequences: observed history followed by the target as the
# last element, so `seq[:-1]` is the history and `seq[-1]` the label.
df["full_job_sequence"] = (
    df["job_ids"] + df["job_id"].apply(lambda x: [x])
)

df["full_action_sequence"] = (
    df["actions"] + df["action"].apply(lambda x: [x])
)

## 3. Evaluation: MRR

In [None]:
# Hold out 20% of the sessions for validation. The split is done on session
# ids, with a fixed seed so it is reproducible.

train_sessions, val_sessions = train_test_split(
    df["session_id"].unique(),
    test_size=0.2,
    random_state=42
)

train_df = df[df["session_id"].isin(train_sessions)]
val_df = df[df["session_id"].isin(val_sessions)]

In [None]:
# implement metrics : Mean Reciprocal Ranking

def compute_mrr(y_true, y_pred, k=10):
    """
    Mean Reciprocal Rank at k.

    y_true: iterable of true job_ids, one per session
    y_pred: iterable of ranked predictions (list/tuple/array), one per
            session, best candidate first
    k:      only the first k predictions of each session are considered

    A session scores 1/rank when the true job is among its top-k predictions
    (rank is 1-based) and 0 otherwise. Returns the mean over all sessions, or
    0.0 when there are no sessions to score.
    """
    reciprocal_ranks = []
    for true_job, preds in zip(y_true, y_pred):
        # list() lets tuples / arrays be searched the same way as lists
        top_preds = list(preds[:k])
        if true_job in top_preds:
            reciprocal_ranks.append(1 / (top_preds.index(true_job) + 1))
        else:
            reciprocal_ranks.append(0.0)

    # Empty evaluation set: avoid dividing by zero.
    if not reciprocal_ranks:
        return 0.0
    return sum(reciprocal_ranks) / len(reciprocal_ranks)


In [None]:
def evaluate_topk(y_true, y_pred, k=10):
    """
    Diagnostic statistics for top-k recommendations.

    y_true : list[int]        true job id per session
    y_pred : list[list[int]]  ranked predictions per session, best first
    k : int                   number of leading predictions considered

    Returns a dict with:
        - miss_rate          : share of sessions whose true job is not in the top-k
        - mean_rank_when_hit : mean 1-based rank of the true job over the hits
        - min_rank           : best (smallest) rank among the hits
        - max_rank           : worst (largest) rank among the hits

    The rank fields are None when there is no hit; every field is None when
    there are no sessions at all.

    Raises AssertionError when y_true and y_pred differ in length.
    """

    assert len(y_true) == len(y_pred), "y_true and y_pred must have same length"

    n_sessions = len(y_true)
    ranks = []

    for true_job, preds in zip(y_true, y_pred):
        top_preds = list(preds[:k])
        if true_job in top_preds:
            ranks.append(top_preds.index(true_job) + 1)

    # Empty evaluation set: a miss rate is undefined, so report None instead
    # of dividing by zero.
    miss_rate = 1 - (len(ranks) / n_sessions) if n_sessions else None

    if ranks:
        mean_rank = sum(ranks) / len(ranks)
        min_rank = min(ranks)
        max_rank = max(ranks)
    else:
        mean_rank = None
        min_rank = None
        max_rank = None

    return {
        "miss_rate": miss_rate,
        "mean_rank_when_hit": mean_rank,
        "min_rank": min_rank,
        "max_rank": max_rank
    }


## 4. Text Model (TF-IDF → vector by job_id)

In [None]:
def build_tfidf(
    jobs_df: pd.DataFrame,
    min_df: int = TFIDF_MIN_DF,
    max_features: int = TFIDF_MAX_FEATURES,
    ngram_range: Tuple[int, int] = NGRAM_RANGE,
) -> Tuple[TfidfVectorizer, np.ndarray, List[int]]:
    """
    Fit a TF-IDF (Term Frequency – Inverse Document Frequency) model on the
    job texts. TF-IDF turns each text into a numeric vector in which terms
    frequent in the document but rare in the corpus weigh the most.

    Parameters:
      - jobs_df: frame with `job_id` and `job_text` columns (missing texts are
        treated as empty strings)
      - min_df, max_features, ngram_range: forwarded to TfidfVectorizer

    Returns:
      - the fitted vectorizer
      - tfidf_matrix, as returned by `fit_transform` (one row per job)
      - job_ids as Python ints, in the same order as the matrix rows
    """
    tfidf = TfidfVectorizer(
        strip_accents="unicode",
        ngram_range=ngram_range,
        max_features=max_features,
        min_df=min_df,
    )
    corpus = jobs_df["job_text"].fillna("").tolist()
    row_job_ids = jobs_df["job_id"].astype(int).tolist()

    return tfidf, tfidf.fit_transform(corpus), row_job_ids

In [None]:
def build_text_neighbors(
    tfidf_matrix,
    job_ids: List[int],
    top_n: int = 50,
    n_jobs: int = -1
) -> Dict[int, List[Tuple[int, float]]]:
    """
    Precompute, for every job, its most similar jobs by cosine similarity.

    Parameters:
      - tfidf_matrix: one row per job (as produced by build_tfidf)
      - job_ids: job id of each matrix row, in row order
      - top_n: maximum number of neighbours kept per job
      - n_jobs: parallelism passed to NearestNeighbors

    Returns mapping:
      job_id -> [(neighbor_job_id, similarity), ...] sorted by similarity desc

    A job never appears in its own neighbour list. When the corpus has fewer
    than top_n + 1 rows, each list holds all other jobs. An empty matrix
    yields an empty mapping.

    Raises ValueError when job_ids and the matrix rows differ in number.
    """
    n_rows = tfidf_matrix.shape[0]
    if len(job_ids) != n_rows:
        raise ValueError(
            f"job_ids has {len(job_ids)} entries but tfidf_matrix has {n_rows} rows"
        )
    if n_rows == 0:
        return {}

    nn = NearestNeighbors(
        # +1 because the closest is itself; capped because kneighbors rejects
        # n_neighbors larger than the number of fitted samples
        n_neighbors=min(top_n + 1, n_rows),
        metric="cosine",
        algorithm="brute",
        n_jobs=n_jobs
    )
    nn.fit(tfidf_matrix)

    distances, indices = nn.kneighbors(tfidf_matrix, return_distance=True)

    id_by_row = np.array(job_ids)
    neighbors_map: Dict[int, List[Tuple[int, float]]] = {}

    for row_idx in range(indices.shape[0]):
        src_job_id = int(id_by_row[row_idx])
        row_neighbors = []
        for d, j in zip(distances[row_idx], indices[row_idx]):
            neighbor_job_id = int(id_by_row[j])
            # Filter by id rather than position: with duplicate texts the job
            # itself is not guaranteed to be the first neighbour returned.
            if neighbor_job_id == src_job_id:
                continue
            sim = float(1.0 - d)  # cosine distance -> similarity
            row_neighbors.append((neighbor_job_id, sim))
        neighbors_map[src_job_id] = row_neighbors[:top_n]

    return neighbors_map

In [None]:
# Fit TF-IDF on all job texts with the default settings from the config cell.
vectorizer, tfidf_matrix, tfidf_job_ids = build_tfidf(jobs_df)

# (number of jobs, vocabulary size)
print(tfidf_matrix.shape)

In [None]:
# Nearest text neighbours for every job (brute-force cosine search over the whole corpus).
text_neighbors = build_text_neighbors(tfidf_matrix, tfidf_job_ids, top_n=TEXT_TOP_N)

# Spot check: five closest jobs to job id 100 (raises KeyError if that id is not in jobs_df).
text_neighbors[100][:5]

## 5. Markov Model

### 5.1 Build Transition Matrix

In [None]:
# extract all transitions (job_i to job_i+1)
#
# Only training sessions are counted. Each full sequence ends with the
# session's target job, so counting validation sessions would put the exact
# (last job -> target) pairs being evaluated into the model and inflate every
# validation MRR computed from it. Rebuild from `df` before the final
# submission if the model should use every labelled session.

transition_counts = defaultdict(int)

for seq in train_df["full_job_sequence"]:
    for src_job, dst_job in zip(seq, seq[1:]):
        transition_counts[(src_job, dst_job)] += 1


In [None]:
# Number of distinct (from_job, to_job) pairs observed.
len(transition_counts)

In [None]:
# Job-to-job transitions in long format: one row per observed
# (from_job, to_job) pair with its occurrence count.

trans_df = pd.DataFrame(
    [(i, j, c) for (i, j), c in transition_counts.items()],
    columns=["from_job", "to_job", "count"]
)

In [None]:
# How many job pairs were seen once, twice, ... (log-scaled y axis).
trans_df["count"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Transition count")
plt.ylabel("Number of job-to-job pairs")
plt.title("Distribution of job-to-job transition counts")
plt.tight_layout()
plt.yscale("log")
plt.show()

In [None]:
# Normalise counts into conditional probabilities P(to_job | from_job):
# given that the candidate is on job A, what is the probability that they move to job B?
# Each count is divided by the total count of its from_job.

trans_df["prob"] = (
    trans_df["count"] /
    trans_df.groupby("from_job")["count"].transform("sum")
)

In [None]:
# Preview of the transition table.
trans_df.head()

In [None]:
# Sanity check: the probabilities of each from_job should sum to 1.
trans_df.groupby("from_job")["prob"].sum().head()

In [None]:
def build_markov_map(trans_df: pd.DataFrame, top_n: int = TEXT_TOP_N) -> Dict[int, List[Tuple[int, float]]]:
    """
    Index the transition table by source job for fast lookup.

    Parameters:
      - trans_df: frame with `from_job`, `to_job` and `prob` columns
      - top_n: maximum number of successors kept per source job

    Returns mapping:
      from_job -> [(to_job, prob), ...] sorted by prob desc, at most top_n entries
    """
    ordered = trans_df.sort_values(["from_job", "prob"], ascending=[True, False])
    return {
        int(src): [
            (int(dst), float(prob))
            for dst, prob in group.head(top_n)[["to_job", "prob"]].values.tolist()
        ]
        for src, group in ordered.groupby("from_job")
    }

In [None]:
# Fallback for sessions whose last job never appears as a `from_job`:
# the 10 jobs most often reached by a transition, most popular first.

popular_jobs = (
    trans_df.groupby("to_job")["count"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)

In [None]:
# implement top 10 from last job
# "If the last job viewed is X, recommend the 10 most likely jobs after X."

def recommend_next_jobs(last_job, trans_df, popular_jobs, k=10):
    """
    First-order Markov baseline: the k most likely jobs after `last_job`.

    Parameters:
      - last_job: id of the last job seen in the session
      - trans_df: transition table with `from_job`, `to_job`, `prob` columns
      - popular_jobs: fallback job ids, most popular first
      - k: number of recommendations wanted

    Returns the successors of `last_job` ordered by decreasing probability.
    When fewer than k successors are known (including none at all), the list
    is completed with `popular_jobs` not already recommended, so it holds k
    items whenever enough popular jobs are available.
    """
    candidates = trans_df[trans_df["from_job"] == last_job]
    recs = (
        candidates
        .sort_values("prob", ascending=False)
        .head(k)["to_job"]
        .tolist()
    )

    # Pad short lists: every unused slot in the top-k is a guaranteed miss.
    if len(recs) < k:
        chosen = set(recs)
        for job in popular_jobs:
            if job in chosen:
                continue
            recs.append(job)
            chosen.add(job)
            if len(recs) == k:
                break

    return recs


In [None]:
# Smoke test on the first source job of the transition table.
example_job = trans_df["from_job"].iloc[0]
recommend_next_jobs(example_job, trans_df, popular_jobs)

In [None]:
# Coverage comparison: jobs with a text description vs jobs seen as the
# source / destination of at least one transition.
print("Jobs texte:", jobs_df["job_id"].nunique())
print("Jobs Markov from_job:", trans_df["from_job"].nunique())
print("Jobs Markov to_job:", trans_df["to_job"].nunique())

In [None]:
def build_job_popularity(trans_df: pd.DataFrame, seq_col: str = "full_job_sequence") -> Dict[int, float]:
    """
    Relative frequency of every job across all session sequences.

    Parameters:
      - trans_df: despite its name, a sessions frame whose `seq_col` column
        holds one list of job ids per row
      - seq_col: name of that column

    Returns mapping job_id -> share of all job occurrences (the shares sum to
    1), or an empty dict when there are no occurrences.
    """
    tally = defaultdict(int)
    for sequence in trans_df[seq_col]:
        for job_id in sequence:
            tally[job_id] += 1

    # `or 1` only guards the empty case; the result is then an empty dict anyway.
    denominator = sum(tally.values()) or 1
    return {job_id: n / denominator for job_id, n in tally.items()}

In [None]:
# Popularity is estimated on training sessions only: each full sequence ends
# with the session's target job, so including validation sessions would leak
# the labels used for offline evaluation.
job_popularity = build_job_popularity(train_df)

### 5.2 Session-aware recommender

In [None]:
# Session-aware scoring needs the successors of every job in the session, so
# index the transition table once instead of filtering it per lookup.

# Dictionary from_job -> list[(to_job, prob)], in trans_df row order (not sorted by prob).

next_map = defaultdict(list)
for r in trans_df.itertuples(index=False):
    next_map[r.from_job].append((r.to_job, r.prob))

In [None]:
def recommend_from_session(job_seq, next_map, popular_jobs, k=10):
    """
    Recommend jobs by summing transition probabilities over the whole session.

    Parameters:
      - job_seq: job ids visited in the session
      - next_map: from_job -> [(to_job, prob), ...]
      - popular_jobs: fallback list, most popular first
      - k: number of recommendations

    Every visited job votes for its successors with weight `prob`; candidates
    are ranked by total vote. Returns `popular_jobs[:k]` when no visited job
    has a known successor.
    """
    totals = {}
    for visited in job_seq:
        for candidate, prob in next_map.get(visited, []):
            totals[candidate] = totals.get(candidate, 0.0) + prob

    if totals:
        ordered = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
        return [candidate for candidate, _ in ordered[:k]]

    # no signal at all for this session
    return popular_jobs[:k]


In [None]:
# add weighted score by recency

def recommend_from_session_recency(job_seq, next_map, popular_jobs, k=10):
    """
    Same voting scheme as recommend_from_session, with a linear recency weight.

    The job at 1-based position p in a session of length n votes with weight
    p / n, so the most recent job counts fully and the oldest counts 1 / n.
    Returns `popular_jobs[:k]` when no visited job has a known successor.
    """
    n_visited = len(job_seq)
    totals = {}

    for position, visited in enumerate(job_seq, start=1):
        recency = position / n_visited   # linear recency
        for candidate, prob in next_map.get(visited, []):
            totals[candidate] = totals.get(candidate, 0.0) + recency * prob

    if totals:
        ordered = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
        return [candidate for candidate, _ in ordered[:k]]

    return popular_jobs[:k]


In [None]:
# Worked example on one validation session: compare the plain and the
# recency-weighted session recommenders.

seq = val_df.iloc[2]["full_job_sequence"]
history = seq[:-1]        # everything except the target
true_job = seq[-1]        # the job to predict

preds_plain = recommend_from_session(history, next_map, popular_jobs)
preds_recent = recommend_from_session_recency(history, next_map, popular_jobs)

print("History:", history)
print("True job:", true_job)
print("Plain:", preds_plain)
print("Rank plain:", preds_plain.index(true_job) + 1 if true_job in preds_plain else "MISS")
print("Recency:", preds_recent)
print("Rank recency:", preds_recent.index(true_job) + 1 if true_job in preds_recent else "MISS")

In [None]:
# Test different history lengths: only the last k jobs of a session are used.

k_values = [1, 3, 5, 7, 10]

def predict_last_k(seq, k):
    """Return the last k items of `seq`, or `seq` itself when it has fewer than k items."""
    if len(seq) < k:
        return seq
    return seq[-k:]

# MRR@10 of the recency-weighted recommender for each history length.
results = {}

for k in k_values:
    y_true = []
    y_pred = []

    for _, row in val_df.iterrows():
        seq = row["full_job_sequence"]
        # history = last k jobs before the target; the target itself is excluded
        history = predict_last_k(seq[:-1], k)
        true_job = seq[-1]

        # note: the loop variable k is the history length; the number of
        # recommendations stays fixed at 10
        preds = recommend_from_session_recency(
            history,
            next_map,
            popular_jobs,
            k=10
        )

        y_true.append(true_job)
        y_pred.append(preds)

    mrr = compute_mrr(y_true, y_pred, k=10)
    results[k] = mrr
    print(f"k={k} → MRR@10 = {mrr:.4f}")


Performance decreases as k, the number of most recent jobs used as history, increases.
The best MRR@10 is obtained with **k = 1**, meaning the last visited job is the most informative.


This confirms that user intent shifts quickly and that older interactions add noise rather than signal.

## 6. Hybrid Recommender

In [None]:
def recommend_jobs(session_job_ids, k=10):
    """
    Hybrid recommender: Markov successors of the last job, topped up with
    text neighbours and then with globally popular jobs.

    Reads the module-level `markov_map`, `text_neighbors` and
    `job_popularity`, which must be built before the first call.

    Parameters:
      - session_job_ids: job ids visited in the session, oldest first
      - k: number of recommendations wanted

    Jobs already visited in the session are never recommended. Returns the
    candidate ids ordered by decreasing score; the list can be shorter than k
    when too few candidates exist.
    """
    visited = set(session_job_ids)
    anchor = session_job_ids[-1] if session_job_ids else None
    scores = {}

    # 1. Markov: transition probability from the last job
    if anchor in markov_map:
        scores.update({
            job: prob
            for job, prob in markov_map[anchor]
            if job not in visited
        })

    # 2. Text: only consulted when Markov alone cannot fill the top-k
    if len(scores) < k and anchor in text_neighbors:
        for job, similarity in text_neighbors[anchor]:
            if job in visited:
                continue
            scores[job] = scores.get(job, 0) + 0.8 * similarity

    # 3. Popularity fallback, small weight so it mostly fills remaining slots
    if len(scores) < k:
        by_popularity = sorted(job_popularity.items(), key=lambda kv: kv[1], reverse=True)
        for job, share in by_popularity:
            if job not in visited:
                scores[job] = scores.get(job, 0) + 0.1 * share
            # bound the candidate pool
            if len(scores) >= 200:
                break

    ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [job for job, _ in ordered[:k]]

In [None]:
def recommend_markov_only(session_job_ids: List[int], markov_map, popularity, k=10) -> List[int]:
    """
    Markov-only baseline with a popularity fallback.

    Parameters:
      - session_job_ids: job ids visited in the session, oldest first
      - markov_map: from_job -> [(to_job, prob), ...]
      - popularity: job_id -> popularity share (may be empty)
      - k: number of recommendations wanted

    Candidates are the successors of the last job that were not already
    visited, scored by transition probability. When fewer than k exist,
    unvisited popular jobs are added with a small score (0.01 * share).
    Returns the ids by decreasing score, at most k.
    """
    visited = set(session_job_ids)
    anchor = session_job_ids[-1] if session_job_ids else None

    scores = {}
    if anchor is not None and anchor in markov_map:
        for job, prob in markov_map[anchor]:
            if job in visited:
                continue
            scores[job] = max(scores.get(job, 0.0), prob)

    # fallback popularity
    if popularity and len(scores) < k:
        by_popularity = sorted(popularity.items(), key=lambda kv: kv[1], reverse=True)
        for job, share in by_popularity:
            if job not in visited:
                scores[job] = scores.get(job, 0.0) + 0.01 * share
                # bound the candidate pool
                if len(scores) >= 500:
                    break

    ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [job for job, _ in ordered[:k]]

In [None]:
def recommend_text_only(session_job_ids, k=10):
    """
    Text-only baseline: the jobs most similar to the last job of the session.

    Reads the module-level `text_neighbors` mapping
    (job_id -> [(neighbor_job_id, similarity), ...], most similar first).

    Parameters:
      - session_job_ids: job ids visited in the session, oldest first
      - k: number of recommendations wanted

    Returns up to k neighbour ids not already visited in the session. The
    list is empty when the session is empty or the last job has no
    precomputed neighbours.
    """
    # An empty session has no last job to look up.
    if len(session_job_ids) == 0:
        return []

    last_job = session_job_ids[-1]
    seen = set(session_job_ids)
    return [
        j for j, _ in text_neighbors.get(last_job, [])
        if j not in seen
    ][:k]

In [None]:
def safe_recommend(job_ids, k=10):
    """
    Hybrid recommendation completed with popular jobs.

    Calls recommend_jobs and, when it returns fewer than k ids, appends the
    most popular jobs (module-level `job_popularity`) that are neither in the
    session nor already recommended, until k ids are reached or the
    popularity table is exhausted.
    """
    recs = recommend_jobs(job_ids, k=k)
    if len(recs) >= k:
        return recs

    excluded = set(job_ids) | set(recs)
    by_popularity = sorted(job_popularity, key=job_popularity.get, reverse=True)
    for job in by_popularity:
        if job not in excluded:
            recs.append(job)
        if len(recs) == k:
            break

    return recs

## 7. Offline Evaluation

In [None]:
# Compute MRR@10 on validation for the first-order Markov baseline.
# NOTE(review): the score is only meaningful if trans_df and popular_jobs were
# built without the validation sessions — confirm, otherwise it is optimistic.

y_true = []
y_pred = []

for _, row in val_df.iterrows():
    last_job = row["full_job_sequence"][-2]          # last job seen
    true_job = row["full_job_sequence"][-1]              # target job
    
    preds = recommend_next_jobs(
        last_job,
        trans_df,
        popular_jobs,
        k=10
    )
    
    y_true.append(true_job)
    y_pred.append(preds)

# Baseline score, kept as the reference for the later models.

mrr = compute_mrr(y_true, y_pred, k=10)
print(f"MRR@10 (baseline Markov): {mrr:.4f}")


In [None]:
# Miss rate and rank statistics for the y_true / y_pred lists currently in scope.
metrics = evaluate_topk(y_true, y_pred, k=10)
metrics

## 8. Executing

In [None]:
# Fit the TF-IDF model with explicit settings. max_features=None matches
# TFIDF_MAX_FEATURES, so this refits with the same parameters as the
# default-argument call made earlier in the notebook.
vectorizer, tfidf_matrix, tfidf_job_ids = build_tfidf(
    jobs_df,
    min_df=TFIDF_MIN_DF,
    max_features=None,
    ngram_range=NGRAM_RANGE
)

print(tfidf_matrix.shape)

In [None]:
# Module-level neighbour table read by recommend_jobs and recommend_text_only.
text_neighbors = build_text_neighbors(
    tfidf_matrix,
    tfidf_job_ids,
    top_n=TEXT_TOP_N
)

In [None]:
# Module-level lookups read by recommend_jobs / safe_recommend:
# top successors per job, and job popularity estimated on training sessions.
markov_map = build_markov_map(trans_df, top_n=TEXT_TOP_N)
job_popularity = build_job_popularity(train_df)

In [None]:
val_true = val_df["job_id"].tolist()

# Text-only baseline, stored under its own name so it is not mistaken for the
# hybrid score.
val_pred_text = [recommend_text_only(seq, k=10) for seq in val_df["job_ids"]]
mrr_text = compute_mrr(val_true, val_pred_text, k=10)
print("MRR Text Only:", mrr_text)

# Hybrid recommender: the model actually used for the submission.
val_pred = [safe_recommend(seq, k=10) for seq in val_df["job_ids"]]
mrr_hybrid = compute_mrr(val_true, val_pred, k=10)
print("MRR@10 hybrid:", mrr_hybrid)


In [None]:
# Markov-only baseline (with popularity fallback) on the same validation sessions.
val_pred_markov = [recommend_markov_only(seq, markov_map, job_popularity, k=10) for seq in val_df["job_ids"]]
mrr_markov = compute_mrr(val_true, val_pred_markov, k=10)

print("MRR@10 markov:", mrr_markov)
# Difference between `mrr_hybrid` (computed in the previous cell) and the Markov-only score.
print("Delta:", mrr_hybrid - mrr_markov)

# II. Action Prediction

### 1. Small EDA on number of 'views'

In [None]:
# Class balance of the target action (share of each value).

df["action"].value_counts(normalize=True)

In [None]:
# Length of the action history per session.
# NOTE(review): despite the name, n_view counts every action in the history,
# not only "view" actions — confirm this is intended.
df["n_view"] = df["actions"].apply(
    lambda x: len(ast.literal_eval(x)) if isinstance(x, str) else len(x)
)

# Distribution of the next action for each history length (each row sums to 1).
prop = (
    df.groupby("n_view")["action"]
      .value_counts(normalize=True)
      .unstack(fill_value=0)
)

prop.head()

In [None]:
# Stacked bars: share of each next action for every value of n_view.
prop.plot(
    kind="bar",
    stacked=True
)

plt.xlabel("Number of previous views (n_view)")
plt.ylabel("Proportion")
plt.title("Probability of next action given nb. of views in the series")
plt.legend(title="Next action")
plt.tight_layout()
plt.xticks(rotation=0)
plt.show()


In [None]:
def trailing_views(actions):
    """
    Count the consecutive "view" actions at the end of `actions`.

    Scans from the most recent action backwards and stops at the first action
    that is not "view". Returns 0 for an empty history or when the last
    action is not a view.
    """
    streak = 0
    for action in reversed(actions):
        if action != "view":
            break
        streak += 1
    return streak

# Trailing-view streak of each session's action history (the target is excluded).
df["trail_view"] = df["actions"].apply(
    lambda x: trailing_views(ast.literal_eval(x)) if isinstance(x, str) else trailing_views(x)
)


# Distribution of the next action for each streak length (each row sums to 1).
prop2 = (
    df.groupby("trail_view")["action"]
      .value_counts(normalize=True)
      .unstack(fill_value=0)
)

prop2.head()


In [None]:
# Stacked bars: share of each next action for every trailing-view streak length.
prop2.plot(kind="bar", stacked=True)

plt.xlabel("Number of consecutive views at end of sequence")
plt.ylabel("Proportion")
plt.title("Next action distribution vs trailing nb. of views")
plt.legend(title="Next action")
plt.tight_layout()
plt.xticks(rotation=0)
plt.show()

### 2. Logistic regression using one feature: the number of trailing 'view' actions before the next action

In [None]:
# preparing data

action_df = df[["full_action_sequence"]].copy()

# history = all actions except the last one
action_df["action_history"] = action_df["full_action_sequence"].apply(lambda x: x[:-1])

# target = last action
action_df["action_target"] = action_df["full_action_sequence"].apply(lambda x: x[-1])


# feature engineering
# `trailing_views` is defined once, in the EDA section above; it is reused
# here instead of being re-defined, so there is a single source of truth.

action_df["trail_view"] = action_df["action_history"].apply(trailing_views)

In [None]:
# model training

X_action = action_df[["trail_view"]]
y_action = action_df["action_target"]

# Hold out a stratified validation split so the reported metrics are measured
# on sessions the classifier has not seen during fitting.
X_train_ml, X_val_ml, y_train_ml, y_val_ml = train_test_split(
    X_action,
    y_action,
    test_size=0.2,
    random_state=42,
    stratify=y_action,
)

action_clf = LogisticRegression()
action_clf.fit(X_train_ml, y_train_ml)


y_pred = action_clf.predict(X_val_ml)

print(classification_report(y_val_ml, y_pred))

bal_acc = balanced_accuracy_score(y_val_ml, y_pred)
print(f"Balanced accuracy: {bal_acc:.3f}")

# Refit on every labelled session: this is the model used for the final
# predictions.
action_clf.fit(X_action, y_action)


- We train a simple logistic regression using one feature: the number of trailing 'view' actions.
- This feature is highly predictive of the next action.
- The model achieves 83% accuracy, far above the 60% baseline.
- It detects most 'apply' actions, with a recall of 87%.
- The more consecutive 'view' actions at the end of a session, the more likely the next action is to be 'apply'.


### 3. predict_action

In [None]:
def predict_action(actions):
    """
    Predict the next action of a session from its action history.

    actions: list[str] (history only, most recent action last)
    returns: the class predicted by the module-level `action_clf`
             ("view" or "apply")

    The only feature is the trailing-view streak, passed as a one-row frame
    with the same column name used for fitting.
    """
    features = pd.DataFrame({"trail_view": [trailing_views(actions)]})
    return action_clf.predict(features)[0]

# Final Prediction

In [None]:
def predict_session(job_ids, actions):
    """
    Produce both predictions for one session.

    job_ids : list[int]  jobs visited in the session, oldest first
    actions : list[str]  actions taken in the session, oldest first
    returns : (top10_jobs: list[int], action: str)

    Jobs come from the hybrid recommender completed with popular jobs
    (safe_recommend); the action comes from predict_action.
    """
    return safe_recommend(job_ids, k=10), predict_action(actions)


In [None]:
# One submission row per test session: predicted action and ranked job list.
predictions = []

for _, row in X_test.iterrows():
    top10_jobs, action = predict_session(
        row["job_ids"],
        row["actions"]
    )

    predictions.append({
        "session_id": row["session_id"],
        "action": action,
        "job_id": top10_jobs
    })

# Built once from the list of records.
submission_df = pd.DataFrame(predictions)

In [None]:
# Sanity checks on the submission frame.
print(submission_df.shape)

# Number of recommended jobs per row (expected: 10 for every row).
print(submission_df["job_id"].apply(len).value_counts())

# Distribution of the predicted actions.
print(submission_df["action"].value_counts(normalize=True))

In [None]:
# Cold-start rate: share of test sessions whose last job has no known
# outgoing transition, i.e. sessions the Markov model cannot score.
from_jobs = set(trans_df["from_job"].unique())

# Assumes every test session has at least one job — an empty list would
# raise IndexError here.
cold_rate = sum(
    row["job_ids"][-1] not in from_jobs
    for _, row in X_test.iterrows()
) / len(X_test)

cold_rate

## Submission file

In [None]:
# Write the submission under <repo>/output with an auto-incremented version
# number, so earlier submissions are never overwritten.
submission_dir = repo_dir / "output"
submission_dir.mkdir(parents=True, exist_ok=True)

# Find existing versions
existing_files = list(submission_dir.glob("submission_hybrid_v*.csv"))

version_numbers = []

for f in existing_files:
    # the version is the integer following "v" in the file name
    match = re.search(r"v(\d+)", f.stem)
    if match:
        version_numbers.append(int(match.group(1)))

# First run gives v1; afterwards one above the highest version found.
next_version = max(version_numbers, default=0) + 1

file_name = f"submission_hybrid_v{next_version}.csv"
file_path = submission_dir / file_name

submission_df.to_csv(file_path, index=False)

print(f"Saved to: {file_path}")

In [None]:
# Read the file back to confirm it was written and parses as CSV.
check_df = pd.read_csv(file_path)

print("Shape:", check_df.shape)
print(check_df.head())

# Run Whole Pipeline

In [None]:
def run_pipeline():
    """
    Rebuild every model, predict all test sessions and save a new submission.

    Relies on notebook state defined earlier: `jobs_df`, `trans_df`,
    `train_df`, `X_test`, `repo_dir`, the fitted `action_clf`, and the config
    constants.

    Side effects:
      - rebinds the module-level `text_neighbors`, `markov_map` and
        `job_popularity` to the freshly built models
      - writes `submission_hybrid_v<N>.csv` under <repo_dir>/output, where N
        is one above the highest existing version

    Returns the submission DataFrame (columns: session_id, action, job_id).

    Raises AssertionError when a sanity check on the submission fails.
    """
    # The recommenders read these names from module scope. Without `global`
    # the rebuilt models would be local to this function and silently ignored.
    global text_neighbors, markov_map, job_popularity

    print("Building text model...")
    _vectorizer, tfidf_matrix, tfidf_job_ids = build_tfidf(
        jobs_df,
        min_df=TFIDF_MIN_DF,
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE
    )

    text_neighbors = build_text_neighbors(
        tfidf_matrix,
        tfidf_job_ids,
        top_n=TEXT_TOP_N
    )

    print("Building Markov model...")
    markov_map = build_markov_map(trans_df, top_n=TEXT_TOP_N)
    job_popularity = build_job_popularity(train_df)

    print("Generating submission...")
    predictions = []

    for _, row in X_test.iterrows():
        # safe_recommend pads with popular jobs so every row reaches TOP_K
        # items; recommend_jobs alone may return fewer and trip the check below.
        top_jobs = safe_recommend(row["job_ids"], k=TOP_K)

        action = predict_action(row["actions"])

        predictions.append({
            "session_id": row["session_id"],
            "action": action,
            "job_id": top_jobs
        })

    submission_df = pd.DataFrame(predictions)

    print("Sanity checks...")
    assert submission_df.shape[0] == len(X_test)
    assert submission_df["job_id"].apply(len).eq(TOP_K).all()
    assert submission_df.isna().sum().sum() == 0

    print("Saving submission...")

    submission_dir = repo_dir / "output"
    submission_dir.mkdir(parents=True, exist_ok=True)

    # Find existing versions
    existing_files = list(submission_dir.glob("submission_hybrid_v*.csv"))

    version_numbers = []

    for f in existing_files:
        match = re.search(r"v(\d+)", f.stem)
        if match:
            version_numbers.append(int(match.group(1)))

    # First run gives v1; afterwards one above the highest version found.
    next_version = max(version_numbers, default=0) + 1

    file_name = f"submission_hybrid_v{next_version}.csv"
    file_path = submission_dir / file_name

    submission_df.to_csv(file_path, index=False)

    print(f"Saved to: {file_path}")

    print("Done.")
    return submission_df

In [None]:
# End-to-end run: rebuilds the models, predicts the test set and writes a new versioned CSV.
submission_df = run_pipeline()