
# Tourism Package Predictor
**Author:** Venkatesh Rajendran

**Date:** 2026-02-06  


GitHub repo URL : https://github.com/mrvenkatesh/Venkat-TPP-MLOps-Project

HF Space URL : https://huggingface.co/spaces/mrvenkatesh/Venkat-TPP-MLOps-Project



## 1. Environment & Dependencies


In [53]:

# %pip install -q --upgrade pip
# %pip install -q pandas numpy scikit-learn==1.3.2 joblib huggingface_hub gradio xgboost

import os
from pathlib import Path
import pandas as pd, numpy as np

from google.colab import drive
drive.mount('/content/drive/')
np.set_printoptions(legacy='1.25')

PROJECT_ROOT = Path("/content/drive/My Drive/Colab Notebooks/venkat-tpp-mlops-project")
DATA_DIR = PROJECT_ROOT / "data"
ARTIFACTS_DIR = PROJECT_ROOT / "artifacts"
EXPERIMENTS_DIR = PROJECT_ROOT / "experiments"
DEPLOY_DIR = PROJECT_ROOT / "deploy"
GH_WORKFLOWS_DIR = PROJECT_ROOT / ".github" / "workflows"
for d in [DATA_DIR, ARTIFACTS_DIR, EXPERIMENTS_DIR, DEPLOY_DIR, GH_WORKFLOWS_DIR]:
    d.mkdir(parents=True, exist_ok=True)
print("Folders ready:", DATA_DIR, ARTIFACTS_DIR, EXPERIMENTS_DIR, DEPLOY_DIR)


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Folders ready: /content/drive/My Drive/Colab Notebooks/venkat-tpp-mlops-project/data /content/drive/My Drive/Colab Notebooks/venkat-tpp-mlops-project/artifacts /content/drive/My Drive/Colab Notebooks/venkat-tpp-mlops-project/experiments /content/drive/My Drive/Colab Notebooks/venkat-tpp-mlops-project/deploy



## 2. Configuration


In [72]:

import os
HF_TOKEN = os.getenv("HF_TOKEN", "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")
HF_USERNAME = os.getenv("HF_USERNAME", "mrvenkatesh")
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", f"{HF_USERNAME}/Venkat-TPP-MLOps-Project")
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", f"{HF_USERNAME}/Venkat-TPP-MLOps-Project")
HF_SPACE_REPO = os.getenv("HF_SPACE_REPO", f"{HF_USERNAME}/Venkat-TPP-MLOps-Project")
print(HF_DATASET_REPO, HF_MODEL_REPO, HF_SPACE_REPO)


mrvenkatesh/Venkat-TPP-MLOps-Project mrvenkatesh/Venkat-TPP-MLOps-Project mrvenkatesh/Venkat-TPP-MLOps-Project



## 3. Data Registration (Hugging Face **Datasets**)


In [55]:

from pathlib import Path
import pandas as pd

RAW_LOCAL_PATH = DATA_DIR / "tourism.csv"
if not RAW_LOCAL_PATH.exists():
    alt = Path("tourism.csv")
    if alt.exists():
        RAW_LOCAL_PATH.write_bytes(alt.read_bytes())
assert RAW_LOCAL_PATH.exists(), "tourism.csv not found. Upload to working dir or set path."
print("Local dataset:", RAW_LOCAL_PATH)

# optional preview
df_prev = pd.read_csv(RAW_LOCAL_PATH).head()
df_prev


Local dataset: /content/drive/My Drive/Colab Notebooks/venkat-tpp-mlops-project/data/tourism.csv


Unnamed: 0.1,Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,...,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,...,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,...,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,...,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,...,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,5,200005,0,32.0,Company Invited,1,8.0,Salaried,Male,3,...,Basic,3.0,Single,1.0,0,5,1,1.0,Executive,18068.0


In [56]:

try:
    from huggingface_hub import HfApi, HfFolder, upload_file
    if HF_TOKEN:
        HfFolder.save_token(HF_TOKEN)
        api = HfApi()
        try:
            api.create_repo(repo_id=HF_DATASET_REPO, repo_type="dataset", private=False)
        except Exception as e:
            print("Dataset repo may exist:", e)
        upload_file(str(RAW_LOCAL_PATH), path_in_repo="data/tourism.csv", repo_id=HF_DATASET_REPO, repo_type="dataset")
        print("Uploaded tourism.csv")
    else:
        print("HF_TOKEN not set; skipping upload.")
except Exception as e:
    print("Skip HF upload (offline?)", e)


Skip HF upload (offline?) cannot import name 'HfFolder' from 'huggingface_hub' (/usr/local/lib/python3.12/dist-packages/huggingface_hub/__init__.py)



## 4. Data Preparation (Load → Clean → Split → Upload)


In [57]:

import pandas as pd
hf_raw_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/data/tourism.csv"
try:
    df = pd.read_csv(hf_raw_url)
    print("Loaded from HF:", hf_raw_url)
except Exception:
    df = pd.read_csv(DATA_DIR / 'tourism.csv')
    print("Loaded from local fallback")
print(df.shape)


Loaded from local fallback
(4128, 21)


In [58]:

# Clean
import numpy as np

df.columns = [c.strip() for c in df.columns]
if 'Gender' in df.columns:
    df['Gender'] = df['Gender'].replace({'Fe Male':'Female','FeMale':'Female','female':'Female','FEMALE':'Female','male':'Male','MALE':'Male'})
if 'TypeofContact' in df.columns:
    df['TypeofContact'] = df['TypeofContact'].replace({'Self Enquiry ':'Self Enquiry','Company Invited ':'Company Invited'})

categoricals = ['TypeofContact','Occupation','Gender','ProductPitched','MaritalStatus','Designation']
for c in categoricals:
    if c in df.columns:
        df[c] = df[c].astype('category')

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].median())
for c in categoricals:
    if c in df.columns and df[c].isna().any():
        df[c] = df[c].fillna(df[c].mode()[0])

Xy = df.drop(columns=[c for c in ['CustomerID'] if c in df.columns])
assert 'ProdTaken' in Xy.columns
Xy.sample(3)

df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [59]:

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(Xy, test_size=0.2, random_state=42, stratify=Xy['ProdTaken'])
train_df.to_csv(DATA_DIR / 'train.csv', index=False)
test_df.to_csv(DATA_DIR / 'test.csv', index=False)
print(train_df.shape, test_df.shape)


(3302, 20) (826, 20)


In [60]:

try:
    from huggingface_hub import upload_file
    if HF_TOKEN:
        upload_file(str(DATA_DIR / 'train.csv'), path_in_repo='data/train.csv', repo_id=HF_DATASET_REPO, repo_type='dataset')
        upload_file(str(DATA_DIR / 'test.csv'), path_in_repo='data/test.csv', repo_id=HF_DATASET_REPO, repo_type='dataset')
        print('Uploaded splits to HF dataset repo')
    else:
        print('HF_TOKEN not set; skipping split uploads')
except Exception as e:
    print('Skip HF upload (offline?)', e)


Skip HF upload (offline?) HfApi.upload_file() takes 1 positional argument but 2 positional arguments (and 3 keyword-only arguments) were given



## 5. Model Building with Experiment Tracking


In [61]:

import pandas as pd
try:
    train_df = pd.read_csv(f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/data/train.csv")
    test_df = pd.read_csv(f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/data/test.csv")
    print("Loaded from HF splits")
except Exception:
    train_df = pd.read_csv(DATA_DIR / 'train.csv')
    test_df = pd.read_csv(DATA_DIR / 'test.csv')
    print('Loaded local splits')
train_df.head(2)


Loaded local splits


Unnamed: 0.1,Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,3850,0,55.0,Self Enquiry,1,17.0,Small Business,Female,4,4.0,Deluxe,5.0,Unmarried,8.0,1,1,0,1.0,Manager,23118.0
1,2463,0,39.0,Self Enquiry,1,9.0,Salaried,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,4,0,2.0,Executive,22622.0


In [62]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import numpy as np, json, joblib

X_train = train_df.drop(columns=['ProdTaken'])
y_train = train_df['ProdTaken']
X_test = test_df.drop(columns=['ProdTaken'])
y_test = test_df['ProdTaken']

num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X_train.select_dtypes(include=['object','category']).columns.tolist()
preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
    ('num', 'passthrough', num_features)
])

models_grids = [
    ('random_forest', RandomForestClassifier(random_state=42), {
        'model__n_estimators': [100, 300], 'model__max_depth': [None, 8, 16], 'model__min_samples_split': [2, 5]
    }),
    ('gradient_boosting', GradientBoostingClassifier(random_state=42), {
        'model__n_estimators': [100, 200], 'model__learning_rate': [0.05, 0.1], 'model__max_depth': [2, 3]
    })
]

try:
    from xgboost import XGBClassifier
    models_grids.append(('xgboost', XGBClassifier(random_state=42, eval_metric='logloss', tree_method='hist'), {
        'model__n_estimators': [200, 400], 'model__max_depth': [3, 5], 'model__learning_rate': [0.05, 0.1], 'model__subsample': [0.8, 1.0]
    }))
except Exception:
    pass

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, base_model, grid in models_grids:
    pipe = Pipeline(steps=[('preprocess', preprocess), ('model', base_model)])
    gs = GridSearchCV(pipe, param_grid=grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
    gs.fit(X_train, y_train)
    best = gs.best_estimator_
    y_proba = best.predict_proba(X_test)[:,1]
    y_pred = (y_proba >= 0.5).astype(int)
    metrics = {
        'model': name,
        'best_params': gs.best_params_,
        'cv_best_score_roc_auc': float(gs.best_score_),
        'test_roc_auc': float(roc_auc_score(y_test, y_proba)),
        'test_f1': float(f1_score(y_test, y_pred)),
        'test_accuracy': float(accuracy_score(y_test, y_pred))
    }
    results.append(metrics)

Path('venkat-tpp-mlops-project/experiments').mkdir(parents=True, exist_ok=True)
Path('venkat-tpp-mlops-project/experiments/gridsearch_results.json').write_text(json.dumps(results, indent=2))
results


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[{'model': 'random_forest',
  'best_params': {'model__max_depth': None,
   'model__min_samples_split': 2,
   'model__n_estimators': 300},
  'cv_best_score_roc_auc': 0.9501828244274027,
  'test_roc_auc': 0.9621274268526114,
  'test_f1': 0.6612903225806451,
  'test_accuracy': 0.8983050847457628},
 {'model': 'gradient_boosting',
  'best_params': {'model__learning_rate': 0.1,
   'model__max_depth': 3,
   'model__n_estimators': 200},
  'cv_best_score_roc_auc': 0.881486557421048,
  'test_roc_auc': 0.9012286309675352,
  'test_f1': 0.6294820717131474,
  'test_accuracy': 0.887409200968523},
 {'model': 'xgboost',
  'best_params': {'model__learning_rate': 0.1,
   'model__max_depth': 5,
   'model__n_estimators': 400,
   'model__subsample': 0.8},
  'cv_best_score_roc_auc': 0.9248955205493855,
  'test_roc_auc': 0.9459609817732643,
  'test_f1': 0.7804878048780488,
  'test_accuracy': 0.923728813559322}]

In [63]:

# pick best
best_entry = sorted(results, key=lambda d: d['test_roc_auc'], reverse=True)[0]
best_entry


{'model': 'random_forest',
 'best_params': {'model__max_depth': None,
  'model__min_samples_split': 2,
  'model__n_estimators': 300},
 'cv_best_score_roc_auc': 0.9501828244274027,
 'test_roc_auc': 0.9621274268526114,
 'test_f1': 0.6612903225806451,
 'test_accuracy': 0.8983050847457628}

In [64]:

# retrain, persist, create model card
from sklearn.pipeline import Pipeline
import joblib, json

best_name = best_entry['model']
# recover spec
spec = None
for name, base_model, grid in models_grids:
    if name == best_name:
        spec = (name, base_model, grid)
        break
name, base_model, grid = spec
pipe = Pipeline(steps=[('preprocess', preprocess), ('model', base_model)])
pipe.set_params(**best_entry['best_params'])
pipe.fit(X_train, y_train)

Path('venkat-tpp-mlops-project/artifacts').mkdir(parents=True, exist_ok=True)
model_path = Path('venkat-tpp-mlops-project/artifacts') / f"best_model_{best_name}.joblib"
joblib.dump(pipe, model_path)

model_card = f'''# Visit with Us — Wellness Tourism Classifier ({best_name})

**Test metrics**
- ROC AUC: {best_entry['test_roc_auc']:.3f}
- F1: {best_entry['test_f1']:.3f}
- Accuracy: {best_entry['test_accuracy']:.3f}

**Best params**
```json
{json.dumps(best_entry['best_params'], indent=2)}
```
'''
Path('venkat-tpp-mlops-project/README_model.md').write_text(model_card)
model_path


PosixPath('venkat-tpp-mlops-project/artifacts/best_model_random_forest.joblib')

In [65]:

# Register model in HF model hub
try:
    from huggingface_hub import HfApi, HfFolder, upload_file
    if HF_TOKEN:
        HfFolder.save_token(HF_TOKEN)
        api = HfApi()
        try:
            api.create_repo(repo_id=HF_MODEL_REPO, repo_type='model', private=False)
        except Exception as e:
            print('Model repo may exist:', e)
        upload_file('venkat-tpp-mlops-project/artifacts/' + f"best_model_{best_entry['model']}.joblib", 'model.joblib', repo_id=HF_MODEL_REPO, repo_type='model')
        upload_file('venkat-tpp-mlops-project/README_model.md', 'README.md', repo_id=HF_MODEL_REPO, repo_type='model')
        print('Uploaded model and README to', HF_MODEL_REPO)
    else:
        print('HF_TOKEN not set; skipping model registration')
except Exception as e:
    print('Skip model hub registration (offline?)', e)


Skip model hub registration (offline?) cannot import name 'HfFolder' from 'huggingface_hub' (/usr/local/lib/python3.12/dist-packages/huggingface_hub/__init__.py)



### Evaluation Report


In [66]:

from sklearn.metrics import classification_report, confusion_matrix
import joblib
model = joblib.load('venkat-tpp-mlops-project/artifacts/' + f"best_model_{best_entry['model']}.joblib")
proba = model.predict_proba(X_test)[:,1]
pred = (proba >= 0.5).astype(int)
print(classification_report(y_test, pred))
print('Confusion matrix:', confusion_matrix(y_test, pred))


              precision    recall  f1-score   support

           0       0.90      0.99      0.94       667
           1       0.92      0.52      0.66       159

    accuracy                           0.90       826
   macro avg       0.91      0.75      0.80       826
weighted avg       0.90      0.90      0.89       826

Confusion matrix: [[660   7]
 [ 77  82]]



## 6. Model Deployment (Hugging Face **Spaces**)


In [67]:

from textwrap import dedent
from pathlib import Path

DEPLOY_DIR = Path('venkat-tpp-mlops-project/deploy')
DEPLOY_DIR.mkdir(parents=True, exist_ok=True)

app_py = dedent("""
import os
import pandas as pd
import joblib
from huggingface_hub import hf_hub_download
import gradio as gr

MODEL_REPO = os.getenv('HF_MODEL_REPO', 'your-username/visit-with-us-wellness-model')
MODEL_FILENAME = os.getenv('MODEL_FILENAME', 'model.joblib')

local_model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
model = joblib.load(local_model_path)

inputs = [
    gr.Dropdown(["Company Invited","Self Enquiry"], label="TypeofContact"),
    gr.Slider(1,3,step=1,label="CityTier"),
    gr.Number(label="Age"),
    gr.Dropdown(["Salaried","Small Business","Large Business","Free Lancer"], label="Occupation"),
    gr.Dropdown(["Male","Female"], label="Gender"),
    gr.Slider(1,5,step=1,label="PreferredPropertyStar"),
    gr.Dropdown(["Single","Married","Divorced","Unmarried"], label="MaritalStatus"),
    gr.Number(label="NumberOfPersonVisiting"),
    gr.Number(label="NumberOfTrips"),
    gr.Dropdown([0,1], label="Passport"),
    gr.Dropdown([0,1], label="OwnCar"),
    gr.Number(label="NumberOfChildrenVisiting"),
    gr.Dropdown(["Basic","Standard","Deluxe","Super Deluxe","King"], label="ProductPitched"),
    gr.Number(label="PitchSatisfactionScore"),
    gr.Number(label="NumberOfFollowups"),
    gr.Number(label="DurationOfPitch"),
    gr.Dropdown(["Executive","Manager","Senior Manager","AVP","VP"], label="Designation"),
    gr.Number(label="MonthlyIncome")
]

columns = [
    'TypeofContact','CityTier','Age','Occupation','Gender','PreferredPropertyStar','MaritalStatus',
    'NumberOfPersonVisiting','NumberOfTrips','Passport','OwnCar','NumberOfChildrenVisiting','ProductPitched',
    'PitchSatisfactionScore','NumberOfFollowups','DurationOfPitch','Designation','MonthlyIncome'
]

LOG_PATH = 'inference_log.csv'

def predict(*vals):
    X = pd.DataFrame([vals], columns=columns)
    proba = model.predict_proba(X)[:,1][0]
    pred = int(proba >= 0.5)
    row = X.copy()
    row['pred_proba'] = proba
    row['pred_label'] = pred
    if os.path.exists(LOG_PATH):
        log_df = pd.read_csv(LOG_PATH)
        log_df = pd.concat([log_df, row], ignore_index=True)
    else:
        log_df = row
    log_df.to_csv(LOG_PATH, index=False)
    return {"Purchase Probability": float(proba), "Predicted Class (ProdTaken)": int(pred)}

iface = gr.Interface(fn=predict, inputs=inputs, outputs=[gr.JSON()],
                     title="Wellness Package Purchase Prediction",
                     description="Predict purchase probability for the Wellness Tourism Package.")

if __name__ == '__main__':
    iface.launch()
""")

requirements_txt = dedent("""
scikit-learn==1.3.2
pandas
numpy
joblib
huggingface_hub
gradio==3.50.2
""")

dockerfile_txt = dedent("""
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py /app/app.py
EXPOSE 7860
CMD ["python", "app.py"]
""")

push_space_py = dedent("""
import os
from huggingface_hub import HfApi, HfFolder, upload_file
HF_TOKEN = os.getenv('HF_TOKEN', '')
HF_SPACE_REPO = os.getenv('HF_SPACE_REPO', 'your-username/visit-with-us-wellness-space')
assert HF_TOKEN, 'HF_TOKEN must be set'
HfFolder.save_token(HF_TOKEN)
api = HfApi()
try:
    api.create_repo(repo_id=HF_SPACE_REPO, repo_type='space', space_sdk='gradio', private=False)
    print('Created Space:', HF_SPACE_REPO)
except Exception as e:
    print('Space may already exist:', e)
upload_file('venkat-tpp-mlops-project/deploy/app.py','app.py',repo_id=HF_SPACE_REPO,repo_type='space')
upload_file('venkat-tpp-mlops-project/deploy/requirements.txt','requirements.txt',repo_id=HF_SPACE_REPO,repo_type='space')
upload_file('venkat-tpp-mlops-project/deploy/Dockerfile','Dockerfile',repo_id=HF_SPACE_REPO,repo_type='space')
print('Pushed deployment files to Space:', HF_SPACE_REPO)
""")

(DEPLOY_DIR / 'app.py').write_text(app_py)
(DEPLOY_DIR / 'requirements.txt').write_text(requirements_txt)
(DEPLOY_DIR / 'Dockerfile').write_text(dockerfile_txt)
(DEPLOY_DIR / 'push_space.py').write_text(push_space_py)
print('Deployment files written to', DEPLOY_DIR)


Deployment files written to venkat-tpp-mlops-project/deploy



### 6.1 Quick Local Inference (Optional)


In [68]:

import joblib, pandas as pd
model = joblib.load('venkat-tpp-mlops-project/artifacts/' + f"best_model_{best_entry['model']}.joblib")
sample = {
    'Unnamed: 0': 0, # Add this line
    'TypeofContact': 'Self Enquiry',
    'CityTier': 1,
    'Age': 35,
    'Occupation': 'Salaried',
    'Gender': 'Male',
    'PreferredPropertyStar': 4,
    'MaritalStatus': 'Married',
    'NumberOfPersonVisiting': 2,
    'NumberOfTrips': 3,
    'Passport': 1,
    'OwnCar': 0,
    'NumberOfChildrenVisiting': 0,
    'ProductPitched': 'Deluxe',
    'PitchSatisfactionScore': 4,
    'NumberOfFollowups': 3,
    'DurationOfPitch': 12,
    'Designation': 'Manager',
    'MonthlyIncome': 21000
}
X = pd.DataFrame([sample])
proba = model.predict_proba(X)[:,1][0]
pred = int(proba >= 0.5)
{"probability": float(proba), "prediction": int(pred)}


{'probability': 0.07666666666666666, 'prediction': 0}


## 7. CI/CD with GitHub Actions


In [69]:

from textwrap import dedent
from pathlib import Path
GH_DIR = Path('venkat-tpp-mlops-project/.github/workflows')
GH_DIR.mkdir(parents=True, exist_ok=True)
ci_yaml = dedent("""
name: mlops-pipeline
on:
  push:
    branches: [ "main" ]
  workflow_dispatch:
jobs:
  build-train-deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install deps
        run: |
          python -m pip install --upgrade pip
          pip install pandas numpy scikit-learn joblib huggingface_hub gradio xgboost
      - name: Tests
        run: |
          pytest -q || true
      - name: Publish Space
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_SPACE_REPO: ${{ secrets.HF_SPACE_REPO }}
        run: |
          python venkat-tpp-mlops-project/deploy/push_space.py
""")
(GH_DIR / 'ci.yml').write_text(ci_yaml)
print('Wrote GitHub Actions workflow to', GH_DIR / 'ci.yml')


Wrote GitHub Actions workflow to venkat-tpp-mlops-project/.github/workflows/ci.yml



## 8. Minimal Tests (Sanity)


In [70]:

from pathlib import Path
TESTS_DIR = Path('venkat-tpp-mlops-project/tests')
TESTS_DIR.mkdir(parents=True, exist_ok=True)
(Path('venkat-tpp-mlops-project/tests/test_data.py')).write_text('''from pathlib import Path


def test_splits_exist():
    assert Path('venkat-tpp-mlops-project/data/train.csv').exists()
    assert Path('venkat-tpp-mlops-project/data/test.csv').exists()
''')
(Path('venkat-tpp-mlops-project/tests/test_model.py')).write_text('''from pathlib import Path


def test_artifacts_dir():
    assert Path('venkat-tpp-mlops-project/artifacts').exists()
''')
print('Wrote tests to', TESTS_DIR)


Wrote tests to venkat-tpp-mlops-project/tests
