In [44]:
# импорт библиотек
from sklearn.datasets import load_wine
import os

In [45]:
# создадим папку с датой, чтобы её отправить на DVC
os.makedirs("data/raw", exist_ok=True) # сырые данные
os.makedirs("scr", exist_ok=True) # скрипты

wine = load_wine(as_frame=True)

wine.frame.to_csv("data/raw/wine.csv", index=False) # сохраняем данные локально

In [46]:
# напишем params.yaml
params = """
prepare:
    test_size: 0.2
    random_state: 42

train:
    data_path: data/prepared
    n_estimators: 20
    random_state: 42
"""

with open("params.yaml", "w", encoding="utf-8") as f:
    f.write(params)

In [47]:
# код для prepare.py
code = """
import os
import yaml
import pandas as pd
from sklearn.model_selection import train_test_split

def load_params():
    with open("params.yaml", "r") as f:
        return yaml.safe_load(f)

def main():
    params = load_params()
    test_size = params["prepare"]["test_size"]
    random_state = params["prepare"]["random_state"]

    df = pd.read_csv("data/raw/wine.csv")
    df = df.dropna(axis=0)

    X = df.drop("target", axis=1)
    y = df["target"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    os.makedirs("data/prepared", exist_ok=True)
    X_train.to_csv("data/prepared/X_train.csv", index=False)
    X_test.to_csv("data/prepared/X_test.csv", index=False)
    y_train.to_csv("data/prepared/y_train.csv", index=False)
    y_test.to_csv("data/prepared/y_test.csv", index=False)

    print("Обработанные данные сохранены в data/prepared")

if __name__ == "__main__":
    main()

"""

with open("scr/prepare.py", "w", encoding="utf-8") as f:
    f.write(code)

In [48]:
# код для train.py
code = """
import yaml
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib, os
import mlflow
import mlflow.sklearn


def load_params():
    with open("params.yaml", "r") as f:
        return yaml.safe_load(f)

def load_data(path):
    X_train = pd.read_csv(path + "/X_train.csv")
    y_train = pd.read_csv(path + "/y_train.csv").squeeze()
    X_test = pd.read_csv(path + "/X_test.csv")
    y_test = pd.read_csv(path + "/y_test.csv").squeeze()
    return X_train, y_train, X_test, y_test

def build_model(params):
    n = params["train"]["n_estimators"]
    rs = params["train"]["random_state"]
    return RandomForestClassifier(n_estimators=n, random_state=rs)

def main():
    params = load_params()

    X_train, y_train, X_test, y_test = load_data(params["train"]["data_path"])

    mlflow.set_tracking_uri("sqlite:///mlflow.db")
    mlflow.set_experiment("mlops_hw1_wine")

    with mlflow.start_run():
        model = build_model(params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")

        # Логирование параметров
        mlflow.log_param("n_estimators", params["train"]["n_estimators"])
        mlflow.log_param("random_state", params["train"]["random_state"])

        # Логирование метрик
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)

        mlflow.sklearn.log_model(model, "model")

        print("Эксперимент завершён и записан в MLflow")
        print(f"Accuracy: {acc:.4f}")

        os.makedirs("models", exist_ok=True)
        joblib.dump(model, "models/model.pkl")
        print("Сохранено в models/model.pkl")


if __name__ == "__main__":
    main()

"""

with open("scr/train.py", "w", encoding="utf-8") as f:
    f.write(code)

In [49]:
# напишем requirements.txt
requirements = """
dvc
mlflow
pandas==2.3.3
scikit-learn==1.7.2
pyyaml
joblib
"""

with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(requirements)

In [50]:
# переключение кодировку консоли нат UTF-8
!chcp 65001

Active code page: 65001


In [51]:
!git init
!dvc init

Reinitialized existing Git repository in D:/PythonProjects/MIPT/ML in prodaction/Р”Р—/.git/


ERROR: failed to initiate DVC - '.dvc' exists. Use `-f` to force.


In [52]:
# подключаем удалённый репозиторий
!git remote add origin https://github.com/lizka228/mlops_hw1_Mazurina_Elizaveta

error: remote origin already exists.


In [53]:
!git remote -v

origin	https://github.com/lizka228/mlops_hw1_Mazurina_Elizaveta (fetch)
origin	https://github.com/lizka228/mlops_hw1_Mazurina_Elizaveta (push)


In [54]:
# подтягиваем коммиты
!git pull origin main

Already up to date.


From https://github.com/lizka228/mlops_hw1_Mazurina_Elizaveta
 * branch            main       -> FETCH_HEAD


In [55]:
# Отправка датасета в dvc
!dvc add data/raw/wine.csv
!git add data/.gitignore
!git add data/raw/wine.csv.dvc
!git commit -m "Add raw dataset via DVC"


To track the changes with git, run:

	git add 'data\raw\wine.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


\u280b Checking graph

fatal: pathspec 'data/.gitignore' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    README.md

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/.gitignore
	DZ1.ipynb
	data/raw/.gitignore
	dvc-storage/
	dz1/
	mlflow.db
	mlops_hw1_Mazurina_Elizaveta/
	mlruns/
	models/
	params.yaml
	scr/

no changes added to commit (use "git add" and/or "git commit -a")


In [56]:
# добавим локальный remote
!dvc remote add -d localstorage ./dvc-storage
!git add .dvc/config
!git commit -m "Configure DVC remote storage (local)"

Setting 'localstorage' as a default remote.


ERROR: configuration error - config file error: remote 'localstorage' already exists. Use `-f|--force` to overwrite it.


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    README.md

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/.gitignore
	DZ1.ipynb
	data/raw/.gitignore
	dvc-storage/
	dz1/
	mlflow.db
	mlops_hw1_Mazurina_Elizaveta/
	mlruns/
	models/
	params.yaml
	scr/

no changes added to commit (use "git add" and/or "git commit -a")


In [57]:
# отправим данные
!dvc push

Everything is up to date.


In [58]:
# проверка, что добавилось
!git ls-files

.dvc/config
README.md
data/raw/wine.csv.dvc
dvc.lock
dvc.yaml
requirements.txt


In [59]:
# Отправляем в удалённый
!git branch -M main
!git push -u origin main

branch 'main' set up to track 'origin/main'.


Everything up-to-date


In [60]:
# Добавим скрипты
!git add scr/prepare.py scr/train.py
!git commit -m "Add scripts"

[main 92de823] Add scripts
 2 files changed, 97 insertions(+)
 create mode 100644 scr/prepare.py
 create mode 100644 scr/train.py


In [61]:
!git ls-files

.dvc/config
README.md
data/raw/wine.csv.dvc
dvc.lock
dvc.yaml
requirements.txt
scr/prepare.py
scr/train.py


In [62]:
# напишем dvc.yaml
dvc_yaml = """
stages:
    prepare:
        cmd: python scr/prepare.py
        deps:
            - scr/prepare.py
            - data/raw/wine.csv
            - params.yaml
        outs:
            - data/prepared:
                cache: false

    train:
        cmd: python scr/train.py
        deps:
            - scr/train.py
            - data/prepared
            - params.yaml
        outs:
            - models/model.pkl:
                cache: false

"""
open("dvc.yaml", "w").write(dvc_yaml)


446

In [63]:
!git add requirements.txt params.yaml
!git commit -m "Add params and requirements"

[main c627489] Add params and requirements
 1 file changed, 9 insertions(+)
 create mode 100644 params.yaml


In [64]:
!git ls-files

.dvc/config
README.md
data/raw/wine.csv.dvc
dvc.lock
dvc.yaml
params.yaml
requirements.txt
scr/prepare.py
scr/train.py


In [65]:
# запуск пайплайна
!dvc repro

'data\raw\wine.csv.dvc' didn't change, skipping
Running stage 'prepare':
> python scr/prepare.py
Обработанные данные сохранены в data/prepared

Stage 'train' didn't change, skipping
Use `dvc push` to send your updates to remote storage.


In [66]:
!git add dvc.yaml dvc.lock
!git commit -m "Add dvc.yaml"

On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    README.md

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/.gitignore
	DZ1.ipynb
	data/prepared/
	data/raw/.gitignore
	dvc-storage/
	dz1/
	mlflow.db
	mlops_hw1_Mazurina_Elizaveta/
	mlruns/
	models/

no changes added to commit (use "git add" and/or "git commit -a")


In [67]:
!dvc push

Everything is up to date.


In [None]:
!git add DZ1.ipynb
!git commit -m "Add .ipynb"

In [68]:
# Отправляем в удалённый
!git branch -M main
!git push -u origin main

branch 'main' set up to track 'origin/main'.


To https://github.com/lizka228/mlops_hw1_Mazurina_Elizaveta
   24de09b..c627489  main -> main


In [69]:
!git clone https://github.com/lizka228/mlops_hw1_Mazurina_Elizaveta

fatal: destination path 'mlops_hw1_Mazurina_Elizaveta' already exists and is not an empty directory.


In [29]:
!cd mlops_hw1_Mazurina_Elizaveta

In [70]:
!pip install -r requirements.txt
!dvc pull




[notice] A new release of pip available: 22.2.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Everything is up to date.


md5: e6dccd29b15c08636f50bd2c0fd80a6a
ERROR: failed to pull data from the cloud - Checkout failed for following targets:
data\raw\wine.csv
Is your cache up to date?
<https://error.dvc.org/missing-files>
