Merge pull request #4 from made-mlops-2022/homework1

homework1_TECHNOPARK
made-mlops-2022 · Nov 15, 2022 · 77327ce · 77327ce
2 parents 5c44c7a + c7824f0
commit 77327ce
Show file tree

Hide file tree

Showing 40 changed files with 1,639 additions and 1 deletion.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -0,0 +1,41 @@
+name: ml_project ci  # lint + test workflow
+
+on:  # runs on pushes to main and on pull requests targeting main
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10"]  # keep quoted: a bare 3.10 would be read as the float 3.1
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies  # requirements.txt is optional; flake8/pytest are always installed
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest
+        if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi
+    - name: Lint with flake8  # 1st pass fails the job; 2nd pass uses --exit-zero and only reports
+      run: |
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: tests  # generates test data first, then runs each test module as a plain script
+      run: |
+        python tests/test_data/generate_data.py
+
+        export PYTHONPATH="$(pwd)"
+
+        python tests/model_test/train_model_pipeline_test.py
+        python tests/model_test/predict_model_pipeline_test.py
+        python tests/features_test/build_features_test.py
+        python tests/make_dataset_test/test_make_dataset.py
diff --git a/README.md b/README.md
@@ -1 +1,67 @@
-# bykov_vladimir
+Homework 1
+==============================
+Heart Disease Cleveland production-ready project
+
+
+Installation: 
+~~~
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+~~~
+Usage:
+~~~
+python src/train_model_pipeline.py
+~~~
+
+Test:
+~~~
+python PATH_TO_TEST_SCRIPT
+python tests/model_test/train_model_pipeline_test.py
+~~~
+
+Project Organization
+------------
+
+    ├── README.md          <- The top-level README for developers using this project.
+    ├── data
+    │   ├── predictions    <- Predictions.
+    │   └── raw            <- The original, immutable data dump.
+    │
+    ├── models             <- Trained and serialized models, model predictions, or model summaries
+    │
+    ├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
+    │                         the creator's initials, and a short `-` delimited description, e.g.
+    │                         `1.0-jqp-initial-data-exploration`.
+    │
+    ├── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
+    │                         generated with `pip freeze > requirements.txt`
+    │
+    ├── src                <- Source code for use in this project.
+    │   ├── __init__.py    <- Makes src a Python module
+    │   │
+    │   ├── data           <- code to download or generate data
+    │   │
+    │   ├── features       <- code to turn raw data into features for modeling
+    │   │
+    │   └── models         <- code to train models and then use trained models to make predictions
+    │   
+    │
+    ├── tests                <- Source code for tests.
+    │   ├── __init__.py       <- Makes tests a Python module
+    │   │
+    │   ├── features_test     <- test transformer, extracting features
+    │   │
+    │   ├── make_dataset_test <- test making dataset
+    │   │
+    │   ├── model_test        <- test training and prediction pipeline
+    │   │
+    │   └── test_data         <- for generating synthetic data   
+    └── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
+                              generated with `pip freeze > requirements.txt`
+
+
+--------
+
+<p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>. #cookiecutterdatascience</small></p>
+
diff --git a/configs/predict_config.yaml b/configs/predict_config.yaml
@@ -0,0 +1,21 @@
+input_data_path: './tests/test_data/synthetic_data.csv'  # input CSV (synthetic test data)
+input_model_path: './models/model.pkl'  # NOTE(review): train configs seen here write model_gb.pkl / model_rf.pkl - confirm
+output_predictions_path: './data/predictions/predictions.csv'
+feature_params:
+  categorical_features:
+    - 'sex'
+    - 'cp'
+    - 'fbs'
+    - 'restecg'
+    - 'exang'
+    - 'slope'
+    - 'ca'
+    - 'thal'
+  numerical_features:
+    - 'age'
+    - 'trestbps'
+    - 'chol'
+    - 'thalach'
+    - 'oldpeak'
+  features_to_drop: null  # explicit null (same parsed value as the former bare key)
+  target: 'condition'
diff --git a/configs/train_config_gb.yaml b/configs/train_config_gb.yaml
@@ -0,0 +1,27 @@
+input_data_path: './data/raw/heart_cleveland_upload.csv'
+output_model_path: './models/model_gb.pkl'
+splitting_params:
+  val_size: 0.3  # presumably the validation share of the split - confirm in the splitting code
+  random_state: 42
+model_params:
+  model: 'GB'  # model selector key; presumably gradient boosting - confirm in src
+  n_estimators: 25
+  random_state: 42
+feature_params:
+  categorical_features:
+    - 'sex'
+    - 'cp'
+    - 'fbs'
+    - 'restecg'
+    - 'exang'
+    - 'slope'
+    - 'ca'
+    - 'thal'
+  numerical_features:
+    - 'age'
+    - 'trestbps'
+    - 'chol'
+    - 'thalach'
+    - 'oldpeak'
+  features_to_drop: null  # explicit null (same parsed value as the former bare key)
+  target: 'condition'
diff --git a/configs/train_config_rf.yaml b/configs/train_config_rf.yaml
@@ -0,0 +1,27 @@
+input_data_path: './data/raw/heart_cleveland_upload.csv'
+output_model_path: './models/model_rf.pkl'
+splitting_params:
+  val_size: 0.2  # presumably the validation share of the split - confirm in the splitting code
+  random_state: 42
+model_params:
+  model: 'RF'  # model selector key; presumably random forest - confirm in src
+  n_estimators: 100
+  random_state: 42
+feature_params:
+  categorical_features:
+    - 'sex'
+    - 'cp'
+    - 'fbs'
+    - 'restecg'
+    - 'exang'
+    - 'slope'
+    - 'ca'
+    - 'thal'
+  numerical_features:
+    - 'age'
+    - 'trestbps'
+    - 'chol'
+    - 'thalach'
+    - 'oldpeak'
+  features_to_drop: null  # explicit null (same parsed value as the former bare key)
+  target: 'condition'