In [1]:
import pandas as pd


In [2]:
# Load the heart dataset (heart.csv) into a DataFrame.
# NOTE(review): absolute path, presumably a mounted Google Drive in Colab; the
# notebook only runs where this exact directory layout exists.
data = pd.read_csv("/content/drive/MyDrive/MADE/semester2/ML_in_Production/heart.csv")

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None".
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
None


In [4]:
!pip install marshmallow-dataclass==8.3.0

Collecting marshmallow-dataclass==8.3.0
  Downloading https://files.pythonhosted.org/packages/a7/3f/7c13aa4730050c0fce7461bff3eadf311964e65a025be32797172ee245a2/marshmallow_dataclass-8.3.0-py3-none-any.whl
Collecting typing-inspect
  Downloading https://files.pythonhosted.org/packages/42/1c/66402db44184904a2f14722d317a4da0b5c8c78acfc3faf74362566635c5/typing_inspect-0.6.0-py3-none-any.whl
Collecting marshmallow<4.0,>=3.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/57/06/87c14302f51d76a94945ae08da449baccb91f4d710c339fd1a1499209c45/marshmallow-3.11.1-py2.py3-none-any.whl (46kB)
[K     |████████████████████████████████| 51kB 1.9MB/s 
[?25hCollecting mypy-extensions>=0.3.0
  Downloading https://files.pythonhosted.org/packages/5c/eb/975c7c080f3223a5cdaff09612f3a5221e4ba534f7039db34c35d95fa6a5/mypy_extensions-0.4.3-py2.py3-none-any.whl
Installing collected packages: mypy-extensions, typing-inspect, marshmallow, marshmallow-dataclass
Successfully installed marshmallow-3.11.

In [5]:
#from dataclasses import dataclass
#from .split_params import SplittingParams
#from .feature_params import FeatureParams
#from .train_params import TrainingParams
from marshmallow_dataclass import class_schema
import yaml

from dataclasses import dataclass, field
from typing import List, Optional, Any


@dataclass
class ModelParams:
    """Selects which estimator the training step builds."""

    # Estimator name, e.g. "RandomForestClassifier".
    model_type: str
    

@dataclass
class FeatureParams:
    """Column roles used when building the feature matrix and the target."""

    # Columns handed to the categorical (one-hot) pipeline.
    categorical: List[str]
    # Columns handed to the numerical (imputation) pipeline.
    numerical: List[str]
    # Name of the label column; has no default, so it must always be passed.
    target: Optional[str]

@dataclass
class SplittingParams:
    """Settings for the train/validation split."""

    # Share of rows held out for validation (forwarded as test_size).
    val_size: float
    # Seed forwarded to the shuffling split.
    random_state: int
    # Any non-None value switches stratification on.
    stratify: Optional[str] = None
    

@dataclass
class TrainingConfigParams:
    """Top-level training configuration, loaded from a YAML file."""

    # Locations of the input data and of the produced artifacts.
    input_data_path: str
    output_model_path: str
    metric_path: str

    # Nested parameter groups.
    splitting_params: SplittingParams
    model_params: ModelParams
    # Names of the metrics to report, e.g. "accuracy".
    metric_params: List[str]
    feature_params: FeatureParams
    
    
    
# marshmallow schema class generated from the TrainingConfigParams dataclass;
# read_training_config_params instantiates it to load the parsed YAML.
ConfigSchema = class_schema(TrainingConfigParams)


def read_training_config_params(path: str) -> TrainingConfigParams:
    """Read the YAML file at ``path`` and load it through ``ConfigSchema``.

    :param path: location of the YAML configuration file.
    :return: the object produced by ``ConfigSchema().load`` for the parsed YAML.
    """
    with open(path, "r") as config_file:
        raw_config = yaml.safe_load(config_file)
    return ConfigSchema().load(raw_config)

In [6]:
# Load the training configuration from config_randomforest.yaml.
# NOTE(review): this rebinds the name `TrainingConfigParams` from the dataclass
# to the loaded instance; later cells read `.feature_params`,
# `.splitting_params` and `.model_params` from it. Re-running the cell that
# defines read_training_config_params restores the class binding.
TrainingConfigParams = read_training_config_params("/content/drive/MyDrive/MADE/semester2/ML_in_Production/config_randomforest.yaml")

In [7]:
# Echo the loaded configuration to check the parsed values.
TrainingConfigParams


TrainingConfigParams(input_data_path='./data/raw/heart.csv', output_model_path='./models/model_forest.pkl', metric_path='./models/metrics_forest.json', splitting_params=SplittingParams(val_size=0.2, random_state=42, stratify='target'), model_params=ModelParams(model_type='RandomForestClassifier'), metric_params=['accuracy', 'roc_auc'], feature_params=FeatureParams(categorical=['sex', 'cp', 'fbs', 'exang', 'slope', 'ca', 'thal'], numerical=['age', 'trestbps', 'chol', 'restecg', 'thalach', 'oldpeak'], target='target'))

In [8]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


def create_pipeline_for_categorical_params() -> Pipeline:
    """Build the pipeline for categorical columns: one step, "OH", a default OneHotEncoder."""
    one_hot_step = ("OH", OneHotEncoder())
    return Pipeline([one_hot_step])

def create_pipeline_for_numerical_params() -> Pipeline:
    """Build the pipeline for numerical columns.

    The single step, "impute", replaces NaN entries with the column mean.

    :return: an unfitted ``Pipeline``.
    """
    return Pipeline(
        [
            # SimpleImputer's parameters are keyword-only in current
            # scikit-learn, so they are passed by name rather than by position.
            ("impute", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ]
    )


def create_transformer(params: FeatureParams) -> ColumnTransformer:
    """Combine the categorical and numerical pipelines into one ColumnTransformer.

    :param params: supplies the ``categorical`` and ``numerical`` column lists.
    :return: an unfitted transformer with one entry per column group.
    """
    categorical_entry = (
        "pipeline_for_categorical_params",
        create_pipeline_for_categorical_params(),
        params.categorical,
    )
    numerical_entry = (
        "pipeline_for_numerical_params",
        create_pipeline_for_numerical_params(),
        params.numerical,
    )
    return ColumnTransformer([categorical_entry, numerical_entry])


def create_target(data: pd.DataFrame, params: FeatureParams) -> pd.Series:
    """Select the column named by ``params.target`` from ``data``.

    :param data: frame that contains the label column.
    :param params: feature settings; only ``target`` is read.
    :return: the result of indexing ``data`` with the target name.
    """
    target_column = params.target
    return data[target_column]


def create_feature_array(transformer: ColumnTransformer, df: pd.DataFrame) -> pd.DataFrame:
    """Fit ``transformer`` on ``df`` and return the transformed features as a DataFrame.

    :param transformer: object exposing ``fit_transform``; it is fitted in place.
    :param df: input frame passed to ``fit_transform``.
    :return: frame built from the transformed matrix, with default integer
        column labels and a default index.
    """
    features = transformer.fit_transform(df)
    # ColumnTransformer can return a scipy sparse matrix when the stacked
    # output is sparse enough (its ``sparse_threshold`` setting). pd.DataFrame
    # does not turn that into a numeric table, so densify it first.
    if hasattr(features, "toarray"):
        features = features.toarray()
    return pd.DataFrame(features)

In [9]:
# Build the column transformer from the configured feature lists, then fit it
# on the full frame and keep the transformed features.
# NOTE(review): the transformer is fitted before the train/validation split
# performed in a later cell, so the encoder and imputer also see the rows that
# end up in validation.
transformer = create_transformer(TrainingConfigParams.feature_params)
data_processed = create_feature_array(transformer,  data)

In [10]:
# Bare last expression: the notebook renders the frame as a table instead of
# the truncated plain-text dump that print() produces.
data_processed.head()

    0    1    2    3    4    5    6   ...   21    22     23     24   25     26   27
0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  ...  0.0  63.0  145.0  233.0  0.0  150.0  2.3
1  0.0  1.0  0.0  0.0  1.0  0.0  1.0  ...  0.0  37.0  130.0  250.0  1.0  187.0  3.5
2  1.0  0.0  0.0  1.0  0.0  0.0  1.0  ...  0.0  41.0  130.0  204.0  0.0  172.0  1.4
3  0.0  1.0  0.0  1.0  0.0  0.0  1.0  ...  0.0  56.0  120.0  236.0  1.0  178.0  0.8
4  1.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  57.0  120.0  354.0  1.0  163.0  0.6

[5 rows x 28 columns]


In [11]:
# Pull the label column out of the raw (untransformed) frame.
target = create_target(data, TrainingConfigParams.feature_params) 

In [12]:
from typing import Tuple
from sklearn.model_selection import train_test_split

def split_train_val_data(
    data: pd.DataFrame, target: pd.Series, params: SplittingParams
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split features and labels into train and validation parts.

    :param data: feature frame to split.
    :param target: labels split alongside ``data``; also used as the
        stratification key when stratification is enabled.
    :param params: split settings. ``val_size`` is forwarded as ``test_size``
        and ``random_state`` as the seed. Any non-None ``stratify`` value turns
        stratification by ``target`` on; the string itself is not used to look
        up a column.
    :return: ``(train_data, val_data, y_train, y_test)``.
    """
    stratify = target if params.stratify is not None else None
    train_data, val_data, y_train, y_test = train_test_split(
        data,
        target,
        test_size=params.val_size,
        random_state=params.random_state,
        shuffle=True,
        stratify=stratify,
    )
    return train_data, val_data, y_train, y_test

In [13]:
# Split the processed features and the labels using the configured split settings.
train_data, val_data, y_train, y_test = split_train_val_data(data_processed, target,TrainingConfigParams.splitting_params )

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from typing import Dict, Union

# Estimator types that ModelClass.train can build and return.
ModelType = Union[RandomForestClassifier, LogisticRegression, GradientBoostingClassifier]

class ModelClass:
    """Wrapper that trains, applies, scores and saves one scikit-learn classifier.

    The estimator is chosen by ``params.model_type``:
    ``"RandomForestClassifier"`` and ``"GradientBoostingClassifier"`` select the
    estimators of the same name; any other value falls back to
    ``LogisticRegression``.
    """

    def __init__(self, params: ModelParams):
        self.params = params
        # Set by train(); None means no estimator has been trained yet.
        self.model = None

    def _require_model(self):
        """Return the trained estimator, or raise if train() has not run yet."""
        model = getattr(self, "model", None)
        if model is None:
            # AttributeError is what an unset ``self.model`` raised before, so
            # callers catching it keep working; only the message is clearer.
            raise AttributeError(
                "ModelClass has no trained model; call train() first."
            )
        return model

    def train(self, features: pd.DataFrame, target: pd.Series) -> ModelType:
        """Fit the configured estimator and store it on ``self.model``.

        :param features: training feature matrix.
        :param target: training labels.
        :return: the fitted estimator (same object as ``self.model``).
        """
        model_type = self.params.model_type
        print(model_type)
        if model_type == "RandomForestClassifier":
            estimator = RandomForestClassifier(n_estimators=600)
        elif model_type == "GradientBoostingClassifier":
            estimator = GradientBoostingClassifier(n_estimators=100)
        else:
            if model_type != "LogisticRegression":
                # Keep the historical fallback, but make a misspelled
                # model_type visible instead of silently training another model.
                import warnings

                warnings.warn(
                    f"Unknown model_type {model_type!r}; "
                    "falling back to LogisticRegression.",
                    stacklevel=2,
                )
            estimator = LogisticRegression(max_iter=1000)

        # fit() returns the estimator; assign only after it succeeded.
        self.model = estimator.fit(features, target)
        return self.model

    def predict(self, features: pd.DataFrame) -> np.ndarray:
        """Return the trained estimator's predictions for ``features``.

        :raises AttributeError: if called before train().
        """
        return self._require_model().predict(features)

    def evaluate(self, predicts: np.ndarray, target: pd.Series) -> Dict[str, float]:
        """Score predictions against the true labels.

        :param predicts: predicted labels.
        :param target: true labels.
        :return: ``{"accuracy": <accuracy_score>}``.
        """
        return {
            "accuracy": accuracy_score(target, predicts)
        }

    def serialize_model(self, output: str) -> str:
        """Pickle the trained estimator to ``output`` and return that path.

        :raises AttributeError: if called before train(); the file is not created.
        """
        # Imported here so the method does not depend on a later notebook cell
        # having imported pickle.
        import pickle

        model = self._require_model()
        with open(output, "wb") as f:
            pickle.dump(model, f)
        return output



In [15]:
import json
import pickle

# Directory that receives the metrics file and the pickled model.
# NOTE(review): the loaded config also carries `metric_path` and
# `output_model_path`, which this cell does not use; confirm which location
# is intended.
OUTPUT_DIR = "/content/drive/MyDrive/MADE/semester2/ML_in_Production"

# Train on the training split, then score on the validation split.
model = ModelClass(TrainingConfigParams.model_params)
model.train(train_data, y_train)

predicts = model.predict(val_data)
result = model.evaluate(predicts, y_test)
print(result)

# Persist the validation metrics as JSON.
with open(f"{OUTPUT_DIR}/metrics.json", "w") as metric_file:
    json.dump(result, metric_file)

# TODO: wrap this flow in a `click` command that takes a config path.
# Last expression of the cell: shows the path returned by serialize_model.
model.serialize_model(f"{OUTPUT_DIR}/model.pkl")


RandomForestClassifier
{'accuracy': 0.7704918032786885}


'/content/drive/MyDrive/MADE/semester2/ML_in_Production/model.pkl'

In [16]:
# Bare last expression: the notebook displays the metrics dict as cell output.
result

{'accuracy': 0.7704918032786885}
