In [2]:
# config
import pandas as pd
import os.path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [3]:
from hydra import compose, initialize
from omegaconf import OmegaConf

#config_path = '/home/tatiana/PycharmProjects/ml_project_example/configs/train_config.yaml'
config_path = '/home/tatiana/MADE/MLOps/HW1/config.yaml'

initialize(config_path='', job_name="test_app", version_base=None)

hydra.initialize()

In [4]:
params = compose(config_name="config")

In [5]:
params

{'input_data_path': 'heart_cleveland_upload.csv', 'output_model_path': 'models/model.pkl', 'metric_path': 'models/metrics.json', 'downloading_params': {'s3_bucket': 'for-dvc', 'paths': ['train.csv', 'test.csv'], 'output_folder': 'data/raw/'}, 'splitting_params': {'val_size': 0.2, 'random_state': 42}, 'train_params': {'model_type': 'RandomForestClassifier', 'random_state': 42, 'forest_max_depth': 6}, 'feature_params': {'numerical_features': ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'condition'], 'target': 'condition'}}

In [6]:
params.splitting_params

{'val_size': 0.2, 'random_state': 42}

In [7]:
# make_dataset.py
def read_data(path):
    df = pd.read_csv(path)
    return df

def split_train_val_data(data, params):
    train_data, val_data = train_test_split(data, 
                                            test_size=params['val_size'], 
                                            random_state=params['random_state'])
    return train_data, val_data

In [8]:
def train_model(features, target, train_params):
    if train_params.model_type == 'RandomForestClassifier':
        model = RandomForestClassifier(n_estimators=100, 
                                       random_state=train_params.random_state, 
                                       max_depth=train_params.forest_max_depth)
    elif train_params.model_type == 'LogisticRegression':
        model = LogisticRegression(random_state=train_params.random_state)
    else:
        raise NotImplementedError()
    model.fit(features, target)
    return model

def predict_model(model, features):
    predicts = model.predict(features)
    return predicts

def evaluate_model(predicts, target):
    f1 = f1_score(target, predicts)
    accuracy = accuracy_score(target, predicts)
    return{'accuracy':accuracy, 
           'f1_score':f1}

In [9]:
data = read_data(params.input_data_path)

train_df, val_df = split_train_val_data(data, params.splitting_params)
y_train = train_df[params.feature_params['target']]
y_val = val_df[params.feature_params['target']]
X_train = train_df[params.feature_params['numerical_features']]
X_train.drop(params.feature_params['target'], axis=1, inplace=True)
X_val = val_df[params.feature_params['numerical_features']]
X_val.drop(params.feature_params['target'], axis=1, inplace=True)

In [135]:
list(data.columns)

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'condition']

In [10]:
X_train.shape, X_val.shape

((237, 13), (60, 13))

In [129]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42, max_depth=3, n_estimators=500)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
evaluate_model(y_pred, y_val)

{'accuracy': 0.7333333333333333, 'f1_score': 0.7241379310344827}

In [131]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_val)
evaluate_model(y_pred, y_val)

{'accuracy': 0.7666666666666667, 'f1_score': 0.7586206896551724}

In [47]:
clf

In [17]:
# logger.info(f"starting of train process with params {params}")

def run_train_pipeline():
    # logger.info(f"starting of train process with params {params}")
    data = read_data(params.input_data_path)
    # logger.info(f"data  shape {data.shape}")
    train_df, val_df = split_train_val_data(data, params.splitting_params)
    y_train = train_df[params.feature_params['target']]
    X_train = train_df[params.feature_params['numerical_fearures']]
    X_train.drop(params.feature_params['target'])
    y_val = val_df[params.feature_params['target']]
    X_val = val_df[params.feature_params['numerical_features']]
    X_val.drop(params.feature_params['target'])
    # logger.info(f"train dataset shape is "{X_train.shape})
    # logger.info(f"validation dataset shape is {X_val.shape}")
    model = params.train_params['model_type']

In [5]:
import pandas as pd
df = pd.read_csv('heart_cleveland_upload.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [7]:
list(df.columns)

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'condition']

In [10]:
hydra._internal.hydra.GlobalHydra().clear()

hydra.experimental.initialize(config_dir=os.path.dirname(config_path))

cfg = hydra.experimental.compose(config_file=os.path.basename(config_path), overrides=[])

AttributeError: module 'hydra._internal.hydra' has no attribute 'GlobalHydra'

In [4]:
x = my_app()

usage: ipykernel_launcher.py [--help] [--hydra-help] [--version]
                             [--cfg {job,hydra,all}] [--resolve]
                             [--package PACKAGE] [--run] [--multirun]
                             [--shell-completion] [--config-path CONFIG_PATH]
                             [--config-name CONFIG_NAME]
                             [--config-dir CONFIG_DIR]
                             [--experimental-rerun EXPERIMENTAL_RERUN]
                             [--info [{all,config,defaults,defaults-tree,plugins,searchpath}]]
                             [overrides [overrides ...]]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
