### Библиотеки и константы

In [None]:
import pandas as pd
import json
import os
import joblib


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [28]:
# Constants
# Seed passed as random_state to train_test_split so the split is reproducible.
RANDOM_SEED = 42
# Fraction of the rows held out as the test set.
TEST_SIZE = 0.3

### Загрузка данных

In [29]:
# Read the dataset into a DataFrame. The path is relative, so it resolves
# against the kernel's working directory.
dataset_path = '../datasets/heart.csv'
data = pd.read_csv(dataset_path)

In [30]:
# Summarise column names, dtypes and non-null counts (quick missing-value check).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [31]:
# Preview the first five rows of the raw frame.
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


### Подготовка данных

In [32]:
# Name of the label column that is separated from the features.
target = 'target'

In [33]:
# Split the frame into the feature matrix (every column except the label)
# and the label vector.
X = data.drop(columns=[target])
y = data[target]

In [34]:
# Hold-out split: TEST_SIZE of the rows go to the test set, RANDOM_SEED fixes
# the shuffle, and stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

### Пайплайн

In [35]:
# Scaling lives inside the pipeline, so its statistics are learned only from
# whatever data the pipeline is fitted on and are re-applied at predict time.
scaler = StandardScaler()
classifier = LogisticRegression()
pipe = make_pipeline(scaler, classifier)

In [36]:
# Fit the scaler and the classifier on the training part only.
pipe.fit(X_train, y_train) 

In [37]:
# Score the fitted pipeline on the held-out rows and report accuracy
# rounded to two decimals.
print("Предсказание на тестовой выборке")
y_pred = pipe.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Точность модели: {score:.2f}')

Предсказание на тестовой выборке
Точность модели: 0.82


In [38]:
# Show the feature values of the first test row.
X_test.head(1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
289,40,1,0,152,223,0,1,181,0,0.0,2,0,3


In [39]:
# Serialise the first five test rows to a JSON string. "columns" is the
# default orient for a DataFrame, spelled out here for clarity.
# NOTE(review): data_json is not read again in the visible cells.
data_json = X_test.iloc[:5].to_json(orient="columns")

In [40]:
# Show the true label of the first test row.
y_test.head(1)

289    0
Name: target, dtype: int64

In [41]:
# Print the first five test rows as {feature: value} dicts.
# iterrows() yields each row as a single Series, so the int64 columns are
# upcast to float alongside `oldpeak` (hence 40.0 rather than 40 in the output).
sample_rows = X_test.head(5)
feature_names = list(X_test.columns)
for index, row in sample_rows.iterrows():
    a = dict(zip(feature_names, row.tolist()))
    print(a)

{'age': 40.0, 'sex': 1.0, 'cp': 0.0, 'trestbps': 152.0, 'chol': 223.0, 'fbs': 0.0, 'restecg': 1.0, 'thalach': 181.0, 'exang': 0.0, 'oldpeak': 0.0, 'slope': 2.0, 'ca': 0.0, 'thal': 3.0}
{'age': 43.0, 'sex': 1.0, 'cp': 0.0, 'trestbps': 132.0, 'chol': 247.0, 'fbs': 1.0, 'restecg': 0.0, 'thalach': 143.0, 'exang': 1.0, 'oldpeak': 0.1, 'slope': 1.0, 'ca': 4.0, 'thal': 3.0}
{'age': 47.0, 'sex': 1.0, 'cp': 2.0, 'trestbps': 138.0, 'chol': 257.0, 'fbs': 0.0, 'restecg': 0.0, 'thalach': 156.0, 'exang': 0.0, 'oldpeak': 0.0, 'slope': 2.0, 'ca': 0.0, 'thal': 2.0}
{'age': 48.0, 'sex': 1.0, 'cp': 0.0, 'trestbps': 122.0, 'chol': 222.0, 'fbs': 0.0, 'restecg': 0.0, 'thalach': 186.0, 'exang': 0.0, 'oldpeak': 0.0, 'slope': 2.0, 'ca': 0.0, 'thal': 2.0}
{'age': 39.0, 'sex': 0.0, 'cp': 2.0, 'trestbps': 138.0, 'chol': 220.0, 'fbs': 0.0, 'restecg': 1.0, 'thalach': 152.0, 'exang': 0.0, 'oldpeak': 0.0, 'slope': 1.0, 'ca': 0.0, 'thal': 2.0}


In [42]:
# Hand-written sample payload: the feature values of the first printed test
# row (index 289), kept in the same column order as the training features.
test_json = {
    'age': 40.0,
    'sex': 1.0,
    'cp': 0.0,
    'trestbps': 152.0,
    'chol': 223.0,
    'fbs': 0.0,
    'restecg': 1.0,
    'thalach': 181.0,
    'exang': 0.0,
    'oldpeak': 0.0,
    'slope': 2.0,
    'ca': 0.0,
    'thal': 3.0,
}

In [43]:
# Wrap the scalar dict into a one-row frame; index=[0] is required because
# every value is a scalar and pandas needs an explicit row label.
test_df = pd.DataFrame(test_json, index=[0])

In [44]:
# Predict the class for the single hand-written sample.
# NOTE(review): in the recorded run this returns 1, while y_test lists 0 for
# row 289 (the row the sample was copied from), so this sample is misclassified.
pipe.predict(test_df)

array([1], dtype=int64)

### Сохранение модели

In [45]:
# Persist the fitted pipeline (scaler + classifier) as a single joblib file.
model_name = "log_reg"
model_ext = ".joblib"
model_path = os.path.join("../service/models", f'{model_name}{model_ext}')

# joblib.dump does not create missing parent directories, so on a fresh
# checkout the save would fail with FileNotFoundError. Create the target
# directory first; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipe, model_path)

print(f"Модель сохранена: {model_path}")

Модель сохранена: ../service/models\log_reg.joblib


### Загрузка и проверка

In [46]:
# Round-trip check: reload the saved pipeline and score it on the same
# held-out rows; the accuracy should match the in-memory pipeline's.
# NOTE: joblib.load is pickle-based, so only load files from a trusted source.
model = joblib.load(model_path)
print("Модель загружена")

print("Предсказание на тестовой выборке")
y_pred = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Точность модели: {score:.2f}')

Модель загружена
Предсказание на тестовой выборке
Точность модели: 0.82
