# Предсказываем стоимость медицинской страховки



## Загружаем необходимые библиотеки

In [21]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Загружаем данные

Собраны данные:
- age: возраст
- sex: пол
- bmi: индекс массы тела
- children: количество детей, охваченных медицинским страхованием / количество иждивенцев
- smoker: курение
- region: регион (northeast, southeast, southwest, northwest).
- charges: индивидуальные медицинские расходы (их и хотим предсказать)

In [22]:
# Load the insurance dataset from a CSV file in the working directory.
df = pd.read_csv('insurance.csv')

In [23]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Смотрим статистику и проверяем, что нет пропусков и отсутствующих значений

In [24]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [25]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


## Преобразуем строковые данные

In [26]:
# Encode the two binary categorical columns as 0/1 integers so the linear
# model can consume them.
binary_encodings = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for column, encoding in binary_encodings.items():
    # Re-running this cell on already-encoded data would push the 0/1 values
    # through the string mapping and silently turn the whole column into NaN,
    # so skip columns that already contain only the encoded values.
    already_encoded = df[column].isin(list(encoding.values())).all()
    if not already_encoded:
        df[column] = df[column].map(encoding)

In [27]:
# Confirm that sex and smoker now hold 0/1 values.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [28]:
# One-hot encode the region column into separate region_<name> indicator
# columns. dtype=int keeps the indicators as 0/1 integers: recent pandas
# versions default to bool, which would render as True/False and no longer
# match the 0/1 values used for the other encoded columns in this notebook.
df = pd.get_dummies(df, columns=['region'], dtype=int)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


## Формируем признаки и целевую переменную

In [30]:
# Feature columns fed to the model: every column except the target.
feature_columns = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'region_northeast', 'region_northwest',
    'region_southeast', 'region_southwest',
]
target_column = 'charges'

X = df[feature_columns]
y = df[target_column]

In [31]:
# Display the feature matrix and the target series side by side.
X, y

(      age  sex     bmi  children  smoker  region_northeast  region_northwest  \
 0      19    0  27.900         0       1                 0                 0   
 1      18    1  33.770         1       0                 0                 0   
 2      28    1  33.000         3       0                 0                 0   
 3      33    1  22.705         0       0                 0                 1   
 4      32    1  28.880         0       0                 0                 1   
 ...   ...  ...     ...       ...     ...               ...               ...   
 1333   50    1  30.970         3       0                 0                 1   
 1334   18    0  31.920         0       0                 1                 0   
 1335   18    0  36.850         0       0                 0                 0   
 1336   21    0  25.800         0       0                 0                 0   
 1337   61    0  29.070         0       1                 0                 1   
 
       region_southeast  r

## Разделяем данные на выборки для обучения и проверки

In [32]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the fitted coefficients and error metrics, reproducible
# between runs instead of changing on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Создаем и обучаем модель линейной регрессии

In [33]:
# Fit a linear regression on the training split; fit() returns the estimator
# itself, so the fitted model can be bound in a single expression.
lr = LinearRegression().fit(X_train, y_train)
# Show the learned coefficients (one per feature column).
lr.coef_

array([  248.58755473,  -176.8955229 ,   372.45174266,   349.8926905 ,
       24394.76813436,   771.88040097,   349.71340345,  -798.99584307,
        -322.59796135])

## Получаем предсказание и оцениваем качество

In [34]:
# Get predictions for the held-out test set
lr.predict(X_test)

array([ 8.62255836e+03,  1.70640769e+04,  8.63620932e+03,  6.16031445e+03,
        1.09206731e+04,  4.04247451e+03,  3.87497109e+04,  9.65393297e+03,
       -9.23331865e+02,  8.96191273e+03,  1.08444665e+04,  1.31591973e+04,
        1.42542958e+04,  1.12016308e+04,  6.97444637e+03,  1.47528788e+04,
        4.37515048e+03,  3.27905563e+04,  3.42116072e+04,  6.75335182e+03,
        1.10174425e+04,  1.18299593e+04,  7.11526695e+03,  4.68270212e+03,
        1.80761657e+03,  1.30938743e+04,  7.69783405e+03,  1.84074434e+03,
        1.35350844e+04,  3.47459795e+04,  1.28436273e+04,  8.09490461e+03,
        1.42787485e+04,  5.77200952e+03,  3.38638366e+04,  5.17354626e+03,
        7.68664248e+03,  3.04239003e+03,  3.67995873e+03, -5.63313167e+02,
        4.33282849e+03,  2.40967478e+04,  2.75820250e+04,  1.50302460e+04,
        3.57435135e+03,  9.23490571e+03,  3.16368542e+04,  1.36103760e+04,
        1.27298644e+03,  6.48573640e+03,  1.43886069e+04,  3.94149631e+03,
        1.48499741e+04,  

In [None]:
# Evaluate model quality on the test set using mean_squared_error

In [38]:
# Mean squared error of the model's predictions on the test split.
mean_squared_error(y_test, lr.predict(X_test))

37145488.306200825

In [39]:
# Mean squared error on the training split, for comparison with the test error.
mean_squared_error(y_train, lr.predict(X_train))

36522307.784683585

## Делаем предсказание для одного человека

In [40]:
# Describe one person using the same feature columns the model was trained on
# (sex and smoker use the 0/1 encoding applied to the dataset earlier).
person = dict(
    age=20,
    sex=1,
    bmi=30,
    children=2,
    smoker=1,
    region_northeast=0,
    region_northwest=0,
    region_southeast=1,
    region_southwest=0,
)
# Wrap the record in a list so it becomes a one-row table.
data = [person]

In [41]:
# Turn the single record into a one-row DataFrame so it can be passed to the model.
df_person = pd.DataFrame(data)
df_person.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,20,1,30,2,1,0,0,1,0


In [42]:
# Predict charges for the single person described above.
# NOTE(review): this presumably relies on df_person having the same columns,
# in the same order, as the training features — confirm if the record changes.
lr.predict(df_person)

array([27197.34409215])