In [2]:
pip install -Uq pandas scikit-learn numpy plotly

Note: you may need to restart the kernel to use updated packages.


In [16]:
import pandas as pd
import numpy as np
import sklearn.linear_model as lm
from sklearn.metrics import mean_squared_error
import plotly.express as px

In [17]:
data = pd.read_csv("insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
fig = px.histogram(
    data,
    x="age",
    y="bmi",
    color="sex",
    marginal="box",
)
fig.show()

In [19]:

fig = px.histogram(
    data,
    x="age",
    y="charges",
    color="sex",
    marginal="box",
)
fig.show()

In [20]:
data["age"].value_counts()

age
18    69
19    68
46    29
52    29
50    29
47    29
48    29
51    29
45    29
20    29
24    28
27    28
28    28
25    28
23    28
49    28
54    28
53    28
22    28
21    28
26    28
31    27
41    27
44    27
43    27
42    27
29    27
30    27
40    27
32    26
33    26
57    26
34    26
55    26
56    26
35    25
58    25
37    25
59    25
39    25
36    25
38    25
62    23
60    23
63    23
61    23
64    22
Name: count, dtype: int64

<h5>Колонок с данными по людям в возрасте 18–19 лет слишком много по сравнению с другими возрастами. Дропнем 40 значений с каждой.</h5>
Ровнее выборка — точнее расчёт страховки.

In [44]:
border = 40
i, j = 0, 0
mdata = data
for p in range(2, len(data)):
    if data.iloc[p]["age"] == data.iloc[1]["age"] and i < border:
        mdata = mdata.drop(index=p, axis=0)
        i += 1
    if data.iloc[p]["age"] == data.iloc[0]["age"] and j < border:
        mdata = mdata.drop(index=p, axis=0)
        j += 1
mdata.reset_index()
print(mdata) 

fig = px.histogram(
    mdata,
    x="age",
    y="bmi",
    color="sex",
    marginal="box",
)
fig.show()

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1258 rows x 7 columns]


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


<h5>Данные необходимо нормализовать</h5>
Так как они сильно отличаются по значениям — к примеру, возраст измеряется в числах от 18 до 70, а количество детей — в натуральных числах, близких к нулю

In [46]:
for _ in range(len(mdata)):
    mdata = mdata.replace(mdata.iloc[_]["age"], mdata.iloc[_]["age"]/10)
    mdata = mdata.replace(mdata.iloc[_]["bmi"], round(((mdata.iloc[_]["bmi"] - min(mdata["bmi"])) / (max(mdata["bmi"] - min(mdata["bmi"]))))*10, 3))
    # mdata = mdata.replace(mdata.iloc[_]["children"], round(((mdata.iloc[_]["children"] - min(mdata["children"])) / (max(mdata["children"] - min(mdata["children"]))))*10, 3))

<h5>Некоторые колонки нельзя анализировать, т.к. их значения не получится высчитать</h5>
Переводим их в условные обозначения, записанные через целые числа int

In [47]:
mdata["sex"] = mdata["sex"].replace({"male": 0, "female": 1})
mdata["smoker"] = mdata["smoker"].replace({"no": 0, "yes": 1})
mdata["region"] = mdata["region"].replace({"northwest": 0.25, "southwest": 0.5, "southeast": 0.75, "northeast": 1.0})

In [48]:
mdata.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,3.5800000000000004e-28,1,5.075,0.0,1,0.5,16884.924
1,1.8000000000000002e-28,0,1.171,0.035,0,0.75,1725.5523
2,0.0,0,0.0,0.0,0,0.75,4449.462
3,0.0,0,0.149,0.0,0,0.25,21984.47061
4,0.0,0,0.0,0.0,0,0.25,3866.8552


In [11]:
mdata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1258 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1258 non-null   float64
 1   sex       1258 non-null   int64  
 2   bmi       1258 non-null   float64
 3   children  1258 non-null   float64
 4   smoker    1258 non-null   int64  
 5   region    1258 non-null   float64
 6   charges   1258 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 78.6 KB


<h4>Разбиваем датасет на тренировочные и тестовые сеты в пропорции 2/1</h4>

In [12]:
train_x = mdata.drop("charges", axis=1)[:-419]
test_x = mdata.drop("charges", axis=1)[-419:]

train_y = mdata["charges"][:-419]
test_y = mdata["charges"][-419:] 

<h5>Сначала попробуем готовую модель из библиотеки sklearn</h5>

In [13]:
insurance = lm.LinearRegression()
insurance.fit(train_x, train_y)

prediction_y = insurance.predict(test_x)

print(f"Mean squared error: {mean_squared_error(test_y, prediction_y):.2f}")

Mean squared error: 59532994.48


<h5>Теперь попробуем написать линейную регрессию сами</h5>

In [14]:
class GradientDescentLinearRegression:
    
    def __init__(self, learning_rate=0.01, max_iterations=100000, eps=1e-2):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.eps = eps
        
    def predict(self, X):
        return np.dot(X, self.w.T)
    
    def cost(self, X, y):
        pred_y = self.predict(X)
        loss = (y - pred_y)**2
        return np.mean(loss)

    def grad(self, X, y):
        pred_y = self.predict(X)
        d_intercept = -2*sum(y - pred_y)
        d_x = -2*sum(X[:,1:] * (y - pred_y).reshape(-1,1))
        g = np.append(np.array(d_intercept), d_x)
        return g / X.shape[0]

    def fit(self, X, y, verbose = True):
        self.w = np.zeros(X.shape[1])
        w_hist = [self.w]
        cost_hist = [self.cost(X, y)]
        
        for iter in range(self.max_iterations):
            g = self.grad(X, y)
            step = self.learning_rate * g
            self.w = self.w - step
            w_hist.append(self.w)
            
            J = self.cost(X, y)
            cost_hist.append(J)
            
            if verbose:
                print(f"Iter: {iter}, gradient: {g}, params: {self.w}, cost: {J}")
            
            if np.linalg.norm(w_hist[-1] - w_hist[-2]) < self.eps:
                break
        
        self.iterations = iter + 1
        self.w_hist = w_hist
        self.cost_hist = cost_hist
        
        return self

In [None]:
def MSE(test, true, result, iterations):
    if iterations == 0:
        
    return 

In [15]:
insurance = GradientDescentLinearRegression(0.0001, 1000000)
insurance.fit(np.array(train_x), np.array(train_y), verbose=False)
pred_y = insurance.predict(test_x)

print(f"Mean squared error: {mean_squared_error(test_y, pred_y):.2f}")

Mean squared error: 762550906.99
