In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Задание 1

Поработайте с датасетом про цены мобильных телефонов, обучите регрессию.

In [42]:
data = pd.read_csv('Cellphone.csv')

In [43]:
data.head()

Unnamed: 0,Product_id,Price,Sale,weight,resoloution,ppi,cpu core,cpu freq,internal mem,ram,RearCam,Front_Cam,battery,thickness
0,203,2357,10,135.0,5.2,424,8,1.35,16.0,3.0,13.0,8.0,2610,7.4
1,880,1749,10,125.0,4.0,233,2,1.3,4.0,1.0,3.15,0.0,1700,9.9
2,40,1916,10,110.0,4.7,312,4,1.2,8.0,1.5,13.0,5.0,2000,7.6
3,99,1315,11,118.5,4.0,233,2,1.3,4.0,0.512,3.15,0.0,1400,11.0
4,880,1749,11,125.0,4.0,233,2,1.3,4.0,1.0,3.15,0.0,1700,9.9


In [44]:
data['Price'].mean()

2215.5962732919256

In [45]:
data.Price.mode()

Unnamed: 0,Price
0,1734
1,2744


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Product_id    161 non-null    int64  
 1   Price         161 non-null    int64  
 2   Sale          161 non-null    int64  
 3   weight        161 non-null    float64
 4   resoloution   161 non-null    float64
 5   ppi           161 non-null    int64  
 6   cpu core      161 non-null    int64  
 7   cpu freq      161 non-null    float64
 8   internal mem  161 non-null    float64
 9   ram           161 non-null    float64
 10  RearCam       161 non-null    float64
 11  Front_Cam     161 non-null    float64
 12  battery       161 non-null    int64  
 13  thickness     161 non-null    float64
dtypes: float64(8), int64(6)
memory usage: 17.7 KB


In [47]:
data = data.drop(columns=['Product_id'])

In [48]:
X = data.drop(columns=['Price'])
y = data['Price']

In [49]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(Xtrain, ytrain) # обучение модели

pred_test = model.predict(Xtest) # предсказание
rmse = mean_squared_error(ytest, pred_test) ** 0.5
r2 = r2_score(ytest, pred_test)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 147.92055875562536
R2: 0.9614025134850815


Масштабирование и регуляризация

In [50]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# масштабирование
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)

# обучение
lasso = Lasso(alpha=0.1)
lasso.fit(Xtrain_scaled, ytrain)

# коэффициенты модели
lasso_coef = pd.Series(lasso.coef_, index=X.columns)

# зануленные признаки
print("Зануленные признаки:", list(lasso_coef[lasso_coef == 0].index))
lasso_coef

Зануленные признаки: []


Unnamed: 0,0
Sale,-30.869209
weight,-49.767513
resoloution,-90.606922
ppi,153.611445
cpu core,123.87894
cpu freq,76.22378
internal mem,164.624682
ram,163.821943
RearCam,23.053528
Front_Cam,31.72972


In [51]:
# предсказание и оценка качества модели
ypred_train = lasso.predict(Xtrain_scaled)
ypred_test = lasso.predict(Xtest_scaled)

print(f"R2 train: {r2_score(ytrain,ypred_train)}, R2 test: {r2_score(ytest,ypred_test)}")

R2 train: 0.951600102761604, R2 test: 0.961323488141152


### Задание 2
Поработайте с датасетом diet_data (Его создатель решил похудеть, подсчитывая калории, и ему это действительно удалось; в течение времени, пока худел, он записывал, что и какое он ест). Целевая колонка - change.

In [4]:
diet = pd.read_csv('diet_data.csv')

In [5]:
diet.head()

Unnamed: 0,Date,Stone,Pounds,Ounces,weight_oz,calories,cals_per_oz,five_donuts,walk,run,wine,prot,weight,change
0,7/30/2018,12.0,2.0,6.0,2726.0,1950.0,0.72,1.0,1.0,0.0,0.0,0.0,0.0,-30.0
1,7/31/2018,12.0,0.0,8.0,2696.0,2600.0,0.96,1.0,0.0,0.0,0.0,0.0,0.0,8.0
2,8/1/2018,12.0,1.0,0.0,2704.0,2500.0,0.92,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,8/2/2018,12.0,1.0,0.0,2704.0,1850.0,0.68,1.0,1.0,0.0,1.0,0.0,0.0,-40.0
4,8/3/2018,11.0,12.0,8.0,2664.0,2900.0,1.09,1.0,1.0,0.0,0.0,0.0,0.0,14.0


In [6]:
diet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         150 non-null    object 
 1   Stone        142 non-null    float64
 2   Pounds       142 non-null    float64
 3   Ounces       142 non-null    float64
 4   weight_oz    149 non-null    float64
 5   calories     140 non-null    float64
 6   cals_per_oz  147 non-null    object 
 7   five_donuts  140 non-null    float64
 8   walk         140 non-null    float64
 9   run          140 non-null    float64
 10  wine         140 non-null    float64
 11  prot         140 non-null    float64
 12  weight       140 non-null    float64
 13  change       147 non-null    float64
dtypes: float64(12), object(2)
memory usage: 16.6+ KB


#### Подготовка данных

In [7]:
# удалю сразу строки с пустыми целевыми данными
diet.dropna(subset=['change'], inplace=True)
diet = diet[diet.change != 0]

In [8]:
# заменяю даты-объекты на даты
diet.Date = pd.to_datetime(diet.Date)

# затем добавлю отдельную колонку с кол-вом дней от начала диеты
diet['since_start'] = (diet.Date - diet.Date.min()).dt.days
diet.drop('Date', axis=1, inplace=True)

In [9]:
# калорий на унцию преобразовываю в числа
diet['cals_per_oz'] = pd.to_numeric(diet['cals_per_oz'], errors='coerce')

In [10]:
diet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 0 to 143
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Stone        134 non-null    float64
 1   Pounds       134 non-null    float64
 2   Ounces       134 non-null    float64
 3   weight_oz    134 non-null    float64
 4   calories     133 non-null    float64
 5   cals_per_oz  134 non-null    float64
 6   five_donuts  133 non-null    float64
 7   walk         133 non-null    float64
 8   run          133 non-null    float64
 9   wine         133 non-null    float64
 10  prot         133 non-null    float64
 11  weight       133 non-null    float64
 12  change       134 non-null    float64
 13  since_start  134 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 15.7 KB


In [11]:
# пустую строку с calories заменю на медианное значение, так как разброс большой
diet.calories = diet.calories.fillna(diet.calories.median())

In [12]:
diet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 0 to 143
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Stone        134 non-null    float64
 1   Pounds       134 non-null    float64
 2   Ounces       134 non-null    float64
 3   weight_oz    134 non-null    float64
 4   calories     134 non-null    float64
 5   cals_per_oz  134 non-null    float64
 6   five_donuts  133 non-null    float64
 7   walk         133 non-null    float64
 8   run          133 non-null    float64
 9   wine         133 non-null    float64
 10  prot         133 non-null    float64
 11  weight       133 non-null    float64
 12  change       134 non-null    float64
 13  since_start  134 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 15.7 KB


In [13]:
# пустые осташиеся five_donuts, walk, run, wine, prot, weight заменяю модой
diet['five_donuts'] = diet['five_donuts'].fillna(diet['five_donuts'].mode()[0])
diet['walk'] = diet['walk'].fillna(diet['walk'].mode()[0])
diet['run'] = diet['run'].fillna(diet['run'].mode()[0])
diet['wine'] = diet['wine'].fillna(diet['wine'].mode()[0])
diet['prot'] = diet['prot'].fillna(diet['prot'].mode()[0])
diet['weight'] = diet['weight'].fillna(diet['weight'].mode()[0])

In [14]:
diet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 0 to 143
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Stone        134 non-null    float64
 1   Pounds       134 non-null    float64
 2   Ounces       134 non-null    float64
 3   weight_oz    134 non-null    float64
 4   calories     134 non-null    float64
 5   cals_per_oz  134 non-null    float64
 6   five_donuts  134 non-null    float64
 7   walk         134 non-null    float64
 8   run          134 non-null    float64
 9   wine         134 non-null    float64
 10  prot         134 non-null    float64
 11  weight       134 non-null    float64
 12  change       134 non-null    float64
 13  since_start  134 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 15.7 KB


#### Масштабирование, обучение и регуляризация

In [15]:
X = diet.drop('change', axis=1)
y = diet.change

In [16]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
reg = LinearRegression().fit(Xtrain, ytrain)

In [18]:
pred_train = reg.predict(Xtrain)
pred_test = reg.predict(Xtest)

In [19]:
mean_squared_error(pred_train, ytrain) ** 0.5

21.027210585099635

In [20]:
mean_squared_error(pred_test, ytest) ** 0.5

20.61301249229532

In [21]:
diet.change.mean(), diet.change.min(), diet.change.max()

(-20.940298507462686, -2598.0, 102.0)

In [22]:
change = ytrain[7]
parameters = Xtrain.iloc[7].values
change, parameters
reg.intercept_ + sum(reg.coef_ * parameters), change

(-6.854392498960124, 6.0)

In [23]:
# масштабирование
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(Xtrain)
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [24]:
reg_scaler = LinearRegression().fit(Xtrain, ytrain)

In [25]:
ypred_train = reg_scaler.predict(Xtrain)
ypred_test = reg_scaler.predict(Xtest)

In [26]:
np.sqrt(mean_squared_error(ytrain, ypred_train)), np.sqrt(mean_squared_error(ytest, ypred_test))

(21.027210585099684, 20.61301249229538)

In [27]:
# добавление регуляризации
reg2 = Lasso(alpha=0.5).fit(Xtrain, ytrain)

ypred2 = reg2.predict(Xtest)
print('Scores:', r2_score(ytest,ypred2), mean_squared_error(ytest,ypred2))
print(reg2.coef_,'\n')

Scores: 0.4401102946199038 427.3962995439767
[ 4.26948028e-01 -0.00000000e+00  9.57339185e-01  2.31320520e+01
 -1.36343747e+03  1.41279532e+03  2.43387493e+00 -3.82842865e+00
 -3.31524672e+00  5.84744422e-01 -0.00000000e+00  1.41215211e+00
 -2.02167787e+00] 



In [None]:
from sklearn.linear_model import LassoCV

n_alphas = 200
alphas = np.linspace(0.1, 20, n_alphas)

lasso_cv = LassoCV(alphas=alphas,
                   cv=5, random_state=17)
lasso_cv.fit(X, y)

In [29]:
reg_21 = Lasso(alpha=lasso_cv.alpha_)
reg_21.fit(Xtrain, ytrain)

ypred2_train = reg_21.predict(Xtrain)
ypred2_test = reg_21.predict(Xtest)

print(f'Score: {r2_score(ytrain, ypred2_train)} - на обучающей выборке, {r2_score(ytest, ypred2_test)} - на тестовой')
# моделька переобучилась

Score: 0.9792037366836112 - на обучающей выборке, 0.3052293653378221 - на тестовой


#### Ridge + одновременная регуляризация

In [30]:
for a in np.arange(0.1, 5.1, 0.2):
  reg3 = Ridge(alpha=a).fit(Xtrain, ytrain)
  ypred3 = reg3.predict(Xtest)

  print('aplha={}'.format(a))
  print(f'R2: {r2_score(ytest, ypred3)}, MAE: {mean_absolute_error(ytest, ypred3)}')

aplha=0.1
R2: 0.3412103786902696, MAE: 16.307211288130706
aplha=0.30000000000000004
R2: -0.15262921685526631, MAE: 23.36807121065196
aplha=0.5000000000000001
R2: -0.8084641158497605, MAE: 30.080155691228825
aplha=0.7000000000000001
R2: -1.4967949773065787, MAE: 35.40337631465471
aplha=0.9000000000000001
R2: -2.162608236554147, MAE: 39.85013925540638
aplha=1.1000000000000003
R2: -2.783693269481103, MAE: 43.531232333388644
aplha=1.3000000000000003
R2: -3.352805195696159, MAE: 46.640908977487335
aplha=1.5000000000000004
R2: -3.8695803170754406, MAE: 49.28619114251763
aplha=1.7000000000000004
R2: -4.336722286883317, MAE: 51.54626556528625
aplha=1.9000000000000004
R2: -4.758159840975237, MAE: 53.49412693417864
aplha=2.1000000000000005
R2: -5.138151936586666, MAE: 55.18548589700434
aplha=2.3000000000000007
R2: -5.480861461321943, MAE: 56.66359776998462
aplha=2.5000000000000004
R2: -5.790164802130277, MAE: 57.96255811864838
aplha=2.7000000000000006
R2: -6.06958071173788, MAE: 59.1096045696575

In [None]:
from sklearn.linear_model import RidgeCV

n_alphas = 200
ridge_alphas = np.logspace(-2, 6, n_alphas)

ridge_cv = RidgeCV(alphas=ridge_alphas,
                   scoring='neg_mean_squared_error',
                   cv=3)
ridge_cv.fit(X, y)

In [32]:
ridge_cv.alpha_ # получилось слишком большим

1000000.0

In [33]:
regressor3 = Lasso(alpha=ridge_cv.alpha_)

regressor3.fit(Xtrain, ytrain)

ypred3_train = regressor3.predict(Xtrain)
ypred3_test = regressor3.predict(Xtest)

print('Scores:', r2_score(ytrain,ypred3_train), r2_score(ytest,ypred3_test))

Scores: 0.0 -1.4666420652046805


In [34]:
regressor3.coef_ # все занулилось

array([ 0., -0., -0.,  0.,  0.,  0.,  0., -0.,  0.,  0.,  0.,  0., -0.])

In [37]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

In [None]:
elastic_cv = ElasticNetCV(
    l1_ratio=np.linspace(0.1, 1, 10),
    alphas=np.logspace(-4, 2, 100),
    cv=3
    )

elastic_cv.fit(Xtrain, ytrain)

regressor_elastic = ElasticNet(alpha=elastic_cv.alpha_, l1_ratio=elastic_cv.l1_ratio_)
regressor_elastic.fit(Xtrain, ytrain)

ypred_train = regressor_elastic.predict(Xtrain)
ypred_test = regressor_elastic.predict(Xtest)

In [41]:
# получилось не лучше чем случайное предсказание, возможно данных слишком мало, либо необходимо данные лучше подготовить
print('Scores:', r2_score(ytrain, ypred_train), r2_score(ytest, ypred_test))

Scores: 0.9268161946324485 -0.03889660420514063
