In [47]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split

import warnings

warnings.filterwarnings("ignore")

## Number 8

**Описание**

Используем два метода: логистическую регрессию и случайный лес (для разнообразия вариантов выполнения задания).


LabelEncoder преобразует метки классов (A, B, C, ...) в целые числа, с которыми работает модель; после предсказания `inverse_transform` возвращает исходные буквенные метки.


StandardScaler использовать не обязательно — при желании его можно убрать; тогда модель обучается и предсказывает на исходных признаках.

### Загрузка данных

In [48]:
# Read the task 8 training and test tables; the paths are relative to the
# directory the notebook is run from.
df_train, test = map(pd.read_csv, ('./data/train_8.csv', './data/test_8.csv'))
# Preview the first five training rows as the cell output.
df_train.head(5)

Unnamed: 0,x1,x2,x3,f
0,0.455058,0.146759,2.426221,D
1,0.403961,1.402071,2.891587,B
2,0.351298,0.095268,2.26343,C
3,0.664298,0.600407,0.670098,A
4,0.463471,1.633033,1.037303,D


### Оценка качества моделей на валидационной выборке

In [49]:
# Encode the class labels in column 'f' as integers and keep them as a
# one-column DataFrame, which the later cells flatten when needed.
encoder = LabelEncoder()
X_train = df_train.drop(columns=['f'])
y_train = pd.DataFrame(encoder.fit_transform(df_train['f']), columns=['f'])

# Hold out a validation part. The fixed random_state makes the split, and so
# the validation scores printed below, identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42)

In [50]:
# Fit the scaler on the training part only, then replace X_train with its
# scaled version. StandardScaler ignores the target, so it is not passed.
# X_train is overwritten here: re-run the split cell before re-running this one.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [51]:
# Train logistic regression on the scaled training part; the one-column label
# frame is flattened to the 1-D array the estimator expects.
linear_model = LogisticRegression().fit(X_train, y_train['f'].to_numpy())

# The validation features go through the same fitted scaler before prediction.
predict = linear_model.predict(scaler.transform(X_val))

In [52]:
# Compare decoded true and predicted labels. y_val is a one-column DataFrame,
# so it is flattened explicitly instead of relying on the encoder's implicit
# column-vector conversion (which only emits a warning).
print(
    'Accuracy linear model on validation data =',
    accuracy_score(
        encoder.inverse_transform(y_val.values.ravel()),
        encoder.inverse_transform(predict),
    ),
)

Accuracy linear model on validation data = 0.9466666666666667


In [53]:
# Random forest on the same scaled training part. A fixed random_state makes
# the forest, and the accuracy printed below, reproducible between runs.
random_fores_classifier = RandomForestClassifier(random_state=42)
# Flatten the one-column label frame to 1-D, as the logistic regression cell
# does; passing the column vector only works through an implicit conversion.
random_fores_classifier.fit(X_train, y_train.values.ravel())
predict = random_fores_classifier.predict(scaler.transform(X_val))

In [54]:
# Compare decoded true and predicted labels. y_val is a one-column DataFrame,
# so it is flattened explicitly instead of relying on the encoder's implicit
# column-vector conversion (which only emits a warning).
print(
    'Accuracy RFC model on validation data =',
    accuracy_score(
        encoder.inverse_transform(y_val.values.ravel()),
        encoder.inverse_transform(predict),
    ),
)

Accuracy RFC model on validation data = 0.8266666666666667


### Предсказание с помощью линейной модели 

In [55]:
# Refit everything on the full training table (no validation hold-out).
X_train = df_train.drop(columns=['f'])
encoder = LabelEncoder()
y_train = pd.DataFrame(encoder.fit_transform(df_train['f']), columns=['f'])

# Optional step: the scaler can be dropped (see the alternative call below).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

linear_model = LogisticRegression().fit(X_train, y_train['f'].to_numpy())
predict = linear_model.predict(scaler.transform(test))
# Without StandardScaler use instead: predict = linear_model.predict(test)

# Decode the integer predictions back to the original class labels.
predict = pd.DataFrame(encoder.inverse_transform(predict), columns=['f'])

### Предсказание с помощью случайного леса

In [56]:
# Refit everything on the full training table (no validation hold-out).
encoder = LabelEncoder()
X_train = df_train.drop(columns=['f'])
y_train = pd.DataFrame(encoder.fit_transform(df_train['f']), columns=['f'])

scaler = StandardScaler()  # Optional step
X_train = scaler.fit_transform(X_train)

# A fixed random_state makes the submitted predictions reproducible.
random_fores_classifier = RandomForestClassifier(random_state=42)
# Flatten the one-column label frame to the 1-D target the estimator expects.
random_fores_classifier.fit(X_train, y_train.values.ravel())
predict = random_fores_classifier.predict(scaler.transform(test))
# Without StandardScaler use instead: predict = random_fores_classifier.predict(test)

# Decode the integer predictions back to the original class labels.
predict = pd.DataFrame(encoder.inverse_transform(predict), columns=['f'])

### Сохраняем результат

In [57]:
# Write the current `predict` frame to disk. Every prediction cell reassigns
# `predict`, so a top-to-bottom run saves the random forest output.
# The row index is written as the first column; pass index=False to omit it.
predict.to_csv('number_8_result.csv', index=True)

## Number 9


In [58]:
# Read the task 9 training and test tables; this replaces the task 8 frames
# held in `df_train` and `test`.
df_train, test = map(pd.read_csv, ('./data/train_9.csv', './data/test_9.csv'))
# Preview the first five training rows as the cell output.
df_train.head(5)

Unnamed: 0,u,x,y,z
0,21,4,5,6
1,24,7,5,7
2,27,9,7,5
3,26,6,7,7
4,22,4,7,4


### Оценка качества моделей на валидационной выборке

In [59]:
# Features are every column except the target 'u'; the target is kept as a
# one-column DataFrame, which the later cells flatten when needed.
X_train = df_train.drop(columns=['u'])
y_train = df_train[['u']].copy()

# Hold out a validation part. The fixed random_state makes the split, and so
# the MAE values printed below, identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42)

In [60]:
# Fit the scaler on the training part only, then replace X_train with its
# scaled version. StandardScaler ignores the target, so it is not passed.
# X_train is overwritten here: re-run the split cell before re-running this one.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [61]:
# Ordinary least squares on the scaled training part; the one-column target
# frame is flattened to a 1-D array.
linear_model = LinearRegression().fit(X_train, y_train['u'].to_numpy())

# The validation features go through the same fitted scaler before prediction.
predict = linear_model.predict(scaler.transform(X_val))

# Mean absolute error on the held-out part.
print('MAE linear model on validation data =', mean_absolute_error(y_val, predict))

MAE linear model on validation data = 0.678897697264843


In [62]:
# Random forest on the same scaled training part. A fixed random_state makes
# the forest, and the MAE printed below, reproducible between runs.
random_forest_reg = RandomForestRegressor(random_state=42)
# Flatten the one-column target frame to 1-D, as the linear regression cell
# does; passing the column vector only works through an implicit conversion.
random_forest_reg.fit(X_train, y_train.values.ravel())
predict = random_forest_reg.predict(scaler.transform(X_val))

# Mean absolute error on the held-out part.
print('MAE RFR model on validation data =', mean_absolute_error(y_val, predict))

MAE RFR model on validation data = 0.8641140500240505


### Предсказание с помощью линейной модели 

In [63]:
# Refit on the full training table (no validation hold-out).
X_train = df_train.drop(columns=['u'])
y_train = pd.DataFrame(df_train['u'], columns=['u'])

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

linear_model = LinearRegression().fit(X_train, y_train['u'].to_numpy())
predict = linear_model.predict(scaler.transform(test))

# NOTE(review): the output column is named 'f' although the target here is
# 'u' - confirm which name the expected result format requires.
predict = pd.DataFrame(predict, columns=['f'])

### Предсказание с помощью случайного леса

In [64]:
# Refit on the full training table (no validation hold-out).
X_train = df_train.drop(columns=['u'])
y_train = pd.DataFrame(df_train['u'], columns=['u'])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# A fixed random_state makes the submitted predictions reproducible.
random_forest_reg = RandomForestRegressor(random_state=42)
# Flatten the one-column target frame to the 1-D target the estimator expects.
random_forest_reg.fit(X_train, y_train.values.ravel())
predict = random_forest_reg.predict(scaler.transform(test))

# NOTE(review): the output column is named 'f' although the target here is
# 'u' - confirm which name the expected result format requires.
predict = pd.DataFrame(predict, columns=['f'])

In [65]:
# Display the `predict` frame. When the cells are run top to bottom it holds
# the random forest predictions assigned in the previous cell.
predict

Unnamed: 0,f
0,24.380000
1,20.401905
2,23.529786
3,28.373750
4,23.736070
...,...
95,24.993952
96,23.935500
97,30.251667
98,18.250381


### Сохраняем результат

In [66]:
# Write the current `predict` frame to disk. Every prediction cell reassigns
# `predict`, so a top-to-bottom run saves the random forest output.
# The row index is written as the first column; pass index=False to omit it.
predict.to_csv('number_9_result.csv', index=True)