In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.decomposition import PCA

Модель: усредненное значение пяти ансамблей.

Набор данных разбит на 5 частей, каждый ансамбль обучается на своих четырех частях.

Метаалгоритм -- линейная регрессия.

Алгоритмы первого уровня:

1. kNN
2. Случайный лес
3. Градиентный бустинг
4. Градиентный бустинг с гистограммами

In [2]:
train = pd.read_csv('train.csv')
X_pd = train.copy()

marks_dict = {
    'no data':0,
    'poor':1,
    'satisfactory':2,
    'good':3,
    'excellent':4 
}
marks_text = X_pd['f152'].unique()

X_pd['f1_time'] = X_pd['f1'].apply(lambda x: int(dt.datetime.strptime(x, "%Y-%m-%d").timestamp()))
for text in X_pd['f152'].unique():
    X_pd['f152'].replace(text, marks_dict[text], inplace=True)
    
X_pd['f11'].replace('OwnerOccupier', 65, inplace=True)
X_pd['f11'].replace('Investment', 75, inplace = True)

for col in ['f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118']:
    X_pd[col].replace('no', 0, inplace=True)
    X_pd[col].replace('yes', 1, inplace = True)
X_pd.drop(columns=['f291', 'id', 'f1'], inplace = True)
y_pd = train['f291']

In [3]:
testX = pd.read_csv('test.csv')
testX_pr = testX.copy()

testX['f1_time'] = testX['f1'].apply(lambda x: int(dt.datetime.strptime(x, "%Y-%m-%d").timestamp()))
for text in ['no data', 'poor', 'satisfactory', 'good', 'excellent']:
    testX['f152'].replace(text, marks_dict[text], inplace=True)
testX['f11'].replace('OwnerOccupier', 65, inplace=True)
testX['f11'].replace('Investment', 75, inplace = True)
for col in ['f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118']:
    testX[col].replace('no', 0, inplace=True)
    testX[col].replace('yes', 1, inplace = True)
testX.drop(columns=['id', 'f1'], inplace = True)



In [4]:
col_names = []
for i in range(len(X_pd['f12'].unique())):
    col_names.append('Is '+str(i))

    
testX.index = list(range(24736, 24736+6095))    
big = pd.concat([testX, X_pd])
big = big.sort_index()
    
enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(big[['f12']]).toarray(), columns = col_names)
big = big.join(enc_df)
big.drop(columns=['f12'], inplace=True)
X_pd = big.loc[:24376, :]
testX = big.loc[24376:, :]

In [5]:
my_imp = KNNImputer(n_neighbors=10, weights='distance')
X_pd = my_imp.fit_transform(X_pd)
np.save('X_pd_kNN_diff10', X_pd)

X_pd.shape

(24376, 435)

In [6]:
testX = my_imp.transform(testX)
np.save('testX_pd_kNN10', testX)

In [7]:
X = X_pd
y = np.array(y_pd)
X.shape, y.shape

((24376, 435), (24376,))

Здесь пытаемся обучить kNN с весами distance.

In [5]:
X_fixed = X_pd
y_fixed = y_pd


for neigh in range(20,40,2):
    for k in [1]:
        print('+++++++++++')
        my_kf = KFold(n_splits = 3)
        for train_index, test_index in my_kf.split(X_fixed):
            X_train, X_test = X_fixed[train_index], X_fixed[test_index]
            y_train, y_test = y_fixed[train_index], y_fixed[test_index]
            knnClassifier = KNeighborsRegressor(n_neighbors=neigh, p = k, weights='distance', n_jobs=-1)
            knnClassifier.fit(X_train, y_train)
            tmp_res = knnClassifier.predict(X_test)
            accur = mean_squared_log_error(y_test, tmp_res.round(), squared=False)
            print('{0} [k={1}, neigh={2}]'.format(accur, k, neigh))

+++++++++++
0.5703885253991824 [k=1, neigh=20]
0.5754986517001063 [k=1, neigh=20]
0.5911248140818299 [k=1, neigh=20]
+++++++++++
0.5695006613551054 [k=1, neigh=22]
0.5749971094361351 [k=1, neigh=22]
0.5904592658456379 [k=1, neigh=22]
+++++++++++
0.5691952677065695 [k=1, neigh=24]
0.5744094190757559 [k=1, neigh=24]
0.5903805923886927 [k=1, neigh=24]
+++++++++++
0.5684914316868126 [k=1, neigh=26]
0.5738176046285969 [k=1, neigh=26]
0.5899453535736776 [k=1, neigh=26]
+++++++++++
0.5683971921042209 [k=1, neigh=28]
0.5734085854342993 [k=1, neigh=28]
0.5900173597399162 [k=1, neigh=28]
+++++++++++
0.5682121452316189 [k=1, neigh=30]
0.5730566672603853 [k=1, neigh=30]
0.5901685579134256 [k=1, neigh=30]
+++++++++++
0.5681145780298438 [k=1, neigh=32]
0.5731662361071416 [k=1, neigh=32]
0.5896797567045062 [k=1, neigh=32]
+++++++++++
0.5679476499701146 [k=1, neigh=34]
0.5726547745636241 [k=1, neigh=34]
0.589486679458409 [k=1, neigh=34]
+++++++++++
0.5676230181617676 [k=1, neigh=36]
0.5725828396665364

Самый оптимальный вариант в итоге был с 36 соседями.

Тут обучаем линейную регрессию

In [9]:
X_fixed = X_pd
y_fixed = y_pd
n_spl = 10

my_kf = KFold(n_splits = n_spl)
tot_accur = 0
for train_index, test_index in my_kf.split(X_fixed):
    X_train, X_test = X_fixed[train_index], X_fixed[test_index]
    y_train, y_test = y_fixed[train_index], y_fixed[test_index]
    my_lr = LinearRegression(positive=True)
    my_lr.fit(X_train, y_train)
    tmp_res = my_lr.predict(X_test)
    accur = mean_squared_log_error(y_test, abs(tmp_res), squared=False)
    tot_accur += accur
    print('LR: {0}'.format(accur))
    
print(tot_accur/n_spl)

LR: 0.7964752699470137
LR: 0.5271143976059428
LR: 0.5328429521998025
LR: 0.5419605197610966
LR: 0.5341319043537456
LR: 0.5375031145407737


KeyboardInterrupt: 

Обучим случайный лес

In [19]:
X_fixed = X_pd
y_fixed = y_pd

my_kf = KFold(n_splits = 3)
for train_index, test_index in my_kf.split(X_fixed):
    X_train, X_test = X_fixed[train_index], X_fixed[test_index]
    y_train, y_test = y_fixed[train_index], y_fixed[test_index]
    my_clf = RandomForestRegressor(n_estimators=150, n_jobs = -1)
    my_clf.fit(X_train, y_train)

    tmp_res = my_clf.predict(X_test)

    accur = mean_squared_log_error(y_test, abs(tmp_res), squared=False)
    print('Random forest: {0}'.format(accur))

Random forest: 0.47287702708992635
Random forest: 0.47435584442871565
Random forest: 0.4920860279519843


Градиентный бустинг

In [55]:
X_fixed = X_pd
y_fixed = y_pd


for estim in range(36,38,10):
    for m_depth in range(5,13,2):
        for loss in ['squared_error']:
            print('++++++++')
            print('estim = {0}, max_depth= {1}, loss = {2}'.format(estim, m_depth, loss))
            my_kf = KFold(n_splits = 3)
            for train_index, test_index in my_kf.split(X_fixed):
                X_train, X_test = X_fixed[train_index], X_fixed[test_index]
                y_train, y_test = y_fixed[train_index], y_fixed[test_index]

                my_clf = GradientBoostingRegressor(n_estimators=estim, max_depth=m_depth, loss=loss)
                my_clf.fit(X_train, y_train)

                tmp_res = my_clf.predict(X_test)

                accur = mean_squared_log_error(y_test, abs(tmp_res), squared=False)
                print('Gradient boosting: {0}'.format(accur))
                
my_clf.feature_importance_

++++++++
estim = 100, max_depth= 5, loss = squared_error
Gradient boosting: 0.4734022610145012


KeyboardInterrupt: 

Градиентный бустинг с гистограммами

In [57]:
X_fixed = X_pd
y_fixed = y_pd


for max_leaf_nodes in range(20,50,5):
    for m_depth in range(5,10,2):
        for loss in ['squared_error']:
            print('++++++++')
            print('max_leaf_nodes = {0}, max_depth= {1}, loss = {2}'.format(max_leaf_nodes, m_depth, loss))
            my_kf = KFold(n_splits = 3)
            for train_index, test_index in my_kf.split(X_fixed):
                X_train, X_test = X_fixed[train_index], X_fixed[test_index]
                y_train, y_test = y_fixed[train_index], y_fixed[test_index]

                my_clf = HistGradientBoostingRegressor(max_iter=160, max_depth=m_depth, loss=loss, max_leaf_nodes=max_leaf_nodes)
                my_clf.fit(X_train, y_train)

                tmp_res = my_clf.predict(X_test)

                accur = mean_squared_log_error(y_test, abs(tmp_res), squared=False)
                print('Gradient boosting: {0}'.format(accur))

++++++++
max_leaf_nodes = 20, max_depth= 5, loss = squared_error
Gradient boosting: 0.4605381986312555
Gradient boosting: 0.4668740461065695
Gradient boosting: 0.4855899358896991
++++++++
max_leaf_nodes = 20, max_depth= 7, loss = squared_error
Gradient boosting: 0.4636511319083091
Gradient boosting: 0.4675882445475966
Gradient boosting: 0.4848098352887786
++++++++
max_leaf_nodes = 20, max_depth= 9, loss = squared_error
Gradient boosting: 0.45946850171705705
Gradient boosting: 0.47054462157761284
Gradient boosting: 0.48278661262958394
++++++++
max_leaf_nodes = 25, max_depth= 5, loss = squared_error
Gradient boosting: 0.4621921248352332
Gradient boosting: 0.4688988432480071
Gradient boosting: 0.48495162170423595
++++++++
max_leaf_nodes = 25, max_depth= 7, loss = squared_error
Gradient boosting: 0.45843964135140275
Gradient boosting: 0.46666868748165014
Gradient boosting: 0.4840735101687848
++++++++
max_leaf_nodes = 25, max_depth= 9, loss = squared_error
Gradient boosting: 0.4577757126894

In [6]:
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(X_pd, y_pd, train_size=0.72)

In [6]:
X_pd.shape, testX.shape

((24376, 435), (6095, 435))

Усредняем несколько ансамблей

In [8]:
my_kf = KFold(n_splits = 5)

pack_of_models = []
my_models = []
for train_index, test_index in my_kf.split(X_pd):
    X_train, X_test = X_pd[train_index], X_pd[test_index]
    y_train, y_test = y_pd[train_index], y_pd[test_index]
    my_models = []
    my_models.append(KNeighborsRegressor(n_neighbors=36, p = 1, weights='distance', n_jobs=-1).fit(X_train, y_train))
    my_models.append(RandomForestRegressor(n_estimators=80, n_jobs = -1).fit(X_train, y_train))
    my_models.append(GradientBoostingRegressor(n_estimators=100, max_depth=9).fit(X_train, y_train))
    my_models.append(HistGradientBoostingRegressor(max_iter=160, max_depth=9, max_leaf_nodes = 40).fit(X_train, y_train))
    print('Base Models')
    meta_features = []
    for elem in X_test:
        tmp_res = np.array([])
        for model in my_models:
            tmp_res = np.hstack([tmp_res, model.predict(elem.reshape(1, len(elem)))])
        meta_features.append(tmp_res)
    meta_features = np.array(meta_features)
    pack_of_models.append([my_models, LinearRegression().fit(meta_features, y_test)])
    print('Meta LR')
    
print('Model is done')

Base Models
Meta LR
Base Models
Meta LR
Base Models
Meta LR
Base Models
Meta LR
Base Models
Meta LR
Model is done


In [20]:
print('Test has started')
new_meta_features = []
my_result = []
for elem in X_pd:
    tmp_answ = []    
    for model_string in pack_of_models:
        tmp_res = np.array([])
        for model in model_string[0]:
            tmp_res = np.hstack([tmp_res, model.predict(elem.reshape(1, len(elem)))])
        tmp_answ.append(model_string[1].predict(tmp_res.reshape(1,len(tmp_res))))
    tmp_answ = np.array(tmp_answ)
    my_result.append(np.mean(tmp_answ))

my_result = np.array(my_result)
    

accur = mean_squared_log_error(y_pd, abs(my_result), squared=False)
print(accur)

Test has started


KeyboardInterrupt: 

In [9]:
print('Test has started')
new_meta_features = []
my_result = []
for elem in testX:
    tmp_answ = []    
    for model_string in pack_of_models:
        tmp_res = np.array([])
        for model in model_string[0]:
            tmp_res = np.hstack([tmp_res, model.predict(elem.reshape(1, len(elem)))])
        tmp_answ.append(model_string[1].predict(tmp_res.reshape(1,len(tmp_res))))
    tmp_answ = np.array(tmp_answ)
    my_result.append(np.mean(tmp_answ))

my_result = np.array(my_result)
my_result


Test has started


array([ 6745565.48669781,  5432736.76855097,  4055334.72111829, ...,
        7601768.57051382,  3687879.40889432, 10820401.64183781])

In [10]:
my_result.round()

array([ 6745565.,  5432737.,  4055335., ...,  7601769.,  3687879.,
       10820402.])

In [11]:
my_answ = pd.DataFrame({'id': range(1,1+6095), 'prediction': my_result.round()})
my_answ.to_csv('Doynichenko_Maxim.csv', index=False)