In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import torch

In [2]:
# Names of the 300 feature columns: f_0 ... f_299.
features = [f'f_{idx}' for idx in range(300)]

# Explicit dtype for every column that will be read from the CSV:
# the two id columns as small integers, the target and all features as float16.
data_types_dict = dict(
    time_id='int32',
    investment_id='int16',
    target='float16',
    **{name: 'float16' for name in features},
)

In [3]:
# Because `chunksize` is given, read_csv returns an iterator over DataFrame chunks
# instead of a DataFrame. Only the columns listed in data_types_dict are read, each
# with its declared dtype, and column 0 is used as the index.
data = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    usecols=list(data_types_dict),
    dtype=data_types_dict,
    index_col=0,
    chunksize=3141410,
)

In [4]:
# Take the first chunk produced by the reader created in the previous cell.
# NOTE: this rebinds `data` from the reader to the chunk it yields, so a second run
# of this cell would iterate over that chunk instead of over the reader.
data = next(iter(data))

  mask |= (ar1 == a)


In [5]:
# Keep the target column in its own variable, then remove it and investment_id
# from the frame in place so that only the remaining columns are left in `data`.
target = data['target']
data.drop(columns=['target', 'investment_id'], inplace=True)

In [6]:
"""
from scipy.stats import pearsonr 
def est_score(model):
    y_pred = model.predict(X_test)
    return pearsonr(y_pred, y_test)[0]
"""

'\nfrom scipy.stats import pearsonr \ndef est_score(model):\n    y_pred = model.predict(X_test)\n    return pearsonr(y_pred, y_test)[0]\n'

In [7]:
"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, stratify=data.index)
"""

'\nfrom sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, stratify=data.index)\n'

In [8]:
#del data, target

In [9]:
"""
from sklearn.linear_model import LinearRegression
reg_model = LinearRegression(copy_X=False).fit(X_train, y_train)
"""

'\nfrom sklearn.linear_model import LinearRegression\nreg_model = LinearRegression(copy_X=False).fit(X_train, y_train)\n'

In [10]:
#est_score(reg_model)

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def move_to(data, device):
    """
    Transfer an object, or a (possibly nested) list/tuple of objects, to ``device``.

    :param data: anything exposing ``.to(device, non_blocking=...)``, or a list/tuple
        of such objects (nesting allowed)
    :param device: destination device
    :return: the result of ``.to`` for a single object; for a list or a tuple, a
        list holding the moved items in the same order
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(move_to(item, device))
    return moved

In [12]:
def pearson_loss(x, y, eps=1e-12):
    """
    Negative Pearson correlation of two tensors, usable as a loss to minimise.

    :param x: tensor of predictions
    :param y: tensor of targets with the same shape as ``x``
    :param eps: lower bound applied to each sum of squared deviations before the
        square root is taken
    :return: scalar tensor; -1 for perfectly positively correlated inputs, +1 for
        perfectly negatively correlated ones, and 0 (instead of NaN) when either
        input has no variance
    """
    vx = x - torch.mean(x)
    vy = y - torch.mean(y)
    # A constant input makes its sum of squares exactly 0, which would turn the loss
    # into 0/0 and the sqrt gradient into NaN. Clamping *before* the sqrt keeps both
    # the value and the gradient finite; for any sum >= eps the result is unchanged.
    norm_x = torch.sqrt(torch.clamp(torch.sum(vx * vx), min=eps))
    norm_y = torch.sqrt(torch.clamp(torch.sum(vy * vy), min=eps))
    return -torch.sum(vx * vy) / (norm_x * norm_y)

In [13]:
class DNN_Model(torch.nn.Module):
    """
    Fully connected regressor: 300 inputs, four hidden layers of width ``deep_size``
    with SiLU activations, one linear output. Dropout (p=0.7) is applied after the
    second and after the third hidden layer.
    """

    def __init__(self, deep_size):
        """
        :param deep_size: width of every hidden layer
        """
        super().__init__()
        self.linear_1 = torch.nn.Linear(300, deep_size)
        self.linear_2 = torch.nn.Linear(deep_size, deep_size)
        self.linear_3 = torch.nn.Linear(deep_size, deep_size)
        self.linear_4 = torch.nn.Linear(deep_size, deep_size)
        self.linear_5 = torch.nn.Linear(deep_size, 1)
        self.dout = torch.nn.Dropout(p=0.7)
        self.act = torch.nn.SiLU()

    def fit(self, X, y, iters, batch_size = 4096, subset = 0.8, silent=False, plot=False, val=False, X_val=None, y_val=None):
        """
        Train with Adam (lr=5e-4) on ``pearson_loss`` over consecutive row batches.

        :param X: feature frame; rows are taken with ``.iloc`` and read via ``.values``
        :param y: target series aligned row-for-row with ``X``
        :param iters: number of passes over the data
        :param batch_size: rows per optimisation step (at least 2)
        :param subset: accepted for interface compatibility; currently unused
        :param silent: suppress the per-epoch and final prints
        :param plot: draw the recorded loss curve(s) after training
        :param val: also evaluate on ``X_val`` / ``y_val`` after every epoch
        :param X_val: validation features, required when ``val`` is true
        :param y_val: validation targets, required when ``val`` is true
        :raises ValueError: if there are fewer than two rows or ``batch_size < 2``
        """
        n_rows = y.shape[0]
        if n_rows < 2 or batch_size < 2:
            # A correlation needs at least two points per batch.
            raise ValueError('fit needs at least two rows and batch_size >= 2')

        move_to(self, device)
        if val:
            tensor_train_val = move_to(torch.FloatTensor(X_val.values), device)
            tensor_y_val = move_to(torch.FloatTensor(y_val.values), device)
        scores_train = []
        scores_val = []
        loss = None

        opt = torch.optim.Adam(self.parameters(), lr=5e-4)

        for epoch in range(iters):
            self.train()
            # Walk over *all* rows, including the final (possibly shorter) batch;
            # this also trains when the data set is smaller than one batch.
            for start in range(0, n_rows, batch_size):
                stop = min(start + batch_size, n_rows)
                if stop - start < 2:
                    # A single trailing row has no variance, so its loss is undefined.
                    continue
                opt.zero_grad()
                X_tensor = move_to(torch.FloatTensor(X.iloc[start:stop, :].values), device)
                y_tensor = move_to(torch.FloatTensor(y.iloc[start:stop].values), device)
                loss = pearson_loss(self.tensor_forward(X_tensor).view(-1), y_tensor)
                loss.backward()
                opt.step()

            # Per-epoch bookkeeping: the training score is the loss of the last batch.
            if val:
                self.eval()
                with torch.no_grad():  # evaluation only, no graph needed
                    val_loss = pearson_loss(self.tensor_forward(tensor_train_val).view(-1), tensor_y_val)
                scores_val.append(float(val_loss))
            scores_train.append(float(loss.detach()))

            if not silent:
                print(f'epoch {epoch} finished with {loss} loss')
                if val:
                    print(f'val loss {val_loss}')

        if not silent:
            print(f'Finished with {loss} loss')
        if plot:
            epochs = list(range(iters))
            if val:
                sns.lineplot(x=epochs, y=scores_train, label='train')
                sns.lineplot(x=epochs, y=scores_val, label='val')
            else:
                sns.lineplot(x=epochs, y=scores_train)
            plt.show()

    def tensor_forward(self, X_tensor):
        """
        Run the network on a tensor of inputs.

        :param X_tensor: float tensor whose last dimension is 300
        :return: tensor of predictions with last dimension 1
        """
        hidden = move_to(X_tensor, device)
        hidden = self.act(self.linear_1(hidden))
        hidden = self.act(self.linear_2(hidden))
        hidden = self.dout(hidden)
        hidden = self.act(self.linear_3(hidden))
        hidden = self.dout(hidden)
        hidden = self.act(self.linear_4(hidden))
        return self.linear_5(hidden)

    def forward(self, X):
        """
        Run the network on an object exposing ``.values`` (e.g. a DataFrame).

        :param X: features; ``X.values`` is converted to a float tensor
        :return: same as :meth:`tensor_forward`
        """
        return self.tensor_forward(torch.FloatTensor(X.values))

In [14]:
class DNN_Model_1(torch.nn.Module):
    """
    Deeper fully connected regressor: 300 inputs, seven hidden layers of width
    ``deep_size`` with SiLU activations, one output unit. Dropout with p=0.7 follows
    the second and third hidden layers, dropout with p=0.3 follows the sixth.
    """

    def __init__(self, deep_size):
        """
        :param deep_size: width of every hidden layer
        """
        super().__init__()
        self.linear_1 = torch.nn.Linear(300, deep_size)
        self.linear_2 = torch.nn.Linear(deep_size, deep_size)
        self.linear_3 = torch.nn.Linear(deep_size, deep_size)
        self.linear_4 = torch.nn.Linear(deep_size, deep_size)
        self.linear_5 = torch.nn.Linear(deep_size, deep_size)
        self.linear_6 = torch.nn.Linear(deep_size, deep_size)
        self.linear_7 = torch.nn.Linear(deep_size, deep_size)
        self.linear_8 = torch.nn.Linear(deep_size, 1)
        self.dout = torch.nn.Dropout(p=0.7)
        self.small_dout = torch.nn.Dropout(p=0.3)
        self.act = torch.nn.SiLU()

    def fit(self, X, y, iters, batch_size = 4096, subset = 0.8, silent=False, plot=False, val=False, X_val=None, y_val=None):
        """
        Train with Adam (lr=5e-4) on ``pearson_loss`` over consecutive row batches.

        :param X: feature frame; rows are taken with ``.iloc`` and read via ``.values``
        :param y: target series aligned row-for-row with ``X``
        :param iters: number of passes over the data
        :param batch_size: rows per optimisation step (at least 2)
        :param subset: accepted for interface compatibility; currently unused
        :param silent: suppress the per-epoch and final prints
        :param plot: draw the recorded loss curve(s) after training
        :param val: also evaluate on ``X_val`` / ``y_val`` after every epoch
        :param X_val: validation features, required when ``val`` is true
        :param y_val: validation targets, required when ``val`` is true
        :raises ValueError: if there are fewer than two rows or ``batch_size < 2``
        """
        n_rows = y.shape[0]
        if n_rows < 2 or batch_size < 2:
            # A correlation needs at least two points per batch.
            raise ValueError('fit needs at least two rows and batch_size >= 2')

        move_to(self, device)
        if val:
            tensor_train_val = move_to(torch.FloatTensor(X_val.values), device)
            tensor_y_val = move_to(torch.FloatTensor(y_val.values), device)
        scores_train = []
        scores_val = []
        loss = None

        opt = torch.optim.Adam(self.parameters(), lr=5e-4)

        for epoch in range(iters):
            self.train()
            # Walk over *all* rows, including the final (possibly shorter) batch;
            # this also trains when the data set is smaller than one batch.
            for start in range(0, n_rows, batch_size):
                stop = min(start + batch_size, n_rows)
                if stop - start < 2:
                    # A single trailing row has no variance, so its loss is undefined.
                    continue
                opt.zero_grad()
                X_tensor = move_to(torch.FloatTensor(X.iloc[start:stop, :].values), device)
                y_tensor = move_to(torch.FloatTensor(y.iloc[start:stop].values), device)
                loss = pearson_loss(self.tensor_forward(X_tensor).view(-1), y_tensor)
                loss.backward()
                opt.step()

            # Per-epoch bookkeeping: the training score is the loss of the last batch.
            if val:
                self.eval()
                with torch.no_grad():  # evaluation only, no graph needed
                    val_loss = pearson_loss(self.tensor_forward(tensor_train_val).view(-1), tensor_y_val)
                scores_val.append(float(val_loss))
            scores_train.append(float(loss.detach()))

            if not silent:
                print(f'epoch {epoch} finished with {loss} loss')
                if val:
                    print(f'val loss {val_loss}')

        if not silent:
            print(f'Finished with {loss} loss')
        if plot:
            epochs = list(range(iters))
            if val:
                sns.lineplot(x=epochs, y=scores_train, label='train')
                sns.lineplot(x=epochs, y=scores_val, label='val')
            else:
                sns.lineplot(x=epochs, y=scores_train)
            plt.show()

    def tensor_forward(self, X_tensor):
        """
        Run the network on a tensor of inputs.

        :param X_tensor: float tensor whose last dimension is 300
        :return: tensor of predictions with last dimension 1
        """
        hidden = move_to(X_tensor, device)
        hidden = self.act(self.linear_1(hidden))
        hidden = self.act(self.linear_2(hidden))
        hidden = self.dout(hidden)
        hidden = self.act(self.linear_3(hidden))
        hidden = self.dout(hidden)
        hidden = self.act(self.linear_4(hidden))
        hidden = self.act(self.linear_5(hidden))
        hidden = self.act(self.linear_6(hidden))
        hidden = self.small_dout(hidden)
        hidden = self.act(self.linear_7(hidden))
        # NOTE(review): SiLU is applied to the output unit as well, which bounds the
        # predictions from below (SiLU's minimum is about -0.28). Kept as is because
        # changing it alters the trained model; confirm it is intended.
        return self.act(self.linear_8(hidden))

    def forward(self, X):
        """
        Run the network on an object exposing ``.values`` (e.g. a DataFrame).

        :param X: features; ``X.values`` is converted to a float tensor
        :return: same as :meth:`tensor_forward`
        """
        return self.tensor_forward(torch.FloatTensor(X.values))

In [15]:
# TODO: implement subsampling: draw a subset of all row indices and pick the batches from it,
# i.e. double indexing

In [16]:
class StackedDNN:
    """
    Ensemble of networks whose predictions are averaged with equal weights.
    Each network gets a randomly drawn hidden width and a randomly drawn number
    of training epochs.
    """

    def __init__(self, n, model, min_deep=30, max_deep=40, min_iter=10, max_iter=15):
        """
        :param n: number of networks in the ensemble
        :param model: callable taking a hidden width and returning a network that
            exposes ``fit``, ``eval`` and ``forward``
        :param min_deep: smallest hidden width (inclusive)
        :param max_deep: upper bound of the hidden width (exclusive; may equal
            ``min_deep`` to fix the width)
        :param min_iter: smallest number of epochs per network (inclusive)
        :param max_iter: upper bound of the number of epochs (exclusive; may equal
            ``min_iter`` to fix it)
        """
        self.nets = [model(self._draw(min_deep, max_deep)) for _ in range(n)]
        self.min_iter = min_iter
        self.max_iter = max_iter

    @staticmethod
    def _draw(low, high):
        """Random integer from [low, high); ``low`` itself when both bounds coincide."""
        # np.random.randint raises ValueError for low == high, so a fixed value
        # could not be requested without this guard.
        if low == high:
            return low
        return np.random.randint(low, high)

    def fit(self, X, y, batch_size=4096, subset=0.8):
        """
        Fit every network in turn on the same data and print progress after each one.

        :param X: features, forwarded unchanged to each network's ``fit``
        :param y: targets, forwarded unchanged to each network's ``fit``
        :param batch_size: forwarded to each network's ``fit``
        :param subset: forwarded to each network's ``fit``
        """
        total = len(self.nets)
        for position, net in enumerate(self.nets, start=1):
            n_iters = self._draw(self.min_iter, self.max_iter)
            net.fit(X, y, n_iters, batch_size, subset, silent=True, plot=False, val=False, X_val=None, y_val=None)
            print(f'{position}/{total} fitted')

    def predict(self, X):
        """
        Average the predictions of all networks.

        :param X: features accepted by each network's ``forward``; must expose ``shape``
        :return: 1-D numpy array with one averaged prediction per row of ``X``
        """
        y_pred = np.empty((len(self.nets), X.shape[0]))
        # Inference only: skip building autograd graphs for every forward pass.
        with torch.no_grad():
            for row, net in enumerate(self.nets):
                net.eval()
                y_pred[row] = net.forward(X).detach().view(-1).cpu().numpy()
        y_pred /= len(self.nets)
        return np.sum(y_pred, axis=0)

In [17]:
# Ensemble of 30 DNN_Model networks; the width bounds (280, 301) and the epoch
# bounds (10, 12) are handed to StackedDNN, which draws the actual values.
nn_model = StackedDNN(
    n=30,
    model=DNN_Model,
    min_deep=280,
    max_deep=301,
    min_iter=10,
    max_iter=12,
)

#nn_model = StackedDNN(10, min_iter=12, max_iter=16, min_deep=280, max_deep=301)

In [18]:
# Train every network of the first ensemble on the full feature frame and target.
nn_model.fit(X=data, y=target)

1/30 fitted
2/30 fitted
3/30 fitted
4/30 fitted
5/30 fitted
6/30 fitted
7/30 fitted
8/30 fitted
9/30 fitted
10/30 fitted
11/30 fitted
12/30 fitted
13/30 fitted
14/30 fitted
15/30 fitted
16/30 fitted
17/30 fitted
18/30 fitted
19/30 fitted
20/30 fitted
21/30 fitted
22/30 fitted
23/30 fitted
24/30 fitted
25/30 fitted
26/30 fitted
27/30 fitted
28/30 fitted
29/30 fitted
30/30 fitted


In [19]:
#res = nn_model.predict(X_test)
#pearsonr(res, y_test)[0]

#0.19485894974919082  -1
#0.1976733510956316  -4
#0.2011706156204262  -10

In [20]:
# Ensemble of 30 DNN_Model_1 networks, built with the same width and epoch bounds
# as the first ensemble.
nn_model_1 = StackedDNN(
    n=30,
    model=DNN_Model_1,
    min_deep=280,
    max_deep=301,
    min_iter=10,
    max_iter=12,
)

#nn_model = StackedDNN(10, min_iter=12, max_iter=16, min_deep=280, max_deep=301)

In [21]:
# Train every network of the second ensemble on the full feature frame and target.
nn_model_1.fit(X=data, y=target)

1/30 fitted
2/30 fitted
3/30 fitted
4/30 fitted
5/30 fitted
6/30 fitted
7/30 fitted
8/30 fitted
9/30 fitted
10/30 fitted
11/30 fitted
12/30 fitted
13/30 fitted
14/30 fitted
15/30 fitted
16/30 fitted
17/30 fitted
18/30 fitted
19/30 fitted
20/30 fitted
21/30 fitted
22/30 fitted
23/30 fitted
24/30 fitted
25/30 fitted
26/30 fitted
27/30 fitted
28/30 fitted
29/30 fitted
30/30 fitted


In [22]:
#res_1 = nn_model_1.predict(X_test)
#pearsonr(res_1, y_test)[0]

#0.1906351900630285  -1
#0.19793490067770353  -4
#0.19888346302274712  -10

In [23]:
"""
a = 0.2
print(pearsonr(res, y_test)[0])
print(pearsonr(res_1, y_test)[0])
pearsonr(a*res + (1-a)*res_1, y_test)[0]
"""

'\na = 0.2\nprint(pearsonr(res, y_test)[0])\nprint(pearsonr(res_1, y_test)[0])\npearsonr(a*res + (1-a)*res_1, y_test)[0]\n'

In [24]:
"""
best = 0
best_a = 0
for a in np.linspace(0, 1, 100):
    if pearsonr(a*res + (1-a)*res_1, y_test)[0] > best:
        best = pearsonr(a*res + (1-a)*res_1, y_test)[0]
        best_a = a
"""

'\nbest = 0\nbest_a = 0\nfor a in np.linspace(0, 1, 100):\n    if pearsonr(a*res + (1-a)*res_1, y_test)[0] > best:\n        best = pearsonr(a*res + (1-a)*res_1, y_test)[0]\n        best_a = a\n'

In [25]:
#best_a

In [26]:
#res_nn = 0.8*res + 0.2*res_1

In [27]:
#np.sum(res > 10000000000000)

In [28]:
from catboost import CatBoostRegressor

# Gradient-boosted model trained on GPU device 0 with 8000 iterations and logging
# silenced; it is fitted on the same frame and target as the neural ensembles.
model = CatBoostRegressor(
    iterations=8000,
    task_type='GPU',
    devices='0',
    silent=True,
)
model.fit(data, target)

In [29]:
#est_score(model)

In [30]:
#res_boost = model.predict(X_test)

In [31]:
#1/17

In [32]:
"""
a = 0.8
print(pearsonr(res, y_test)[0])
print(pearsonr(res_boost, y_test)[0])
pearsonr(a*res_boost+(1-a)*res, y_test)[0]
"""

'\na = 0.8\nprint(pearsonr(res, y_test)[0])\nprint(pearsonr(res_boost, y_test)[0])\npearsonr(a*res_boost+(1-a)*res, y_test)[0]\n'

In [33]:
import ubiquant

env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

for test_df, sample_prediction_df in iter_test:
    # Select the feature columns once and feed the same frame to all three models.
    test_features = test_df[features]

    pred_dnn = nn_model.predict(test_features)
    pred_dnn_1 = nn_model_1.predict(test_features)
    pred_boost = model.predict(test_features)

    # Blend: 80/20 between the two neural ensembles, then 75/25 between
    # the boosted model and that neural blend.
    pred_nn = 0.8*pred_dnn + 0.2*pred_dnn_1
    test_df['target'] = 0.75*pred_boost + 0.25*pred_nn

    env.predict(test_df[['row_id', 'target']])

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [34]:
# NOTE(review): bare numeric literal with no effect beyond being echoed as the cell
# output; presumably a value jotted down during experimentation — confirm or delete.
.902

0.902