In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn

# NN

In [2]:
def data_transform(df_ord, df_nod, type_ = None):
    """Build order-level features from an orders frame and its route-segment frame.

    Parameters
    ----------
    df_ord : pandas.DataFrame
        One row per order. Must contain ``Id``, ``running_time`` and
        ``route_distance_km``; ``completed_time`` must also be present when
        ``type_ == 'train'`` because it is dropped in that mode.
    df_nod : pandas.DataFrame
        Route segments keyed by ``Id``; must contain ``speed`` and ``distance``.
        Columns at positions 1, 2 and 3 are read positionally
        (assumed: start node, end node, segment distance -- TODO confirm).
    type_ : str, optional
        ``'train'`` drops ``completed_time`` from ``df_ord``; any other value
        leaves the columns as they are.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The feature frame (one row per order that has segments, inner join on
        ``Id``) and the segment frame with imputed ``speed`` and a new ``time``
        column.

    Raises
    ------
    ValueError
        If a speed is still missing after imputation, i.e. some order has no
        known speed on any of its segments.
    ZeroDivisionError
        If a segment speed is exactly zero.

    Notes
    -----
    Both input frames are modified in place; pass copies to keep the originals.
    """
    if type_ == 'train':
        df_ord.drop('completed_time', axis=1, inplace=True)

    df_ord['running_time'] = pd.to_datetime(df_ord['running_time'])
    df_ord['hours'] = df_ord['running_time'].dt.hour
    hours = df_ord['hours']
    # Distance from the 12/13 o'clock boundary: |h - 12| up to noon, |h - 13| after.
    df_ord['sym_hours_periods'] = (hours - 12).abs().where(hours <= 12, (hours - 13).abs())

    # Impute a missing segment speed with the mean speed of the same order.
    order_mean_speed = df_nod.groupby('Id')['speed'].transform('mean')
    df_nod['speed'] = df_nod['speed'].fillna(order_mean_speed)

    # Validate before any arithmetic. Test for the presence of NaNs directly:
    # summing the affected Ids misses Id 0 and Ids that cancel out.
    still_missing = df_nod['speed'].isnull()
    if still_missing.any():
        bad_ids = df_nod.loc[still_missing, 'Id'].unique().tolist()
        raise ValueError('u have nans: no speed available for Id(s) {}'.format(bad_ids))

    speed = df_nod['speed'].astype(float)
    if (speed == 0).any():
        # Fail loudly instead of letting an infinite travel time propagate.
        raise ZeroDivisionError('float division by zero: a segment has speed 0')

    # Travel time per segment; presumably metres and km/h giving seconds -- confirm units.
    df_nod['time'] = df_nod.iloc[:, 3].astype(float) / speed / 1000 * 3600

    # Per-order totals of travel time and segment distance.
    per_order = df_nod.groupby('Id')[['time', 'distance']].sum()
    df_ord = df_ord.merge(per_order, left_on='Id', right_on='Id')

    df_ord['route_distance_km'] = df_ord['route_distance_km']*1000
    df_ord['speed_from_node'] = df_ord['distance']/df_ord['time']

    # Count the values in column 1 that never occur in column 2 within an order.
    ind_dict = df_nod.groupby('Id').indices
    diff_nodes = {
        order_id: len(set(df_nod.iloc[rows, 1]) - set(df_nod.iloc[rows, 2]))
        for order_id, rows in ind_dict.items()
    }
    diff_nodes_df = pd.DataFrame(data=list(diff_nodes.items()), columns=['Id', 'diff_nodes'])
    df_ord = df_ord.merge(diff_nodes_df, left_on='Id', right_on='Id')

    df_ord['diff_dist'] = df_ord["route_distance_km"] - df_ord["distance"]

    df_ord = df_ord.drop('running_time', axis=1)

    return df_ord, df_nod

In [3]:
class LinearRegression(nn.Module):
    """Fully connected regressor with layer widths n_input_features -> 5 -> 4 -> 2 -> 1.

    The forward pass applies tanh after the first layer and ReLU after the
    second and third; the last layer is a plain linear output.
    """

    def __init__(self, n_input_features):
        super().__init__()
        # Reseed torch's global RNG so the initial weights are the same on every construction.
        torch.manual_seed(0)
        self.linear1 = nn.Linear(in_features=n_input_features, out_features=5)
        self.relu = nn.ReLU()
        self.lrelu = nn.LeakyReLU()  # registered on the module but not used in forward()
        self.tanh = nn.Tanh()
        self.linear2 = nn.Linear(in_features=5, out_features=4)
        self.linear3 = nn.Linear(in_features=4, out_features=2)
        self.linear4 = nn.Linear(in_features=2, out_features=1)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        hidden = self.tanh(self.linear1(x))
        hidden = self.relu(self.linear2(hidden))
        hidden = self.relu(self.linear3(hidden))
        return self.linear4(hidden)

In [4]:
# Raw test orders and their route segments.
val_ord = pd.read_csv('final_test.csv')
val_nod = pd.read_csv("nodes_test.csv")

# data_transform modifies the frames it receives, so hand it copies.
df_ord = val_ord.copy()
df_nod = val_nod.copy()
val_data, _ = data_transform(df_ord, df_nod, type_='test')

In [5]:
# Order identifiers, kept to pair with the predictions in the submission.
Id = val_data['Id'].to_numpy()

In [6]:
tr_df = pd.read_csv('train_data.csv', index_col=0)

# Discard training rows whose |diff_dist| exceeds 150.
dr_ind = tr_df[abs(tr_df['diff_dist'])>150].index
tr_df.drop(dr_ind, axis = 0, inplace = True)

sc = StandardScaler()

# Training features are picked by position; presumably the same seven features
# as the positions used for val_data below -- verify against the CSV header.
X = tr_df.iloc[:, [1, 3,4,6,7,8,9]].to_numpy()
# fit_transform fits `sc` exactly as before and makes X_norm the scaled matrix
# (a bare `sc.fit(X)` returns the scaler object itself, not the data).
X_norm = sc.fit_transform(X)

# Scale the test features with the statistics learned from the training rows.
X_v = val_data.iloc[:, [1,2,3,5,6,7,8]].to_numpy()
X_val = sc.transform(X_v)

# The network works on float32 tensors.
X_val = torch.from_numpy(X_val.astype(np.float32))

In [7]:
# Width of the feature matrix, i.e. the input size of the network.
n_features = X_val.size(1)

In [8]:
model = LinearRegression(n_features)
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine;
# the prediction cell converts the output with .numpy(), which needs CPU tensors.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load('best_model.pt', map_location='cpu'))
# Switch to inference mode; as the last expression this also displays the architecture.
model.eval()

LinearRegression(
  (linear1): Linear(in_features=7, out_features=5, bias=True)
  (relu): ReLU()
  (lrelu): LeakyReLU(negative_slope=0.01)
  (tanh): Tanh()
  (linear2): Linear(in_features=5, out_features=4, bias=True)
  (linear3): Linear(in_features=4, out_features=2, bias=True)
  (linear4): Linear(in_features=2, out_features=1, bias=True)
)

In [9]:
# Run the network and detach the result from autograd before converting to NumPy.
pred = model(X_val).detach().numpy()
# Two-column array: order Id next to its prediction.
sub = np.column_stack((Id, pred))

In [10]:
# Wrap the stacked array in a frame with the submission column names.
submission = pd.DataFrame(data=sub, columns=['Id', 'Predicted'])

In [11]:
# Stacking Ids with float predictions turned them into floats; restore int64.
submission = submission.astype({'Id': np.int64})

In [12]:
# Display the assembled submission table as the cell output.
submission

Unnamed: 0,Id,Predicted
0,6198,573.447815
1,6417,753.169861
2,7054,611.595886
3,9628,778.478699
4,10283,840.138000
...,...,...
995,525706,398.849365
996,526604,500.158508
997,527213,532.195618
998,527520,208.558334


In [13]:
# Write the submission with a header row and without the frame index.
submission.to_csv('sub_nn_v3.csv', index=False, header=True)

# Catboost

In [1]:
#!pip install catboost
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

data = pd.read_csv("train_data.csv")
data["dist_delta"] = data["route_distance_km"] - data["distance"]
# Keep only rows where the two distance figures differ by less than 200.
data = data[abs(data['dist_delta'])<200]
X_train = data.loc[:, ["route_distance_km", "distance", "hours", "time", "speed_from_node", "diff_nodes", "dist_delta"]]
y_train = data.loc[:, "delta_time"]
# Columns CatBoost should treat as categorical rather than numeric.
CAT_COLS = ["hours", "diff_nodes"]

# NOTE(review): fit() below receives no eval_set, so early_stopping_rounds
# presumably never triggers and all 5000 iterations run -- confirm, and pass a
# validation set if early stopping is actually wanted.
model = CatBoostRegressor(iterations=5000,
                          early_stopping_rounds=200,
                          depth=6,
                          loss_function="RMSE",
                          cat_features= CAT_COLS,
                          random_state= 13,
                          l2_leaf_reg = 3,
                          learning_rate=0.03,
                          eval_metric="RMSE",
                          # Log every 500th iteration instead of all 5000.
                          verbose=500)

model.fit(X_train, y_train)

0:	learn: 201.8719146	total: 58.1ms	remaining: 4m 50s
1:	learn: 198.1026869	total: 59.9ms	remaining: 2m 29s
2:	learn: 194.4121573	total: 61.9ms	remaining: 1m 43s
3:	learn: 190.7025127	total: 63.2ms	remaining: 1m 18s
4:	learn: 187.3647039	total: 64.4ms	remaining: 1m 4s
5:	learn: 184.0035321	total: 65.4ms	remaining: 54.4s
6:	learn: 181.0021449	total: 66.6ms	remaining: 47.5s
7:	learn: 177.8228573	total: 67.5ms	remaining: 42.1s
8:	learn: 174.9135062	total: 71.7ms	remaining: 39.8s
9:	learn: 171.9591612	total: 72.9ms	remaining: 36.4s
10:	learn: 169.3967136	total: 74.1ms	remaining: 33.6s
11:	learn: 166.8880187	total: 75.5ms	remaining: 31.4s
12:	learn: 164.2364304	total: 76.4ms	remaining: 29.3s
13:	learn: 161.6903258	total: 77.6ms	remaining: 27.6s
14:	learn: 159.5166481	total: 79.1ms	remaining: 26.3s
15:	learn: 157.2679462	total: 80.1ms	remaining: 24.9s
16:	learn: 154.9660682	total: 81.2ms	remaining: 23.8s
17:	learn: 152.7558364	total: 82ms	remaining: 22.7s
18:	learn: 150.7120262	total: 82.9ms

<catboost.core.CatBoostRegressor at 0x1212995b0>

In [2]:
def data_transform(df_ord, df_nod, type_ = None):
    """Build order-level features from an orders frame and its route-segment frame.

    Parameters
    ----------
    df_ord : pandas.DataFrame
        One row per order. Must contain ``Id``, ``running_time`` and
        ``route_distance_km``; ``completed_time`` must also be present when
        ``type_ == 'train'`` because it is dropped in that mode.
    df_nod : pandas.DataFrame
        Route segments keyed by ``Id``; must contain ``speed`` and ``distance``.
        Columns at positions 1, 2 and 3 are read positionally
        (assumed: start node, end node, segment distance -- TODO confirm).
    type_ : str, optional
        ``'train'`` drops ``completed_time`` from ``df_ord``; any other value
        leaves the columns as they are.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The feature frame (one row per order that has segments, inner join on
        ``Id``) and the segment frame with imputed ``speed`` and a new ``time``
        column.

    Raises
    ------
    ValueError
        If a speed is still missing after imputation, i.e. some order has no
        known speed on any of its segments.
    ZeroDivisionError
        If a segment speed is exactly zero.

    Notes
    -----
    Both input frames are modified in place; pass copies to keep the originals.
    """
    if type_ == 'train':
        df_ord.drop('completed_time', axis=1, inplace=True)

    df_ord['running_time'] = pd.to_datetime(df_ord['running_time'])
    df_ord['hours'] = df_ord['running_time'].dt.hour
    hours = df_ord['hours']
    # Distance from the 12/13 o'clock boundary: |h - 12| up to noon, |h - 13| after.
    df_ord['sym_hours_periods'] = (hours - 12).abs().where(hours <= 12, (hours - 13).abs())

    # Impute a missing segment speed with the mean speed of the same order.
    order_mean_speed = df_nod.groupby('Id')['speed'].transform('mean')
    df_nod['speed'] = df_nod['speed'].fillna(order_mean_speed)

    # Validate before any arithmetic. Test for the presence of NaNs directly:
    # summing the affected Ids misses Id 0 and Ids that cancel out.
    still_missing = df_nod['speed'].isnull()
    if still_missing.any():
        bad_ids = df_nod.loc[still_missing, 'Id'].unique().tolist()
        raise ValueError('u have nans: no speed available for Id(s) {}'.format(bad_ids))

    speed = df_nod['speed'].astype(float)
    if (speed == 0).any():
        # Fail loudly instead of letting an infinite travel time propagate.
        raise ZeroDivisionError('float division by zero: a segment has speed 0')

    # Travel time per segment; presumably metres and km/h giving seconds -- confirm units.
    df_nod['time'] = df_nod.iloc[:, 3].astype(float) / speed / 1000 * 3600

    # Per-order totals of travel time and segment distance.
    per_order = df_nod.groupby('Id')[['time', 'distance']].sum()
    df_ord = df_ord.merge(per_order, left_on='Id', right_on='Id')

    df_ord['route_distance_km'] = df_ord['route_distance_km']*1000
    df_ord['speed_from_node'] = df_ord['distance']/df_ord['time']

    # Count the values in column 1 that never occur in column 2 within an order.
    ind_dict = df_nod.groupby('Id').indices
    diff_nodes = {
        order_id: len(set(df_nod.iloc[rows, 1]) - set(df_nod.iloc[rows, 2]))
        for order_id, rows in ind_dict.items()
    }
    diff_nodes_df = pd.DataFrame(data=list(diff_nodes.items()), columns=['Id', 'diff_nodes'])
    df_ord = df_ord.merge(diff_nodes_df, left_on='Id', right_on='Id')

    df_ord['diff_dist'] = df_ord["route_distance_km"] - df_ord["distance"]

    df_ord = df_ord.drop('running_time', axis=1)

    return df_ord, df_nod

In [3]:
# Raw test orders and their route segments.
val_ord = pd.read_csv('final_test.csv')
val_nod = pd.read_csv("nodes_test.csv")

# data_transform modifies the frames it receives, so hand it copies.
df_ord = val_ord.copy()
df_nod = val_nod.copy()
val_data, _ = data_transform(df_ord, df_nod, type_='test')

In [4]:
# Order identifiers, kept to pair with the predictions in the submission.
Id = val_data['Id'].to_numpy()

In [5]:
# Same derived column and feature order as the training matrix.
val_data["dist_delta"] = val_data["route_distance_km"].sub(val_data["distance"])
X_val = val_data[[
    "route_distance_km",
    "distance",
    "hours",
    "time",
    "speed_from_node",
    "diff_nodes",
    "dist_delta",
]]
CAT_COLS = ["hours", "diff_nodes"]

In [6]:
pred = model.predict(X_val)
# Two-column array: order Id next to its prediction.
sub = np.column_stack((Id, pred))

submission = pd.DataFrame(data=sub, columns=['Id', 'Predicted'])
# Stacking Ids with float predictions turned them into floats; restore int64.
submission = submission.astype({'Id': np.int64})

In [7]:
# Write the submission with a header row and without the frame index.
submission.to_csv('sub_catboost_v1.csv', index=False, header=True)

# Linear

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [11]:
def data_transform(df_ord, df_nod, type_ = None):
    """Build order-level features from an orders frame and its route-segment frame.

    Parameters
    ----------
    df_ord : pandas.DataFrame
        One row per order. Must contain ``Id``, ``running_time`` and
        ``route_distance_km``; ``completed_time`` must also be present when
        ``type_ == 'train'`` because it is dropped in that mode.
    df_nod : pandas.DataFrame
        Route segments keyed by ``Id``; must contain ``speed`` and ``distance``.
        Columns at positions 1, 2 and 3 are read positionally
        (assumed: start node, end node, segment distance -- TODO confirm).
    type_ : str, optional
        ``'train'`` drops ``completed_time`` from ``df_ord``; any other value
        leaves the columns as they are.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The feature frame (one row per order that has segments, inner join on
        ``Id``) and the segment frame with imputed ``speed`` and a new ``time``
        column.

    Raises
    ------
    ValueError
        If a speed is still missing after imputation, i.e. some order has no
        known speed on any of its segments.
    ZeroDivisionError
        If a segment speed is exactly zero.

    Notes
    -----
    Both input frames are modified in place; pass copies to keep the originals.
    """
    if type_ == 'train':
        df_ord.drop('completed_time', axis=1, inplace=True)

    df_ord['running_time'] = pd.to_datetime(df_ord['running_time'])
    df_ord['hours'] = df_ord['running_time'].dt.hour
    hours = df_ord['hours']
    # Distance from the 12/13 o'clock boundary: |h - 12| up to noon, |h - 13| after.
    df_ord['sym_hours_periods'] = (hours - 12).abs().where(hours <= 12, (hours - 13).abs())

    # Impute a missing segment speed with the mean speed of the same order.
    order_mean_speed = df_nod.groupby('Id')['speed'].transform('mean')
    df_nod['speed'] = df_nod['speed'].fillna(order_mean_speed)

    # Validate before any arithmetic. Test for the presence of NaNs directly:
    # summing the affected Ids misses Id 0 and Ids that cancel out.
    still_missing = df_nod['speed'].isnull()
    if still_missing.any():
        bad_ids = df_nod.loc[still_missing, 'Id'].unique().tolist()
        raise ValueError('u have nans: no speed available for Id(s) {}'.format(bad_ids))

    speed = df_nod['speed'].astype(float)
    if (speed == 0).any():
        # Fail loudly instead of letting an infinite travel time propagate.
        raise ZeroDivisionError('float division by zero: a segment has speed 0')

    # Travel time per segment; presumably metres and km/h giving seconds -- confirm units.
    df_nod['time'] = df_nod.iloc[:, 3].astype(float) / speed / 1000 * 3600

    # Per-order totals of travel time and segment distance.
    per_order = df_nod.groupby('Id')[['time', 'distance']].sum()
    df_ord = df_ord.merge(per_order, left_on='Id', right_on='Id')

    df_ord['route_distance_km'] = df_ord['route_distance_km']*1000
    df_ord['speed_from_node'] = df_ord['distance']/df_ord['time']

    # Count the values in column 1 that never occur in column 2 within an order.
    ind_dict = df_nod.groupby('Id').indices
    diff_nodes = {
        order_id: len(set(df_nod.iloc[rows, 1]) - set(df_nod.iloc[rows, 2]))
        for order_id, rows in ind_dict.items()
    }
    diff_nodes_df = pd.DataFrame(data=list(diff_nodes.items()), columns=['Id', 'diff_nodes'])
    df_ord = df_ord.merge(diff_nodes_df, left_on='Id', right_on='Id')

    df_ord['diff_dist'] = df_ord["route_distance_km"] - df_ord["distance"]

    df_ord = df_ord.drop('running_time', axis=1)

    return df_ord, df_nod

In [36]:
# Raw test orders and their route segments.
val_ord = pd.read_csv('final_test.csv')
val_nod = pd.read_csv("nodes_test.csv")

# data_transform modifies the frames it receives, so hand it copies.
df_ord = val_ord.copy()
df_nod = val_nod.copy()
val_data, _ = data_transform(df_ord, df_nod, type_='test')

In [37]:
# Order identifiers, kept to pair with the predictions in the submission.
Id = val_data['Id'].to_numpy()

In [38]:
# Preview the columns at positions 2, 3, 4 and 7 of the transformed test frame;
# the same positions are later taken as the test feature matrix.
val_data.iloc[:, [2,3,4,7]]

Unnamed: 0,hours,sym_hours_periods,time,diff_nodes
0,3,9,391.092968,2
1,3,9,524.819710,2
2,3,9,387.661475,3
3,4,8,561.546514,1
4,4,8,654.753300,1
...,...,...,...,...
995,18,5,286.361660,1
996,18,5,356.409807,1
997,18,5,360.807126,2
998,18,5,76.890623,1


In [39]:
tr_df = pd.read_csv('train_data.csv', index_col=0)

sc = StandardScaler()

# Discard training rows whose |diff_dist| exceeds 150.
dr_ind = tr_df.index[tr_df['diff_dist'].abs() > 150]
tr_df.drop(index=dr_ind, inplace=True)

# Target is the column at position 2; features are taken by position as well.
y = tr_df.iloc[:, 2].to_numpy()
X = tr_df.iloc[:, [3, 4, 5, 8]].to_numpy()
X_norm = sc.fit_transform(X)

# Scale the test features with the statistics learned from the training rows.
X_v = val_data.iloc[:, [2, 3, 4, 7]].to_numpy()
X_val = sc.transform(X_v)


In [40]:
# Give both matrices a constant column.
# NOTE(review): sklearn's LinearRegression fits an intercept by default, so this
# column looks redundant -- confirm before relying on the coefficients.
X_val, X_norm = sm.add_constant(X_val), sm.add_constant(X_norm)

X_val.shape, X_norm.shape

((1000, 5), (796, 5))

In [41]:
# Fit ordinary least squares on the scaled training features.
reg = LinearRegression()
reg.fit(X_norm, y)

pred = reg.predict(X_val)
# Two-column array: order Id next to its prediction.
sub = np.column_stack((Id, pred))

submission = pd.DataFrame(data=sub, columns=['Id', 'Predicted'])
# Stacking Ids with float predictions turned them into floats; restore int64.
submission = submission.astype({'Id': np.int64})

In [43]:
# Write the submission with a header row and without the frame index.
submission.to_csv('sub_linear_v1.csv', index=False, header=True)