In [1]:
%load_ext autoreload
%autoreload 2


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.utils import label, author_data, sample_submission, train, test, features_author, fill
from src.utils import cost, write_submission_global, test_model ,final_model

In [2]:
author_data["engagement_lr"] = np.load("aid/all_author_train_lr.npy")
author_data["engagement_lgbm"] = np.load("aid/all_author_train_lr.npy")

def data_deal(df):
    # Add mean engagement on historical data
    df = df.join(author_data.groupby("author").agg(
        {"engagement" : "mean",
         "engagement_lr" : "mean",
         "engagement_lgbm" : "mean"}
    ).fillna(fill).rename(columns={"engagement" : "engagement_"}), 
                 on="author")
    return df

In [3]:
# Add all answers with different model.
# With all author data
train_set = data_deal(train)[["followers", "engagement_", "engagement_lr", "engagement_lgbm"]]
test_set = data_deal(test)[["followers", "engagement_", "engagement_lr", "engagement_lgbm"]]

# With single author data
train_set["single_author_lr"] = np.load("aid/single_author_train_lr.npy")
train_set["single_author_lgbm"] = np.load("aid/single_author_train_lgbm.npy")
test_set["single_author_lr"] = np.load("aid/single_author_test_lr.npy")
test_set["single_author_lgbm"] = np.load("aid/single_author_test_lgbm.npy")

# With all train data
train_set["all_train_lr"] = np.load("aid/all_train_train_lr.npy")
train_set["all_train_lgbm"] = np.load("aid/all_train_train_lgbm.npy")
test_set["all_train_lr"] = np.load("aid/all_train_test_lr.npy")
test_set["all_train_lgbm"] = np.load("aid/all_train_test_lgbm.npy")

# Fillna
na_values = {
    "followers" : 50,
    "engagement_" : fill,
    "engagement_lr" : fill,
    "engagement_lgbm" : fill,
    "single_author_lr" : fill,
    "single_author_lgbm" : fill,
}
train_set = train_set.fillna(na_values)
test_set = test_set.fillna(na_values)

# Label set
label_set = train["engagement"]

#### Get features and split train/test

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Seperation train/test
X_train, X_test, Y_train, Y_test= train_test_split(train_set, 
                                                   label_set, 
                                                   train_size=0.75,
                                                   random_state=0,
                                                  )

# Normalization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test_data = scaler.transform(test_set)

#### Training

In [5]:
%%time
from sklearn.linear_model import LinearRegression
lr_model, cost_train, cost_test = test_model(LinearRegression(), X_train, X_test, Y_train, Y_test)
lr_model = final_model(LinearRegression(), train_set, label_set)
write_submission_global(lr_model, cost_train, cost_test, test_data, output_name="final_lr")

23456.502643222397 23189.91335007892
CPU times: user 63.3 ms, sys: 2.69 ms, total: 66 ms
Wall time: 23.3 ms


In [6]:
%%time
from lightgbm import LGBMRegressor
lgbm_model, cost_train, cost_test = test_model(LGBMRegressor(num_leaves=10, 
                                                             max_depth=5, 
                                                             learning_rate=0.07, 
                                                             objective="mae",
                                                             random_state=0), 
                                              X_train, X_test, Y_train, Y_test)

15306.283082578559 15156.263724193206
CPU times: user 1.1 s, sys: 6.74 ms, total: 1.11 s
Wall time: 96.8 ms


In [7]:
lgbm_model = final_model(LGBMRegressor(num_leaves=10, 
                                       max_depth=5, 
                                       learning_rate=0.07, 
                                       objective="mae",
                                       random_state=0), 
                         train_set, label_set)
write_submission_global(lgbm_model, cost_train, cost_test, test_data, output_name="final_lgbm")

In [8]:
from sklearn.neural_network import MLPRegressor
mlp_model = MLPRegressor(hidden_layer_sizes=(8, 8), activation='relu', 
             solver='adam', alpha=0.0001, batch_size='auto', 
             learning_rate='constant', learning_rate_init=0.001, 
             power_t=0.5, max_iter=1000, shuffle=True, random_state=0, 
             tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
             nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, 
             beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)

In [9]:
# Normalization
scaler_label = StandardScaler()
Y_train_ = scaler_label.fit_transform(Y_train.values.reshape(-1, 1)).reshape(-1)
Y_test_ = scaler_label.transform(Y_test.values.reshape(-1, 1)).reshape(-1)

In [10]:
# Training
mlp_model.fit(X_train, Y_train_)

MLPRegressor(hidden_layer_sizes=(8, 8), max_iter=1000, random_state=0)

In [11]:
def local_cost(x, y):
    return np.abs(x - y).mean()

cost_train, cost_test = local_cost(mlp_model.predict(X_train) * scaler_label.scale_ + scaler_label.mean_, 
           Y_train_* scaler_label.scale_ + scaler_label.mean_), local_cost(mlp_model.predict(X_test) * scaler_label.scale_ + scaler_label.mean_, 
           Y_test_* scaler_label.scale_ + scaler_label.mean_)
print(cost_train, cost_test)

19058.47759606769 19193.013359472483


In [12]:
mlp_model = final_model(MLPRegressor(hidden_layer_sizes=(8, 8), activation='relu', 
                         solver='adam', alpha=0.0001, batch_size='auto', 
                         learning_rate='constant', learning_rate_init=0.001, 
                         power_t=0.5, max_iter=1000, shuffle=True, random_state=0, 
                         tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                         nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, 
                         beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000),
                         train_set, label_set)

sample_submission["engagement"] = mlp_model.predict(test_data)*scaler_label.scale_ + scaler_label.mean_
sample_submission["engagement"] = sample_submission["engagement"] * (sample_submission["engagement"] >= 0)
sample_submission.to_csv("output/final_mlp_{:.2f}_{:.2f}.csv".format(cost_train, cost_test), index=False)
#write_submission_global(lgbm_model, cost_train, cost_test, test_data, output_name="final_mlp")

#### Simply use the optimization method to get the best weight among these results.

In [13]:
train_set_opt = data_deal(train)[["engagement_", "engagement_lr", "engagement_lgbm"]]
test_set_opt = data_deal(test)[["engagement_", "engagement_lr", "engagement_lgbm"]]

# With single author data
train_set_opt["single_author_lr"] = np.load("aid/single_author_train_lr.npy")
train_set_opt["single_author_lgbm"] = np.load("aid/single_author_train_lgbm.npy")
test_set_opt["single_author_lr"] = np.load("aid/single_author_test_lr.npy")
test_set_opt["single_author_lgbm"] = np.load("aid/single_author_test_lgbm.npy")

# With all train data
train_set_opt["all_train_lr"] = np.load("aid/all_train_train_lr.npy")
train_set_opt["all_train_lgbm"] = np.load("aid/all_train_train_lgbm.npy")
test_set_opt["all_train_lr"] = np.load("aid/all_train_test_lr.npy")
test_set_opt["all_train_lgbm"] = np.load("aid/all_train_test_lgbm.npy")

train_set_opt["intercept"] = fill
test_set_opt["intercept"] = fill

# Result above
train_set_opt["final_lr"] = lr_model.predict(train_set)
train_set_opt["final_lgbm"] = lgbm_model.predict(train_set)
train_set_opt["final_mlp"] = mlp_model.predict(train_set)
test_set_opt["final_lr"] = lr_model.predict(test_set)
test_set_opt["final_lgbm"] = lgbm_model.predict(test_set)
test_set_opt["final_mlp"] = mlp_model.predict(test_set)

# Fillna
na_values = {
    "engagement_" : fill,
    "engagement_lr" : fill,
    "engagement_lgbm" : fill,
    "single_author_lr" : fill,
    "single_author_lgbm" : fill,
}
train_set_opt = train_set_opt.fillna(na_values)
test_set_opt = test_set_opt.fillna(na_values)

# Label set
label_set = train["engagement"]

In [14]:
from scipy.optimize import minimize
for nb in [5, 11]:
    if nb == 5:
        features = [i for i in train_set_opt.columns if "lgbm" in i] + ["intercept"]
    elif nb == 11:
        features = train_set_opt.columns

    bnds = ((0, None),) * nb
    cons = ({'type': 'eq', 'fun': lambda x:  x.sum()-1,},)
    def fun(x):
        return np.abs((train_set_opt[features].values * x).sum(axis=1) - label_set).mean()

    x0 = np.ones(nb) / nb
    res = minimize(fun, x0, bounds=bnds, constraints=cons)
    print(res.x, res.fun)
    sample_submission["engagement"] = (test_set_opt[features].values * res.x).sum(axis=1)
    sample_submission.to_csv("output/final_opt_{}.csv".format(nb), index=False)

[4.49914819e-10 4.30702930e-09 6.83068927e-09 9.99999993e-01
 4.86674261e-15] 15228.5396429263
[1.91511598e-08 0.00000000e+00 0.00000000e+00 8.48545713e-08
 1.80877765e-08 0.00000000e+00 2.23366475e-08 8.89304834e-09
 0.00000000e+00 9.85157830e-01 1.48423429e-02] 15226.03995497098
