In [1]:
import sys
sys.path.append("../processing/")

In [2]:
import gzip
import json
import os

import numpy as np
from joblib import Parallel, delayed
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

from _config import Config
from prediction_next_reply import load_data, make_paired_dataset
from prediction_prefix import sanitize_x

In [3]:
def get_GB_model(n_folds, n_jobs, n_estimators_grid=None):
    """Build an unfitted grid search over a LightGBM classification pipeline.

    The pipeline mean-imputes NaN values, standardizes every feature, and
    fits an ``LGBMClassifier``. The grid search tunes only the classifier's
    ``n_estimators`` and scores candidates by accuracy on shuffled,
    stratified folds.

    Args:
        n_folds: Number of stratified cross-validation folds.
        n_jobs: Number of parallel jobs handed to ``GridSearchCV``.
        n_estimators_grid: Optional iterable of candidate ``n_estimators``
            values. ``None`` (the default) selects the full ten-value grid
            from 10 to 10000; pass a short list such as ``[10, 25]`` for a
            quick smoke run instead of editing this function.

    Returns:
        An unfitted ``GridSearchCV``. ``refit=True`` is set, so after
        ``fit`` the search object predicts with the best pipeline.

    Raises:
        ValueError: If ``n_estimators_grid`` is given but contains no values.
    """
    if n_estimators_grid is None:
        n_estimators_grid = [10, 25, 50, 100, 500, 1000, 2000, 3000, 5000, 10000]

    # Materialize once so generators are accepted and emptiness can be checked
    # here, rather than surfacing later from inside the grid search.
    n_estimators_grid = list(n_estimators_grid)
    if not n_estimators_grid:
        raise ValueError("n_estimators_grid must contain at least one value")

    grid = {"clf__n_estimators": n_estimators_grid}

    pipe = Pipeline([
        ("imp", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("std", StandardScaler(copy=True, with_mean=True, with_std=True)),
        ("clf", LGBMClassifier(random_state=0))
    ])

    # Fixed seed keeps the fold assignment identical across runs and models.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    model = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        cv=skf,
        n_jobs=n_jobs,
        scoring="accuracy",
        refit=True
    )

    return model

In [4]:
# Cross-validation settings forwarded to get_GB_model for both domain models.
cv_n_folds, cv_n_jobs = 10, 10

# Names of the feature sets requested from load_data for each dataset.
feature_sets = [
    "conversation_state", "user_info", "alignments",
    "follow_di", "follow_ud",
    "reply_di", "reply_ud",
    "dyad_up", "dyad_ur",
    "embeddedness_all", "embeddedness_toxicity",
    "embeddedness_follow", "embeddedness_reply",
    "tree",
]

In [5]:
# Load both domains with the same feature sets: "news" first, then "midterms".
(X_nw, y_nw, meta_nw, fnames_nw), (X_mt, y_mt, meta_mt, fnames_mt) = (
    load_data(domain, feature_sets) for domain in ("news", "midterms")
)

# Both loads must report identical feature names; a single shared list is
# kept under `fnames` for the column lookups that follow.
assert fnames_nw == fnames_mt
fnames = fnames_nw

print(X_nw.shape, X_mt.shape)

(193040, 273) (100286, 273)


In [6]:
# Run sanitize_x per domain and keep only the feature names it returns.
# NOTE(review): the first and third return values are discarded, so the
# matrices below are column-filtered but otherwise left as loaded — confirm
# that this is intended.
_, fnames_new_nw, _ = sanitize_x(X_nw, fnames_nw)
_, fnames_new_mt, _ = sanitize_x(X_mt, fnames_mt)

# Convert each returned name to a tuple so the names can be put into sets.
fnames_new_nw = list(map(tuple, fnames_new_nw))
fnames_new_mt = list(map(tuple, fnames_new_mt))

# Features surviving in both domains, as ascending column positions in the
# original (pre-sanitize) feature list.
fnames_int = set(fnames_new_nw).intersection(fnames_new_mt)
fnames_int_idxs = sorted(map(fnames.index, fnames_int))

# Keep only the shared columns.
# NOTE: X_nw / X_mt are overwritten here while fnames_nw / fnames_mt / fnames
# are not, so re-running this cell without reloading the data would pass the
# already-filtered matrices together with the unfiltered name lists.
X_nw, X_mt = X_nw[:, fnames_int_idxs], X_mt[:, fnames_int_idxs]

assert X_nw.shape[1] == X_mt.shape[1]

print(X_nw.shape, X_mt.shape)

(193040, 269) (100286, 269)


In [7]:
# Turn each domain's (X, y, meta) triple into its paired counterpart.
X_nw_pairs, y_nw_pairs, meta_nw_pairs = make_paired_dataset(
    X_nw, y_nw, meta_nw
)
X_mt_pairs, y_mt_pairs, meta_mt_pairs = make_paired_dataset(
    X_mt, y_mt, meta_mt
)

In [8]:
# Hold out 20% of each paired dataset for testing; the fixed random_state
# makes both splits reproducible.
(
    X_nw_pairs_train,
    X_nw_pairs_test,
    y_nw_pairs_train,
    y_nw_pairs_test,
) = train_test_split(X_nw_pairs, y_nw_pairs, test_size=0.2, random_state=0)

(
    X_mt_pairs_train,
    X_mt_pairs_test,
    y_mt_pairs_train,
    y_mt_pairs_test,
) = train_test_split(X_mt_pairs, y_mt_pairs, test_size=0.2, random_state=0)

In [9]:
# News model: grid-search the gradient-boosting pipeline on the news
# training pairs. The fit call is the cell's last expression, so the search
# object's repr is shown as output.
model_nw = get_GB_model(cv_n_folds, cv_n_jobs)
model_nw.fit(X=X_nw_pairs_train, y=y_nw_pairs_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('imp', SimpleImputer()),
                                       ('std', StandardScaler()),
                                       ('clf',
                                        LGBMClassifier(random_state=0))]),
             n_jobs=10,
             param_grid={'clf__n_estimators': [10, 25, 50, 100, 500, 1000, 2000,
                                               3000, 5000, 10000]},
             scoring='accuracy')

In [10]:
# Midterms model: same search configuration as the news model, fitted on the
# midterms training pairs. The fit call is the cell's last expression, so the
# search object's repr is shown as output.
model_mt = get_GB_model(cv_n_folds, cv_n_jobs)
model_mt.fit(X=X_mt_pairs_train, y=y_mt_pairs_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('imp', SimpleImputer()),
                                       ('std', StandardScaler()),
                                       ('clf',
                                        LGBMClassifier(random_state=0))]),
             n_jobs=10,
             param_grid={'clf__n_estimators': [10, 25, 50, 100, 500, 1000, 2000,
                                               3000, 5000, 10000]},
             scoring='accuracy')

In [11]:
# Accuracy of each domain's model on its own training split, its own held-out
# split, and the other domain's held-out split. Test keys read
# "<training domain>_<evaluation domain>_test".
# accuracy_score is called as (y_true, y_pred), matching its documented
# signature; the resulting values are the same as with the arguments swapped.
res = {
    # news
    "news_train": accuracy_score(y_nw_pairs_train, model_nw.predict(X_nw_pairs_train)),
    "news_news_test": accuracy_score(y_nw_pairs_test, model_nw.predict(X_nw_pairs_test)),
    "news_midterms_test": accuracy_score(y_mt_pairs_test, model_nw.predict(X_mt_pairs_test)),
    # midterms
    "midterms_train": accuracy_score(y_mt_pairs_train, model_mt.predict(X_mt_pairs_train)),
    "midterms_midterms_test": accuracy_score(y_mt_pairs_test, model_mt.predict(X_mt_pairs_test)),
    "midterms_news_test": accuracy_score(y_nw_pairs_test, model_mt.predict(X_nw_pairs_test))
}

In [12]:
# write results to JSON (gzip-compressed text)
out_fpath = f"{Config().modeling_dir}/next_reply/runs/domain_transfer.json.gz"

# Create the run directory if it is missing, so a fresh checkout does not
# fail here after both grid searches have already finished.
os.makedirs(os.path.dirname(out_fpath), exist_ok=True)

# Explicit encoding keeps the written bytes independent of the locale default.
with gzip.open(out_fpath, "wt", encoding="utf-8") as fout:
    json.dump(res, fout)

In [13]:
# Show the accuracy dictionary inline as this cell's output.
res

{'news_train': 0.8507304185661003,
 'news_news_test': 0.7139970990468296,
 'news_midterms_test': 0.7361651211486688,
 'midterms_train': 0.7952335842847884,
 'midterms_midterms_test': 0.7425466148170307,
 'midterms_news_test': 0.705605055946954}