In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from pathlib import Path
import os
DATA_DIR=Path('../data/influence')
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data\influence\sample_submission.csv
data\influence\solution.csv
data\influence\test.csv
data\influence\train.csv


In [3]:
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
SOLUTION_PATH = DATA_DIR / "solution.csv"

SUBMISSION_PATH = Path("../submissions/v2")


PURPOSE_LABELS = {
    0: "BACKGROUND",
    1: "COMPARES_CONTRASTS",
    2: "EXTENSION",
    3: "FUTURE",
    4: "MOTIVATION",
    5: "USES"
}

INFLUENCE_LABELS = {
    0: "INCIDENTAL",
    1: "INFLUENTIAL"
}

TASKS={
    "purpose": ["citation_class_label", PURPOSE_LABELS],
    "influence": ["citation_influence_label", INFLUENCE_LABELS]
}

np.random.seed(250320)

In [4]:
df_train = pd.read_csv(TRAIN_PATH).merge(
    pd.read_csv(str(TRAIN_PATH).replace("influence", "purpose"))[["unique_id", "citation_class_label"]],
    on="unique_id"
)
df_train.columns

Index(['unique_id', 'core_id', 'citing_title', 'citing_author', 'cited_title',
       'cited_author', 'citation_context', 'citation_influence_label',
       'citation_class_label'],
      dtype='object')

In [5]:
df_test = pd.read_csv(TEST_PATH).merge(
    pd.read_csv(str(TEST_PATH).replace("influence", "purpose"))[["unique_id"]],
    on="unique_id"
)
df_test.columns

Index(['unique_id', 'core_id', 'citing_title', 'citing_author', 'cited_title',
       'cited_author', 'citation_context'],
      dtype='object')

In [6]:
df_solution = pd.read_csv(SOLUTION_PATH).merge(
    pd.read_csv(str(SOLUTION_PATH).replace("influence", "purpose")),
    on="unique_id"
)
df_solution.columns

Index(['unique_id', 'citation_influence_label', 'citation_class_label'], dtype='object')

In [7]:
df_test = df_test.merge(df_solution, on="unique_id")
df_test.shape

(1000, 9)

In [8]:
df = pd.concat([
    df_train.assign(split="train"),
    df_test.assign(split="test"),
], axis=0, sort=False).reset_index(drop=True).astype({task[0]: int for task in TASKS.values()})
df.head()

Unnamed: 0,unique_id,core_id,citing_title,citing_author,cited_title,cited_author,citation_context,citation_influence_label,citation_class_label,split
0,CC1,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Web search personalization with ontological us...,Sieg,They usually generate user models that describ...,0,5,train
1,CC2,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Exploring Scholarly Data with Rexplore,Osborne,The Computer Science Ontology (CSO)[3]is a lar...,0,0,train
2,CC3,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Klink-2: Integrating Multiple Web Sources to G...,Osborne,"In order to do so, we characterized all SN pub...",0,0,train
3,CC4,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Forecasting the Spreading of Technologies in R...,Osborne,"This API supports a number of applications, in...",1,0,train
4,CC5,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Supporting Springer Nature Editors by means of...,Osborne,It works according to three main steps:1) It r...,1,5,train


In [9]:
df.split.value_counts()

train    3000
test     1000
Name: split, dtype: int64

In [10]:
df.pivot_table(
    index="citation_class_label", columns="split", values="unique_id", aggfunc=len
).sort_values("train", ascending=False)

split,test,train
citation_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,546,1648
5,153,475
1,121,368
4,106,276
2,59,171
3,15,62


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [12]:
ct = ColumnTransformer([
    ("citing_tfidf", TfidfVectorizer(), "citing_title"),
    ("cited_tfidf", TfidfVectorizer(), "cited_title"),
    ("citation_context_tfidf", TfidfVectorizer(),"citation_context"),
])
ct.fit(df)
df_features = ct.transform(df)
df_features.shape

(4000, 23045)

In [13]:
from joblib import dump, load

In [14]:
# Save transformer
dump(ct, SUBMISSION_PATH / "ColumnTransformer.joblib")
dump(df_features, SUBMISSION_PATH / "df_features.joblib")

['submissions\\df_features.joblib']

In [15]:
df_features

<4000x23045 sparse matrix of type '<class 'numpy.float64'>'
	with 204727 stored elements in Compressed Sparse Row format>

In [16]:
df_features[[0, 1, 5]]

<3x23045 sparse matrix of type '<class 'numpy.float64'>'
	with 93 stored elements in Compressed Sparse Row format>

In [17]:
def generate_data(df, label_col, split="train"):
    split_idx = df[(df.split == split)].index.tolist()
    X = df_features[split_idx]
    y = df.iloc[split_idx][label_col]
    print(f"{split}: X={X.shape}, y={y.shape}")
    return X, y, split_idx

def submission_pipeline(model, df, df_features, task, model_key=None, to_dense=False):
    # Setup submission folder
    submission_folder = SUBMISSION_PATH / f"{model_key}_{task}"
    submission_folder.mkdir(parents=True, exist_ok=True)
    print(f"Generated folder: {submission_folder}")
    
    model_file = submission_folder / "model.joblib"
    submission_file=submission_folder / f"submission.csv"
    
    label_col, label_dict = TASKS[task]
    
    X_train, y_train, train_idx = generate_data(df, label_col, split="train")
    X_test, y_test, test_idx = generate_data(df, label_col, split="test")
    print(f"Training model")
    if to_dense:
        X_train = X_train.toarray()
        X_test = X_test.toarray()
    model.fit(X_train, y_train.astype(int))
    dump(model, model_file)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    print("Output label dist")
    print(pd.Series(y_test_pred).map(label_dict).value_counts())
    
    target_names = list(sorted(label_dict.values()))
    
    # Print reports 
    print("Training report")
    print(classification_report(y_train, y_train_pred, target_names=target_names))
    print("Test report")
    print(classification_report(y_test, y_test_pred, target_names=target_names))
    
    train_report = classification_report(y_train, y_train_pred, target_names=target_names, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, target_names=target_names, output_dict=True)
    
    print(f"Writing submission file: {submission_file}")
    df.iloc[test_idx][["unique_id"]].assign(**{label_col: y_test_pred}).to_csv(submission_file, index=False)
    return model, train_report, test_report

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV

In [19]:
model_configs = {
    "gbt": [GradientBoostingClassifier, dict()],
    "rf": [RandomForestClassifier, dict(n_jobs=-1)],
    "mlp-3": [MLPClassifier, dict(hidden_layer_sizes=(256,256,128))],
    "mlp": [MLPClassifier, dict()],
    "lr": [LogisticRegressionCV, dict(n_jobs=-1)]
}

DENSE_MODELS = {"mlp", "mlp-3"}

In [20]:
reports = {}
for model_key, model_params in model_configs.items():
    model_cls, model_kwargs = model_params
    to_dense=False
    if model_cls in DENSE_MODELS:
        to_dense=True
    print(model_key, model_params)
    for task in TASKS:
        model = model_cls(**model_kwargs)
        model, train_report, test_report = %time submission_pipeline(model, df, df_features, task, model_key=model_key, to_dense=to_dense)
        reports[(model_key, task)] = {"train": train_report, "test": test_report}

gbt [<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, {}]
Generated folder: submissions\gbt_purpose
train: X=(3000, 23045), y=(3000,)
test: X=(1000, 23045), y=(1000,)
Training model
Output label dist
BACKGROUND            936
USES                   23
COMPARES_CONTRASTS     19
MOTIVATION             11
FUTURE                  8
EXTENSION               3
dtype: int64
Training report
                    precision    recall  f1-score   support

        BACKGROUND       0.73      1.00      0.84      1648
COMPARES_CONTRASTS       1.00      0.45      0.62       368
         EXTENSION       0.99      0.68      0.81       171
            FUTURE       1.00      1.00      1.00        62
        MOTIVATION       1.00      0.47      0.64       276
              USES       0.97      0.56      0.71       475

          accuracy                           0.79      3000
         macro avg       0.95      0.69      0.77      3000
      weighted avg       0.85      0.79      0.78



Output label dist
BACKGROUND            962
USES                   23
COMPARES_CONTRASTS     13
MOTIVATION              2
dtype: int64
Training report
                    precision    recall  f1-score   support

        BACKGROUND       0.95      1.00      0.97      1648
COMPARES_CONTRASTS       0.99      0.92      0.96       368
         EXTENSION       0.98      0.94      0.96       171
            FUTURE       1.00      0.87      0.93        62
        MOTIVATION       0.99      0.94      0.96       276
              USES       1.00      0.93      0.97       475

          accuracy                           0.97      3000
         macro avg       0.98      0.93      0.96      3000
      weighted avg       0.97      0.97      0.97      3000

Test report
                    precision    recall  f1-score   support

        BACKGROUND       0.55      0.96      0.70       546
COMPARES_CONTRASTS       0.38      0.04      0.07       121
         EXTENSION       0.00      0.00      0.00    

  'precision', 'predicted', average, warn_for)


Output label dist
INCIDENTAL     537
INFLUENTIAL    463
dtype: int64
Training report
              precision    recall  f1-score   support

  INCIDENTAL       0.98      0.99      0.99      1568
 INFLUENTIAL       0.99      0.98      0.98      1432

    accuracy                           0.99      3000
   macro avg       0.99      0.98      0.99      3000
weighted avg       0.99      0.99      0.99      3000

Test report
              precision    recall  f1-score   support

  INCIDENTAL       0.45      0.53      0.49       457
 INFLUENTIAL       0.54      0.46      0.50       543

    accuracy                           0.49      1000
   macro avg       0.50      0.50      0.49      1000
weighted avg       0.50      0.49      0.49      1000

Writing submission file: submissions\rf_influence\submission.csv
Wall time: 555 ms
mlp-3 [<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>, {'hidden_layer_sizes': (256, 256, 128)}]
Generated folder: submissions\mlp-3_purpose
trai

  'precision', 'predicted', average, warn_for)


Output label dist
INFLUENTIAL    560
INCIDENTAL     440
dtype: int64
Training report
              precision    recall  f1-score   support

  INCIDENTAL       1.00      1.00      1.00      1568
 INFLUENTIAL       1.00      1.00      1.00      1432

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Test report
              precision    recall  f1-score   support

  INCIDENTAL       0.45      0.44      0.44       457
 INFLUENTIAL       0.54      0.56      0.55       543

    accuracy                           0.50      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.50      0.50      0.50      1000

Writing submission file: submissions\mlp-3_influence\submission.csv
Wall time: 1min 25s
mlp [<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>, {}]
Generated folder: submissions\mlp_purpose
train: X=(3000, 23045), y=(3000,)
test: 



Output label dist
BACKGROUND    999
USES            1
dtype: int64
Training report
                    precision    recall  f1-score   support

        BACKGROUND       0.63      1.00      0.78      1648
COMPARES_CONTRASTS       0.00      0.00      0.00       368
         EXTENSION       0.00      0.00      0.00       171
            FUTURE       0.00      0.00      0.00        62
        MOTIVATION       0.00      0.00      0.00       276
              USES       0.99      0.84      0.91       475

          accuracy                           0.68      3000
         macro avg       0.27      0.31      0.28      3000
      weighted avg       0.51      0.68      0.57      3000

Test report
                    precision    recall  f1-score   support

        BACKGROUND       0.55      1.00      0.71       546
COMPARES_CONTRASTS       0.00      0.00      0.00       121
         EXTENSION       0.00      0.00      0.00        59
            FUTURE       0.00      0.00      0.00        15
 

  'precision', 'predicted', average, warn_for)


Output label dist
INCIDENTAL    1000
dtype: int64
Training report
              precision    recall  f1-score   support

  INCIDENTAL       0.52      1.00      0.69      1568
 INFLUENTIAL       0.00      0.00      0.00      1432

    accuracy                           0.52      3000
   macro avg       0.26      0.50      0.34      3000
weighted avg       0.27      0.52      0.36      3000

Test report
              precision    recall  f1-score   support

  INCIDENTAL       0.46      1.00      0.63       457
 INFLUENTIAL       0.00      0.00      0.00       543

    accuracy                           0.46      1000
   macro avg       0.23      0.50      0.31      1000
weighted avg       0.21      0.46      0.29      1000

Writing submission file: submissions\lr_influence\submission.csv
Wall time: 2.02 s


  'precision', 'predicted', average, warn_for)


In [21]:
df_reports = pd.concat([
    pd.concat([
        pd.DataFrame(report[split]).T.assign(model=model, task=task, split=split).reset_index().rename(columns={"index": "label"})
        for split in report
    ])
    for (model, task), report in reports.items()
], axis=0, sort=False, ignore_index=True)

In [22]:
df_reports

Unnamed: 0,label,precision,recall,f1-score,support,model,task,split
0,BACKGROUND,0.730018,0.997573,0.843077,1648.000,gbt,purpose,train
1,COMPARES_CONTRASTS,1.000000,0.451087,0.621723,368.000,gbt,purpose,train
2,EXTENSION,0.991453,0.678363,0.805556,171.000,gbt,purpose,train
3,FUTURE,1.000000,1.000000,1.000000,62.000,gbt,purpose,train
4,MOTIVATION,1.000000,0.471014,0.640394,276.000,gbt,purpose,train
...,...,...,...,...,...,...,...,...
135,INCIDENTAL,0.457000,1.000000,0.627316,457.000,lr,influence,test
136,INFLUENTIAL,0.000000,0.000000,0.000000,543.000,lr,influence,test
137,accuracy,0.457000,0.457000,0.457000,0.457,lr,influence,test
138,macro avg,0.228500,0.500000,0.313658,1000.000,lr,influence,test


In [23]:
df_reports.loc[
    df_reports.label=="macro avg", 
    ["f1-score", "model", "task", "split"]
].pivot_table(index="model", columns=["task", "split"], values="f1-score", aggfunc="first")

task,influence,influence,purpose,purpose
split,test,train,test,train
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
gbt,0.536886,0.804022,0.147578,0.769884
lr,0.313658,0.343257,0.119964,0.280571
mlp,0.493785,1.0,0.185423,1.0
mlp-3,0.495649,1.0,0.176915,1.0
rf,0.491982,0.98529,0.136282,0.957873


In [24]:
df_t = df_reports.loc[
    (df_reports.label=="macro avg") & (df_reports.task=="purpose"), 
    ["f1-score", "model", "task", "split"]
].pivot_table(index="model", columns="split", values="f1-score", aggfunc="first").sort_values("test")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrr}
\toprule
split &   test &  train \\
model &        &        \\
\midrule
lr    &  0.120 &  0.281 \\
rf    &  0.136 &  0.958 \\
gbt   &  0.148 &  0.770 \\
mlp-3 &  0.177 &  1.000 \\
mlp   &  0.185 &  1.000 \\
\bottomrule
\end{tabular}



split,test,train
model,Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.119964,0.280571
rf,0.136282,0.957873
gbt,0.147578,0.769884
mlp-3,0.176915,1.0
mlp,0.185423,1.0


In [25]:
df_t = df_reports.loc[
    (df_reports.label=="macro avg") & (df_reports.task=="influence"), 
    ["f1-score", "model", "task", "split"]
].pivot_table(index="model", columns="split", values="f1-score", aggfunc="first").sort_values("test")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrr}
\toprule
split &   test &  train \\
model &        &        \\
\midrule
lr    &  0.314 &  0.343 \\
rf    &  0.492 &  0.985 \\
mlp   &  0.494 &  1.000 \\
mlp-3 &  0.496 &  1.000 \\
gbt   &  0.537 &  0.804 \\
\bottomrule
\end{tabular}



split,test,train
model,Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.313658,0.343257
rf,0.491982,0.98529
mlp,0.493785,1.0
mlp-3,0.495649,1.0
gbt,0.536886,0.804022


In [33]:
df_t = df_reports.loc[
    (df_reports.split=="test") & (df_reports.task=="purpose"), 
    ["label", "f1-score", "model",]
].pivot_table(index="model", columns="label", values="f1-score", aggfunc="first").sort_values("macro avg")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrrrrrrrrr}
\toprule
label &  BACKGROUND &  COMPARES\_CONTRASTS &  EXTENSION &  FUTURE &  MOTIVATION &   USES &  accuracy &  macro avg &  weighted avg \\
model &             &                     &            &         &             &        &           &            &               \\
\midrule
lr    &       0.707 &               0.000 &      0.000 &     0.0 &       0.000 &  0.013 &     0.547 &      0.120 &         0.388 \\
rf    &       0.698 &               0.075 &      0.000 &     0.0 &       0.000 &  0.045 &     0.535 &      0.136 &         0.397 \\
gbt   &       0.700 &               0.071 &      0.000 &     0.0 &       0.000 &  0.114 &     0.534 &      0.148 &         0.408 \\
mlp-3 &       0.663 &               0.175 &      0.000 &     0.0 &       0.059 &  0.165 &     0.492 &      0.177 &         0.414 \\
mlp   &       0.649 &               0.176 &      0.065 &     0.0 &       0.060 &  0.163 &     0.478 &      0.185 &         0.411 \\
\bottomrule
\end{tabular}



label,BACKGROUND,COMPARES_CONTRASTS,EXTENSION,FUTURE,MOTIVATION,USES,accuracy,macro avg,weighted avg
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
lr,0.706796,0.0,0.0,0.0,0.0,0.012987,0.547,0.119964,0.387898
rf,0.697613,0.074627,0.0,0.0,0.0,0.045455,0.535,0.136282,0.396881
gbt,0.700405,0.071429,0.0,0.0,0.0,0.113636,0.534,0.147578,0.40845
mlp-3,0.662757,0.174863,0.0,0.0,0.059259,0.164609,0.492,0.176915,0.41449
mlp,0.648729,0.176166,0.064516,0.0,0.059701,0.163424,0.478,0.185423,0.410661


In [34]:
df_t = df_reports.loc[
    (df_reports.split=="test") & (df_reports.task=="influence"), 
    ["label", "f1-score", "model",]
].pivot_table(index="model", columns="label", values="f1-score", aggfunc="first").sort_values("macro avg")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrrrrr}
\toprule
label &  INCIDENTAL &  INFLUENTIAL &  accuracy &  macro avg &  weighted avg \\
model &             &              &           &            &               \\
\midrule
lr    &       0.627 &        0.000 &     0.457 &      0.314 &         0.287 \\
rf    &       0.489 &        0.495 &     0.492 &      0.492 &         0.492 \\
mlp   &       0.469 &        0.519 &     0.495 &      0.494 &         0.496 \\
mlp-3 &       0.444 &        0.548 &     0.501 &      0.496 &         0.500 \\
gbt   &       0.499 &        0.575 &     0.540 &      0.537 &         0.540 \\
\bottomrule
\end{tabular}



label,INCIDENTAL,INFLUENTIAL,accuracy,macro avg,weighted avg
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,0.627316,0.0,0.457,0.313658,0.286684
rf,0.488934,0.49503,0.492,0.491982,0.492244
mlp,0.46898,0.518589,0.495,0.493785,0.495918
mlp-3,0.443701,0.547597,0.501,0.495649,0.500117
gbt,0.498911,0.574861,0.54,0.536886,0.540152


## Investigate model