In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from pathlib import Path
import os
DATA_DIR=Path('../data/influence')
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data\influence\sample_submission.csv
data\influence\solution.csv
data\influence\test.csv
data\influence\train.csv


In [3]:
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
SOLUTION_PATH = DATA_DIR / "solution.csv"

SUBMISSION_PATH = Path("../submissions/v1")
SUBMISSION_PATH.mkdir(parents=True, exist_ok=True)


PURPOSE_LABELS = {
    0: "BACKGROUND",
    1: "COMPARES_CONTRASTS",
    2: "EXTENSION",
    3: "FUTURE",
    4: "MOTIVATION",
    5: "USES"
}

INFLUENCE_LABELS = {
    0: "INCIDENTAL",
    1: "INFLUENTIAL"
}

TASKS={
    "purpose": ["citation_class_label", PURPOSE_LABELS],
    "influence": ["citation_influence_label", INFLUENCE_LABELS]
}

np.random.seed(250320)

In [4]:
df_train = pd.read_csv(TRAIN_PATH).merge(
    pd.read_csv(str(TRAIN_PATH).replace("influence", "purpose"))[["unique_id", "citation_class_label"]],
    on="unique_id"
)
df_train.columns

Index(['unique_id', 'core_id', 'citing_title', 'citing_author', 'cited_title',
       'cited_author', 'citation_context', 'citation_influence_label',
       'citation_class_label'],
      dtype='object')

In [5]:
df_test = pd.read_csv(TEST_PATH).merge(
    pd.read_csv(str(TEST_PATH).replace("influence", "purpose"))[["unique_id"]],
    on="unique_id"
)
df_test.columns

Index(['unique_id', 'core_id', 'citing_title', 'citing_author', 'cited_title',
       'cited_author', 'citation_context'],
      dtype='object')

In [6]:
df_solution = pd.read_csv(SOLUTION_PATH).merge(
    pd.read_csv(str(SOLUTION_PATH).replace("influence", "purpose")),
    on="unique_id"
)
df_solution.columns

Index(['unique_id', 'citation_influence_label', 'citation_class_label'], dtype='object')

In [7]:
df_test = df_test.merge(df_solution, on="unique_id")
df_test.shape

(1000, 9)

In [8]:
df = pd.concat([
    df_train.assign(split="train"),
    df_test.assign(split="test"),
], axis=0, sort=False).reset_index(drop=True).astype({task[0]: int for task in TASKS.values()})
df.head()

Unnamed: 0,unique_id,core_id,citing_title,citing_author,cited_title,cited_author,citation_context,citation_influence_label,citation_class_label,split
0,CC1,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Web search personalization with ontological us...,Sieg,They usually generate user models that describ...,0,5,train
1,CC2,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Exploring Scholarly Data with Rexplore,Osborne,The Computer Science Ontology (CSO)[3]is a lar...,0,0,train
2,CC3,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Klink-2: Integrating Multiple Web Sources to G...,Osborne,"In order to do so, we characterized all SN pub...",0,0,train
3,CC4,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Forecasting the Spreading of Technologies in R...,Osborne,"This API supports a number of applications, in...",1,0,train
4,CC5,158977742,Ontology-Based Recommendation of Editorial Pro...,Thiviyan Thanapalasingam,Supporting Springer Nature Editors by means of...,Osborne,It works according to three main steps:1) It r...,1,5,train


In [9]:
df.split.value_counts()

train    3000
test     1000
Name: split, dtype: int64

In [10]:
df.pivot_table(
    index="citation_class_label", columns="split", values="unique_id", aggfunc=len
).sort_values("train", ascending=False)

split,test,train
citation_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,546,1648
5,153,475
1,121,368
4,106,276
2,59,171
3,15,62


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [12]:
ct = ColumnTransformer([
    #("citing_tfidf", TfidfVectorizer(), "citing_title"),
    #("cited_tfidf", TfidfVectorizer(), "cited_title"),
    ("citation_context_tfidf", TfidfVectorizer(),"citation_context"),
])
ct.fit(df)
df_features = ct.transform(df)
df_features.shape

(4000, 13480)

In [13]:
from joblib import dump, load

In [14]:
# Save transformer
dump(ct, SUBMISSION_PATH / "ColumnTransformer.joblib")
dump(df_features, SUBMISSION_PATH / "df_features.joblib")

['submissions\\v2\\df_features.joblib']

In [15]:
df_features

<4000x13480 sparse matrix of type '<class 'numpy.float64'>'
	with 118873 stored elements in Compressed Sparse Row format>

In [16]:
df_features[[0, 1, 5]]

<3x13480 sparse matrix of type '<class 'numpy.float64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [17]:
def generate_data(df, label_col, split="train"):
    split_idx = df[(df.split == split)].index.tolist()
    X = df_features[split_idx]
    y = df.iloc[split_idx][label_col]
    print(f"{split}: X={X.shape}, y={y.shape}")
    return X, y, split_idx

def submission_pipeline(model, df, df_features, task, model_key=None, to_dense=False):
    # Setup submission folder
    submission_folder = SUBMISSION_PATH / f"{model_key}_{task}"
    submission_folder.mkdir(parents=True, exist_ok=True)
    print(f"Generated folder: {submission_folder}")
    
    model_file = submission_folder / "model.joblib"
    submission_file=submission_folder / f"submission.csv"
    
    label_col, label_dict = TASKS[task]
    
    X_train, y_train, train_idx = generate_data(df, label_col, split="train")
    X_test, y_test, test_idx = generate_data(df, label_col, split="test")
    print(f"Training model")
    if to_dense:
        X_train = X_train.toarray()
        X_test = X_test.toarray()
    model.fit(X_train, y_train.astype(int))
    dump(model, model_file)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    print("Output label dist")
    print(pd.Series(y_test_pred).map(label_dict).value_counts())
    
    target_names = list(sorted(label_dict.values()))
    
    # Print reports 
    print("Training report")
    print(classification_report(y_train, y_train_pred, target_names=target_names))
    print("Test report")
    print(classification_report(y_test, y_test_pred, target_names=target_names))
    
    train_report = classification_report(y_train, y_train_pred, target_names=target_names, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, target_names=target_names, output_dict=True)
    
    print(f"Writing submission file: {submission_file}")
    df.iloc[test_idx][["unique_id"]].assign(**{label_col: y_test_pred}).to_csv(submission_file, index=False)
    return model, train_report, test_report

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV

In [19]:
model_configs = {
    "gbt": [GradientBoostingClassifier, dict()],
    "rf": [RandomForestClassifier, dict(n_jobs=-1)],
    "mlp-3": [MLPClassifier, dict(hidden_layer_sizes=(256,256,128))],
    "mlp": [MLPClassifier, dict()],
    "lr": [LogisticRegressionCV, dict(n_jobs=-1)]
}

DENSE_MODELS = {"mlp", "mlp-3"}

In [20]:
reports = {}
for model_key, model_params in model_configs.items():
    model_cls, model_kwargs = model_params
    to_dense=False
    if model_cls in DENSE_MODELS:
        to_dense=True
    print(model_key, model_params)
    for task in TASKS:
        model = model_cls(**model_kwargs)
        model, train_report, test_report = %time submission_pipeline(model, df, df_features, task, model_key=model_key, to_dense=to_dense)
        reports[(model_key, task)] = {"train": train_report, "test": test_report}

gbt [<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, {}]
Generated folder: submissions\v2\gbt_purpose
train: X=(3000, 13480), y=(3000,)
test: X=(1000, 13480), y=(1000,)
Training model
Output label dist
BACKGROUND            925
USES                   28
COMPARES_CONTRASTS     19
EXTENSION              12
MOTIVATION             10
FUTURE                  6
dtype: int64
Training report
                    precision    recall  f1-score   support

        BACKGROUND       0.70      1.00      0.82      1648
COMPARES_CONTRASTS       0.99      0.41      0.58       368
         EXTENSION       1.00      0.50      0.67       171
            FUTURE       1.00      0.98      0.99        62
        MOTIVATION       1.00      0.43      0.60       276
              USES       0.98      0.49      0.65       475

          accuracy                           0.76      3000
         macro avg       0.95      0.64      0.72      3000
      weighted avg       0.83      0.76      0



Output label dist
BACKGROUND            951
COMPARES_CONTRASTS     23
USES                   20
MOTIVATION              3
EXTENSION               3
dtype: int64
Training report
                    precision    recall  f1-score   support

        BACKGROUND       0.94      1.00      0.97      1648
COMPARES_CONTRASTS       0.99      0.92      0.95       368
         EXTENSION       0.99      0.88      0.93       171
            FUTURE       1.00      0.94      0.97        62
        MOTIVATION       0.99      0.92      0.95       276
              USES       1.00      0.92      0.95       475

          accuracy                           0.96      3000
         macro avg       0.98      0.93      0.95      3000
      weighted avg       0.96      0.96      0.96      3000

Test report
                    precision    recall  f1-score   support

        BACKGROUND       0.54      0.95      0.69       546
COMPARES_CONTRASTS       0.13      0.02      0.04       121
         EXTENSION       0.

  'precision', 'predicted', average, warn_for)


Output label dist
INCIDENTAL     533
INFLUENTIAL    467
dtype: int64
Training report
              precision    recall  f1-score   support

  INCIDENTAL       0.97      0.99      0.98      1568
 INFLUENTIAL       0.98      0.97      0.97      1432

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Test report
              precision    recall  f1-score   support

  INCIDENTAL       0.51      0.59      0.55       457
 INFLUENTIAL       0.60      0.52      0.55       543

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.56      0.55      0.55      1000

Writing submission file: submissions\v2\rf_influence\submission.csv
Wall time: 478 ms
mlp-3 [<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>, {'hidden_layer_sizes': (256, 256, 128)}]
Generated folder: submissions\v2\mlp-3_purpos



Output label dist
BACKGROUND    972
USES           28
dtype: int64
Training report
                    precision    recall  f1-score   support

        BACKGROUND       0.65      1.00      0.79      1648
COMPARES_CONTRASTS       0.00      0.00      0.00       368
         EXTENSION       0.00      0.00      0.00       171
            FUTURE       0.00      0.00      0.00        62
        MOTIVATION       0.00      0.00      0.00       276
              USES       1.00      0.98      0.99       475

          accuracy                           0.70      3000
         macro avg       0.27      0.33      0.30      3000
      weighted avg       0.52      0.70      0.59      3000

Test report
                    precision    recall  f1-score   support

        BACKGROUND       0.55      0.98      0.70       546
COMPARES_CONTRASTS       0.00      0.00      0.00       121
         EXTENSION       0.00      0.00      0.00        59
            FUTURE       0.00      0.00      0.00        15
 

  'precision', 'predicted', average, warn_for)


Output label dist
INCIDENTAL     577
INFLUENTIAL    423
dtype: int64
Training report
              precision    recall  f1-score   support

  INCIDENTAL       0.81      0.89      0.85      1568
 INFLUENTIAL       0.87      0.77      0.81      1432

    accuracy                           0.83      3000
   macro avg       0.84      0.83      0.83      3000
weighted avg       0.83      0.83      0.83      3000

Test report
              precision    recall  f1-score   support

  INCIDENTAL       0.51      0.64      0.57       457
 INFLUENTIAL       0.61      0.48      0.54       543

    accuracy                           0.55      1000
   macro avg       0.56      0.56      0.55      1000
weighted avg       0.56      0.55      0.55      1000

Writing submission file: submissions\v2\lr_influence\submission.csv
Wall time: 1.52 s


In [21]:
df_reports = pd.concat([
    pd.concat([
        pd.DataFrame(report[split]).T.assign(model=model, task=task, split=split).reset_index().rename(columns={"index": "label"})
        for split in report
    ])
    for (model, task), report in reports.items()
], axis=0, sort=False, ignore_index=True)

In [22]:
df_reports

Unnamed: 0,label,precision,recall,f1-score,support,model,task,split
0,BACKGROUND,0.701620,0.998786,0.824236,1648.000,gbt,purpose,train
1,COMPARES_CONTRASTS,0.993377,0.407609,0.578035,368.000,gbt,purpose,train
2,EXTENSION,1.000000,0.502924,0.669261,171.000,gbt,purpose,train
3,FUTURE,1.000000,0.983871,0.991870,62.000,gbt,purpose,train
4,MOTIVATION,1.000000,0.427536,0.598985,276.000,gbt,purpose,train
...,...,...,...,...,...,...,...,...
135,INCIDENTAL,0.507799,0.641138,0.566731,457.000,lr,influence,test
136,INFLUENTIAL,0.612293,0.476980,0.536232,543.000,lr,influence,test
137,accuracy,0.552000,0.552000,0.552000,0.552,lr,influence,test
138,macro avg,0.560046,0.559059,0.551482,1000.000,lr,influence,test


In [23]:
df_reports.loc[
    df_reports.label=="macro avg", 
    ["f1-score", "model", "task", "split"]
].pivot_table(index="model", columns=["task", "split"], values="f1-score", aggfunc="first")

task,influence,influence,purpose,purpose
split,test,train,test,train
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
gbt,0.534654,0.76985,0.151085,0.719327
lr,0.551482,0.82963,0.135456,0.295864
mlp,0.523246,0.991648,0.186824,0.995218
mlp-3,0.523726,0.993987,0.18584,0.994995
rf,0.549955,0.975595,0.140355,0.954409


In [24]:
df_t = df_reports.loc[
    (df_reports.label=="macro avg") & (df_reports.task=="purpose"), 
    ["f1-score", "model", "task", "split"]
].pivot_table(index="model", columns="split", values="f1-score", aggfunc="first").sort_values("test")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrr}
\toprule
split &   test &  train \\
model &        &        \\
\midrule
lr    &  0.135 &  0.296 \\
rf    &  0.140 &  0.954 \\
gbt   &  0.151 &  0.719 \\
mlp-3 &  0.186 &  0.995 \\
mlp   &  0.187 &  0.995 \\
\bottomrule
\end{tabular}



split,test,train
model,Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.135456,0.295864
rf,0.140355,0.954409
gbt,0.151085,0.719327
mlp-3,0.18584,0.994995
mlp,0.186824,0.995218


In [25]:
df_t = df_reports.loc[
    (df_reports.label=="macro avg") & (df_reports.task=="influence"), 
    ["f1-score", "model", "task", "split"]
].pivot_table(index="model", columns="split", values="f1-score", aggfunc="first").sort_values("test")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrr}
\toprule
split &   test &  train \\
model &        &        \\
\midrule
mlp   &  0.523 &  0.992 \\
mlp-3 &  0.524 &  0.994 \\
gbt   &  0.535 &  0.770 \\
rf    &  0.550 &  0.976 \\
lr    &  0.551 &  0.830 \\
\bottomrule
\end{tabular}



split,test,train
model,Unnamed: 1_level_1,Unnamed: 2_level_1
mlp,0.523246,0.991648
mlp-3,0.523726,0.993987
gbt,0.534654,0.76985
rf,0.549955,0.975595
lr,0.551482,0.82963


In [26]:
df_t = df_reports.loc[
    (df_reports.split=="test") & (df_reports.task=="purpose"), 
    ["label", "f1-score", "model",]
].pivot_table(index="model", columns="label", values="f1-score", aggfunc="first").sort_values("macro avg")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrrrrrrrrr}
\toprule
label &  BACKGROUND &  COMPARES\_CONTRASTS &  EXTENSION &  FUTURE &  MOTIVATION &   USES &  accuracy &  macro avg &  weighted avg \\
model &             &                     &            &         &             &        &           &            &               \\
\midrule
lr    &       0.702 &               0.000 &      0.000 &     0.0 &       0.000 &  0.110 &     0.543 &      0.135 &         0.400 \\
rf    &       0.692 &               0.042 &      0.032 &     0.0 &       0.018 &  0.058 &     0.528 &      0.140 &         0.396 \\
gbt   &       0.683 &               0.057 &      0.056 &     0.0 &       0.000 &  0.110 &     0.518 &      0.151 &         0.400 \\
mlp-3 &       0.641 &               0.202 &      0.022 &     0.0 &       0.031 &  0.219 &     0.467 &      0.186 &         0.412 \\
mlp   &       0.639 &               0.206 &      0.049 &     0.0 &       0.028 &  0.198 &     0.462 &      0.187 &         0.410 \\
\bottomrule
\end{tabular}



label,BACKGROUND,COMPARES_CONTRASTS,EXTENSION,FUTURE,MOTIVATION,USES,accuracy,macro avg,weighted avg
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
lr,0.70224,0.0,0.0,0.0,0.0,0.110497,0.543,0.135456,0.400329
rf,0.692051,0.041667,0.032258,0.0,0.018349,0.057803,0.528,0.140355,0.395593
gbt,0.682529,0.057143,0.056338,0.0,0.0,0.110497,0.518,0.151085,0.399805
mlp-3,0.640867,0.202247,0.021978,0.0,0.030769,0.219178,0.467,0.18584,0.412478
mlp,0.639127,0.206186,0.049383,0.0,0.028369,0.19788,0.462,0.186824,0.410108


In [27]:
df_t = df_reports.loc[
    (df_reports.split=="test") & (df_reports.task=="influence"), 
    ["label", "f1-score", "model",]
].pivot_table(index="model", columns="label", values="f1-score", aggfunc="first").sort_values("macro avg")
with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{lrrrrr}
\toprule
label &  INCIDENTAL &  INFLUENTIAL &  accuracy &  macro avg &  weighted avg \\
model &             &              &           &            &               \\
\midrule
mlp   &       0.487 &        0.559 &     0.526 &      0.523 &         0.526 \\
mlp-3 &       0.512 &        0.535 &     0.524 &      0.524 &         0.525 \\
gbt   &       0.568 &        0.502 &     0.537 &      0.535 &         0.532 \\
rf    &       0.545 &        0.554 &     0.550 &      0.550 &         0.550 \\
lr    &       0.567 &        0.536 &     0.552 &      0.551 &         0.550 \\
\bottomrule
\end{tabular}



label,INCIDENTAL,INFLUENTIAL,accuracy,macro avg,weighted avg
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mlp,0.487013,0.55948,0.526,0.523246,0.526362
mlp-3,0.512295,0.535156,0.524,0.523726,0.524709
gbt,0.567694,0.501615,0.537,0.534654,0.531813
rf,0.545455,0.554455,0.55,0.549955,0.550342
lr,0.566731,0.536232,0.552,0.551482,0.55017


## Investigate model

In [28]:
model_path = SUBMISSION_PATH / "lr_influence/model.joblib"
lr_influence_model = load(model_path)

In [32]:
lr_influence_model.coef_.shape

(1, 13480)

In [33]:
lr_influence_model.classes_

array([0, 1])

In [38]:
"citation_context_tfidf__00__0".split("__", 1)

['citation_context_tfidf', '00__0']

In [42]:
df_coefs = pd.DataFrame(
    lr_influence_model.coef_.T, 
    index=[x.split("__", 1)[-1] for x in ct.get_feature_names()], 
    columns=["weight"]
).rename_axis("feature").reset_index()
df_coefs.head()

Unnamed: 0,feature,weight
0,0,-0.016262
1,0,-0.004043
2,1,0.045382
3,26,-0.06033
4,4,-0.069719


In [52]:
df_t = pd.concat({
    INFLUENCE_LABELS[0]: df_coefs.sort_values("weight").reset_index(drop=True),
    INFLUENCE_LABELS[1]: df_coefs.sort_values("weight", ascending=False).reset_index(drop=True),
}, axis=1, sort=False).head(10)

with pd.option_context("precision", 3):
    print(df_t.to_latex())
df_t

\begin{tabular}{llrlr}
\toprule
{} & \multicolumn{2}{l}{INCIDENTAL} & \multicolumn{2}{l}{INFLUENTIAL} \\
{} &    feature & weight &      feature & weight \\
\midrule
0 &  including & -0.703 &          the &  1.547 \\
1 &   learning & -0.702 &        first &  0.813 \\
2 &         11 & -0.652 &         were &  0.742 \\
3 &       2002 & -0.629 &           to &  0.676 \\
4 &        and & -0.624 &           of &  0.631 \\
5 &        amp & -0.623 &    cessation &  0.620 \\
6 &   academic & -0.608 &           us &  0.575 \\
7 &     impact & -0.580 &          avh &  0.518 \\
8 &         13 & -0.544 &        virus &  0.513 \\
9 &   research & -0.495 &  temperature &  0.510 \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,INCIDENTAL,INCIDENTAL,INFLUENTIAL,INFLUENTIAL
Unnamed: 0_level_1,feature,weight,feature,weight
0,including,-0.703429,the,1.546984
1,learning,-0.702319,first,0.812547
2,11,-0.652398,were,0.742135
3,2002,-0.629222,to,0.675671
4,and,-0.62402,of,0.631087
5,amp,-0.622686,cessation,0.619857
6,academic,-0.608006,us,0.574899
7,impact,-0.580294,avh,0.517897
8,13,-0.544389,virus,0.51279
9,research,-0.495197,temperature,0.509712
