In [1]:
import pandas as pd
from pici.labelling import InnovationLabels
from pici import Pici

p = Pici(
    cache_dir='../../cache',
    #cache_nrows=10000,
    start='2017-01-01',
    end='2019-01-01',
    labels=[
        InnovationLabels(
            pd.read_excel("../../pici/tests/test_integrated_labels.xlsx")
        ),
        InnovationLabels().from_limesurvey(
            pd.read_excel("../../pici/tests/results-survey664322_2022-08-01.xlsx"),
            drop_labellers=["Test","test"]
        )
    ],
)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'infomap', 'wurlitzer', 'karateclub'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'karateclub', 'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'infomap', 'wurlitzer'}


In [2]:
import networkx as nx
import pandas as pd
from pici.reporting import topics_metric

@topics_metric
def all_well_connected(community, k):
    """
    An indicator for whether all contributors to a thread
    are well-connected. There are two metrics for this concept,

    - ``all contributors in k-core`` - whether all contributors
       belong to the co-contributor network's k-core, and
    - ``all contributors out_degree >= 1`` - whether all
       contributors have commented on at least one thread that
       they did not start.

    Args:
        community: A pici.Community object
        k: parameter for k-core metric

    Returns:
        dict of (str, Pandas.Series)

    """

    # the easiest way to retain the topics index
    # is to define our metric on the posts df
    # and then aggregate to the topics level:

    # a) k-cores

    contributor_cores = nx.core_number(
        community.co_contributor_graph
    )

    # a df with one row per post, and True if contributor
    # belongs at least to k-core
    df = community.posts[[community.topic_column, community.contributor_column]]
    df['in_k_core'] = df[community.contributor_column].map(
        contributor_cores) >= k

    # aggregate to boolean topic-level metric,
    # rule: all(in_k_core)==True
    all_in_k_core = df.groupby(by=community.topic_column)['in_k_core'].agg(all)

    # b) have commented in other threads

    out_degree = community.commenter_graph.out_degree

    df['out_deg_lg_1'] = df[community.contributor_column].map(
        out_degree) >= 1
    all_commenters = df.groupby(by=community.topic_column)['out_deg_lg_1'].agg(all)

    return {
        f'all contributors in {k}-core': all_in_k_core,
        'all contributors out_degree >= 1': all_commenters
    }

In [3]:
p.add_metric(all_well_connected)

In [4]:
# prepare features and labels
X, Y = p.get_topic_features(parameters={
    'all_well_connected': {'k': 10}
})
X = X.apply(pd.to_numeric)

In [5]:
Y

Unnamed: 0,labeller,community_name,url,label_idea,label_evaluation,label_implementation,label_modification,label_improvement,label_potential,label_any_activity,label_has_potential,id
0,petrol39blackberry0,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,False,False,1,True,True,16852
1,xwegner_lgh@outlook.de,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,True,False,1,True,True,25188
2,anna+philipp,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,False,False,2,True,True,55121
3,anna+philipp,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,True,True,2,True,True,56276
4,Jan-Philipp (JP),OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,False,True,2,True,True,56276
...,...,...,...,...,...,...,...,...,...,...,...,...
521,Jan-Philipp (JP),PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,True,2,True,True,whirligig-star
522,Jan P.,PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,True,2,True,True,whirligig-star
523,Larilu,PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,False,0,True,False,why-not-develop-a-pelletizing-machine
524,Larilu,PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,False,1,True,True,wooden-framework


In [7]:
# multi-label classification!?
y = Y[['label_has_potential', 'label_any_activity']]
Pici.select_features(None, X,y)

ValueError: y should be a 1d array, got an array of shape (526, 2) instead.

<https://www.kaggle.com/code/residentmario/notes-on-multiclass-and-multitask-schemes/notebook>

multi-label to multi-class? <https://doi.org/10.3390/e18080282>

<http://scikit.ml/api/skmultilearn.problem_transform.lp.html>