In [71]:
import pandas as pd
from pici.labelling import InnovationLabels
from pici import Pici

p = Pici(
    cache_dir='../../cache',
    #cache_nrows=10000,
    start='2017-01-01',
    end='2019-01-01',
    labels=[
       InnovationLabels(
           pd.read_excel("../../pici/tests/test_integrated_labels.xlsx")
       ),
       InnovationLabels().from_limesurvey(
           pd.read_excel("../../pici/tests/results-survey664322_2022-08-01.xlsx"),
           drop_labellers=["Test","test"]
       )
    ],
)

In [72]:
X, Y = p.get_topic_features()

In [73]:
X = X.apply(pd.to_numeric)

In [74]:
X

Unnamed: 0,number of posts,number of contributors,first post date,second post date,last post date,delay first last post,delay first second post
0,4,3,1502976779000000000,1502984003000000000,1503047575000000000,0,0
1,8,3,1485400480000000000,1485414291000000000,1495721199000000000,119,0
2,1,1,1489793738000000000,1489793738000000000,1489793738000000000,0,0
3,2,2,1484252910000000000,1484338804000000000,1484338804000000000,0,0
4,2,2,1484252910000000000,1484338804000000000,1484338804000000000,0,0
...,...,...,...,...,...,...,...
521,7,6,1522581780000000000,1522597200000000000,1527886080000000000,61,0
522,7,6,1522581780000000000,1522597200000000000,1527886080000000000,61,0
523,2,2,1492318020000000000,1492510140000000000,1492510140000000000,2,2
524,10,5,1494089520000000000,1496766180000000000,1512571680000000000,213,30


In [75]:
Y

Unnamed: 0,labeller,community_name,url,label_idea,label_evaluation,label_implementation,label_modification,label_improvement,label_potential,label_any_activity,label_has_potential,id
0,petrol39blackberry0,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,False,False,1,True,True,16852
1,xwegner_lgh@outlook.de,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,True,False,1,True,True,25188
2,anna+philipp,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,False,False,2,True,True,55121
3,anna+philipp,OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,True,True,2,True,True,56276
4,Jan-Philipp (JP),OpenStreetMap,https://forum.openstreetmap.org/viewtopic.php?...,True,True,False,False,True,2,True,True,56276
...,...,...,...,...,...,...,...,...,...,...,...,...
521,Jan-Philipp (JP),PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,True,2,True,True,whirligig-star
522,Jan P.,PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,True,2,True,True,whirligig-star
523,Larilu,PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,False,0,True,False,why-not-develop-a-pelletizing-machine
524,Larilu,PreciousPlastic,https://davehakkens.nl/community/forums/topic/...,True,True,False,False,False,1,True,True,wooden-framework


In [76]:
y = Y.label_idea

In [77]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0, class_weight="balanced", n_estimators=30)
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(2), scoring='roc_auc', n_jobs=-1)

param_grid = {
    'estimator__max_features': ['auto'],
    'estimator__max_depth' : [3,4,5,10],
    'estimator__criterion' :['gini']
}
k_fold = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)

CV_rfc = GridSearchCV(estimator=rfecv, param_grid=param_grid, cv= k_fold, scoring = 'roc_auc')

X_train, X_test, y_train, y_test = train_test_split(X, y)

CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=0, shuffle=True),
             estimator=RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
                             estimator=RandomForestClassifier(class_weight='balanced',
                                                              n_estimators=30,
                                                              random_state=0),
                             n_jobs=-1, scoring='roc_auc'),
             param_grid={'estimator__criterion': ['gini'],
                         'estimator__max_depth': [3, 4, 5, 10],
                         'estimator__max_features': ['auto']},
             scoring='roc_auc')

In [78]:
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)
print(CV_rfc.best_estimator_)

{'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': 'auto'}
0.7274360792926092
RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
      estimator=RandomForestClassifier(class_weight='balanced', max_depth=10,
                                       n_estimators=30, random_state=0),
      n_jobs=-1, scoring='roc_auc')


In [79]:
CV_rfc.best_estimator_.support_

array([ True,  True,  True,  True,  True,  True,  True])

In [80]:
import numpy as np
np.absolute(CV_rfc.best_estimator_.estimator_.feature_importances_)

array([0.14554351, 0.09247178, 0.16819497, 0.20586004, 0.1846271 ,
       0.14417149, 0.05913111])

In [81]:
CV_rfc.best_estimator_.support_

array([ True,  True,  True,  True,  True,  True,  True])

## Logistic regression

In [82]:
import statsmodels.api as sm
from statsmodels.formula.api import logit

In [83]:
data = pd.merge(X,Y, left_index=True, right_index=True)

In [84]:
data = data[X.columns.tolist() + ['label_any_activity']].astype(float)

In [85]:
data

Unnamed: 0,number of posts,number of contributors,first post date,second post date,last post date,delay first last post,delay first second post,label_any_activity
0,4.0,3.0,1.502977e+18,1.502984e+18,1.503048e+18,0.0,0.0,1.0
1,8.0,3.0,1.485400e+18,1.485414e+18,1.495721e+18,119.0,0.0,1.0
2,1.0,1.0,1.489794e+18,1.489794e+18,1.489794e+18,0.0,0.0,1.0
3,2.0,2.0,1.484253e+18,1.484339e+18,1.484339e+18,0.0,0.0,1.0
4,2.0,2.0,1.484253e+18,1.484339e+18,1.484339e+18,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
521,7.0,6.0,1.522582e+18,1.522597e+18,1.527886e+18,61.0,0.0,1.0
522,7.0,6.0,1.522582e+18,1.522597e+18,1.527886e+18,61.0,0.0,1.0
523,2.0,2.0,1.492318e+18,1.492510e+18,1.492510e+18,2.0,2.0,1.0
524,10.0,5.0,1.494090e+18,1.496766e+18,1.512572e+18,213.0,30.0,1.0


In [86]:
simple_model = logit(
    "label_any_activity ~ Q('number of posts') + Q('number of contributors') + Q('delay first last post') + Q('delay first second post')",
    data,
).fit()

Optimization terminated successfully.
         Current function value: 0.629151
         Iterations 7


In [87]:
print(simple_model.summary())

                           Logit Regression Results                           
Dep. Variable:     label_any_activity   No. Observations:                  526
Model:                          Logit   Df Residuals:                      521
Method:                           MLE   Df Model:                            4
Date:                Mon, 22 Aug 2022   Pseudo R-squ.:                 0.05904
Time:                        17:10:06   Log-Likelihood:                -330.93
converged:                       True   LL-Null:                       -351.70
Covariance Type:            nonrobust   LLR p-value:                 2.086e-08
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept                        0.0869      0.114      0.759      0.448      -0.138       0.311
Q('number of posts')             0.0558      0.022      2.593      0.010     

## extratrees

In [88]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()

y = Y.label_any_activity

model.fit(X,y)

print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.plot(kind='barh')
# plt.show()
feat_importances

[0.13001173 0.11466877 0.18099553 0.17666646 0.17195886 0.15704696
 0.06865168]


number of posts            0.130012
number of contributors     0.114669
first post date            0.180996
second post date           0.176666
last post date             0.171959
delay first last post      0.157047
delay first second post    0.068652
dtype: float64

## selectfrommodel

In [89]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

y = Y.label_any_activity

clf = LassoCV().fit(X, y)
importance = np.abs(clf.coef_)
idx_third = importance.argsort()[-3]
threshold = importance[idx_third] + 0.01

idx_features = (-importance).argsort()[:10]
name_features = np.array(X.columns)[idx_features]
print('Selected features: {}'.format(name_features))

sfm = SelectFromModel(clf, threshold=threshold)
sfm.fit(X, y)
X_transform = sfm.transform(X)

n_features = sfm.transform(X).shape[1]

Selected features: ['first post date' 'last post date' 'number of posts'
 'number of contributors' 'second post date' 'delay first last post'
 'delay first second post']


## Boruta

In [91]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

y = Y.label_any_activity

# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
X_arr = X.values
y_arr = y.values

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X_arr, y_arr)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X_arr)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	9 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	10 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	6
Tentative: 	0
Rejected: 	1


BorutaPy finished running.

Iteration: 	13 / 100
Confirmed: 	6
Tentative: 	0
Rejected: 	1


In [93]:
feat_selector.support_

array([ True,  True,  True,  True,  True,  True, False])

In [95]:
set(X.columns[feat_selector.support_])

{'delay first last post',
 'first post date',
 'last post date',
 'number of contributors',
 'number of posts',
 'second post date'}

In [96]:
set(X.columns) - set(X.columns[feat_selector.support_])

{'delay first second post'}