In [6]:
# a demonstration of use of the logistic regression classifier 
# for content classification on doc2vec docs vectors
# where logit probabilities of the classified content against the given categories can be used as relevance rating
# towards the classified categories

# references:
# [1] https://deeplearning4j.org/word2vec.html
# [2] http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# [3] http://scikit-learn.org/stable/modules/cross_validation.html

# author Michal Stefanik  mistefan  at  redhat dot com

import numpy as np
import pandas as pd
import random

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# D2VWrapper is our implementation providing vectorization of the documents based on doc2vec
from doc2vec_wrapper import D2VWrapper

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
# target categories
product_list = ["amq", "eap", "webserver", "datagrid", "fuse", "brms", "bpmsuite", "devstudio", "cdk",
                "developertoolset", "rhel", "softwarecollections", "mobileplatform", "openshift"]

In [8]:
# initialize d2v_wrapper providing as well metadata about the models state
d2v_wrapper = D2VWrapper(content_categories=product_list,
                         vector_length=500)
# EITHER initialize the vocab of documents and minimize the distances of embeddings in training phase
# d2v_wrapper.init_model_vocab(content_basepath="../../data/content/playground/auto/nostem",
#                              basepath_suffix="_content.csv", drop_short_docs=10)
# d2v_wrapper.train_model(shuffle=True, epochs=1 if TEST_MODE else 20)
# d2v_wrapper.infer_content_vectors()

# OR load initialized and trained wrapper if available
d2v_wrapper.load_persisted_wrapper("trained_models/wrapper/header_incl/10epoch_train_stem_not_removed_header")

2017-04-26 14:29:35,096 : INFO : Loading serialized wrapper model from: trained_models/wrapper/header_incl/10epoch_train_stem_not_removed_header
2017-04-26 14:29:35,097 : INFO : Loading all_base_vocab_docs objects
2017-04-26 14:29:43,460 : INFO : Initialized 43478 headers of 43478 for vectorization
2017-04-26 14:29:43,461 : INFO : Loading all_base_vocab_docs vectors
2017-04-26 14:30:21,734 : INFO : Loading trained Doc2Vec model
2017-04-26 14:30:21,735 : INFO : loading Doc2Vec object from trained_models/wrapper/header_incl/10epoch_train_stem_not_removed_headerdoc2vec.mod
2017-04-26 14:30:21,998 : INFO : loading docvecs recursively from trained_models/wrapper/header_incl/10epoch_train_stem_not_removed_headerdoc2vec.mod.docvecs.* with mmap=None
2017-04-26 14:30:21,999 : INFO : loading doctag_syn0 from trained_models/wrapper/header_incl/10epoch_train_stem_not_removed_headerdoc2vec.mod.docvecs.doctag_syn0.npy with mmap=None
2017-04-26 14:30:22,170 : INFO : loading syn1neg from trained_model

In [9]:
from common import parsing_utils as parsing

new_content_basepath = "/home/michal/Documents/Projects/ml/project-classifier-poc/project-classifier-poc/data/content/books_test/"
new_content_df = parsing.get_content_as_dataframe(new_content_basepath, ".csv", ["developers_redhat"])
new_content = parsing.select_training_content(new_content_df, sent_split=False)
new_content_headers = parsing.select_headers(new_content_df)
new_content_docs = parsing.tagged_docs_from_content(new_content, new_content_headers, pd.Series([None]*len(new_content)))

2017-04-26 14:30:22,501 : INFO : Initializing 85 CategorizedDocuments


In [10]:
train_content = d2v_wrapper.infer_vocab_content_vectors()
train_content_vectors = train_content.iloc[:, :-1]
train_content_targets = train_content.iloc[:, -1]

2017-04-26 14:30:22,515 : INFO : Returning already inferred doc vectors of 43478 all_base_vocab_docs


In [11]:
from sklearn.linear_model import LogisticRegression
# classifier training
logging.info("Fitting classifier")

log_reg_classifier = LogisticRegression(C=0.3, solver="sag", multi_class='ovr', n_jobs=8, max_iter=1000)
log_reg_classifier.fit(train_content_vectors, train_content_targets)

2017-04-26 14:30:22,667 : INFO : Fitting classifier


LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=8,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# classifier testing
logging.info("Predicting")
test_content_vectors = d2v_wrapper.infer_content_vectors(new_content_docs)
top_target_categories = log_reg_classifier.predict(test_content_vectors)

2017-04-26 14:34:48,362 : INFO : Predicting
2017-04-26 14:34:48,363 : INFO : Initialized 85 headers of 85 for vectorization
2017-04-26 14:34:48,363 : INFO : Inferring vectors of 85 documents
2017-04-26 14:34:48,511 : INFO : Inferring vectors of 85 headers


In [13]:
new_content_df.keys()

Index([u'project', u'source', u'sys_content_plaintext', u'sys_description',
       u'sys_title', u'sys_url_view', u'target'],
      dtype='object')

In [14]:
# evaluation:
# logits
logging.info("Probs collection")
class_probs = log_reg_classifier.predict_proba(test_content_vectors)
class_ordered = list(log_reg_classifier.classes_)

class_actual_index = pd.Series(top_target_categories).apply(lambda cat_label: class_ordered.index(cat_label))
actual_prob = class_probs[np.arange(len(class_actual_index)),(class_actual_index)]

probs_matrix = pd.DataFrame(data=class_probs, columns=log_reg_classifier.classes_)
# probs_matrix["url"] = new_content_df["sys_url_view"]
top_five_cats = probs_matrix.apply(lambda row: row.nlargest(n=5), axis=1).columns

2017-04-26 14:34:48,622 : INFO : Probs collection


In [15]:
probs_matrix.head(10)

Unnamed: 0,amq,bpmsuite,brms,cdk,datagrid,developertoolset,devstudio,eap,fuse,mobileplatform,openshift,rhel,softwarecollections,webserver
0,0.006989,0.00061,0.072619,0.008377,0.014659,0.00034,0.040166,0.617561,0.077218,0.000162,0.005559,0.001161,0.000236,0.154343
1,0.019781,0.0068,0.003922,0.000301,0.007276,0.000226,0.010787,0.864492,0.047458,0.00121,0.000148,0.000793,7.1e-05,0.036736
2,0.001071,0.004909,0.007066,0.000355,0.000389,3.9e-05,0.478675,0.503412,0.000443,0.0004,0.000313,0.000124,3e-05,0.002774
3,0.00268,0.03473,0.041035,0.000908,0.033139,0.000192,0.0141,0.456726,0.380827,0.000285,0.000251,0.002124,0.000179,0.032825
4,0.000472,0.001951,0.011906,0.000452,0.007861,0.000141,0.036011,0.562413,0.096494,0.000408,0.000353,1.1e-05,3e-05,0.281497
5,0.073711,0.013567,0.002593,0.000313,0.002283,0.00021,0.171163,0.578954,0.08673,0.002615,0.003903,0.053562,0.000155,0.010243
6,0.002207,0.00153,0.017849,0.000152,0.002154,4.7e-05,0.002454,0.955179,0.001635,0.000562,5.5e-05,0.001953,4e-05,0.014184
7,0.050461,0.006597,0.003294,0.001256,0.01203,9.1e-05,0.011606,0.167315,0.008731,0.000544,0.000589,0.736043,0.000102,0.00134
8,0.020994,0.00418,0.004101,0.000243,0.00353,0.000347,0.012495,0.534061,0.363802,0.002091,0.00112,0.039482,0.0001,0.013456
9,0.001926,0.004413,0.00654,0.000713,4e-05,0.000671,0.029472,0.800179,0.018838,0.000136,0.02915,0.025805,0.000616,0.081503


In [19]:
pd.set_option('max_colwidth', 400)

probs_matrix["url"] = new_content_df["sys_url_view"]

display_df = probs_matrix[list(top_five_cats)+["url"]]
display_df["header"] = new_content_df["sys_title"]

display_df = display_df[["header", "url"] + list(display_df.columns[:-2])]
# non_nan_mask
# cols = df.columns
# bt = df.apply(lambda x: x > 0)
# bt.apply(lambda x: list(cols[x.values]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
display_df

Unnamed: 0,header,url,amq,bpmsuite,brms,cdk,datagrid,devstudio,eap,fuse,mobileplatform,openshift,rhel,webserver
0,beginning pojos lightweight java web development plain old java objects spring hibernate tapestry,http://www.apress.com/9781590595961,0.006989,0.000610,0.072619,0.008377,0.014659,0.040166,0.617561,0.077218,0.000162,0.005559,0.001161,0.154343
1,jboss performance tuning,http://books.google.com/books/about/JBoss_AS_5_Performance_Tuning.html?hl=&id=J-6VeA4b-qgC,0.019781,0.006800,0.003922,0.000301,0.007276,0.010787,0.864492,0.047458,0.001210,0.000148,0.000793,0.036736
2,jboss tools developers guide,http://books.google.com/books/about/JBoss_Tools_3_Developers_Guide.html?hl=&id=fwsRzMar6uoC,0.001071,0.004909,0.007066,0.000355,0.000389,0.478675,0.503412,0.000443,0.000400,0.000313,0.000124,0.002774
3,restful java jax edition,http://shop.oreilly.com/product/0636920028925.do?sortby=publicationDate,0.002680,0.034730,0.041035,0.000908,0.033139,0.014100,0.456726,0.380827,0.000285,0.000251,0.002124,0.032825
4,practical jboss seam projects,http://www.apress.com/9781590598634,0.000472,0.001951,0.011906,0.000452,0.007861,0.036011,0.562413,0.096494,0.000408,0.000353,0.000011,0.281497
5,information systems development,http://books.google.com/books/about/Information_Systems_Development.html?hl=&id=3e4qEFw-Qb0C,0.073711,0.013567,0.002593,0.000313,0.002283,0.171163,0.578954,0.086730,0.002615,0.003903,0.053562,0.010243
6,jboss deployment administration,http://www.apress.com/9781590592816,0.002207,0.001530,0.017849,0.000152,0.002154,0.002454,0.955179,0.001635,0.000562,0.000055,0.001953,0.014184
7,distributed applications interoperable systems,http://www.springer.com/us/book/9780412823404,0.050461,0.006597,0.003294,0.001256,0.012030,0.011606,0.167315,0.008731,0.000544,0.000589,0.736043,0.001340
8,meaningful internet systems otm workshops,http://www.springer.com/us/book/9783642052897,0.020994,0.004180,0.004101,0.000243,0.003530,0.012495,0.534061,0.363802,0.002091,0.001120,0.039482,0.013456
9,expanding choice,http://books.google.com/books/about/Expanding_Choice.html?hl=&id=Pm6Dap6Bq7AC,0.001926,0.004413,0.006540,0.000713,0.000040,0.029472,0.800179,0.018838,0.000136,0.029150,0.025805,0.081503


In [23]:
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s > 0.2
    return ['background-color: yellow' if v else '' for v in is_max]

In [24]:
display_df.style.apply(highlight_max, axis=1)

Unnamed: 0,header,url,amq,bpmsuite,brms,cdk,datagrid,devstudio,eap,fuse,mobileplatform,openshift,rhel,webserver
0,beginning pojos lightweight java web development plain old java objects spring hibernate tapestry,http://www.apress.com/9781590595961,0.00698921,0.000610235,0.0726188,0.00837661,0.0146589,0.040166,0.617561,0.0772182,0.00016199,0.0055593,0.00116128,0.154343
1,jboss performance tuning,http://books.google.com/books/about/JBoss_AS_5_Performance_Tuning.html?hl=&id=J-6VeA4b-qgC,0.0197806,0.00680006,0.00392191,0.000301391,0.00727621,0.0107868,0.864492,0.0474579,0.00120969,0.000148412,0.000792598,0.0367359
2,jboss tools developers guide,http://books.google.com/books/about/JBoss_Tools_3_Developers_Guide.html?hl=&id=fwsRzMar6uoC,0.00107135,0.00490906,0.00706583,0.000354627,0.000388706,0.478675,0.503412,0.00044318,0.000399846,0.000313366,0.000123884,0.00277411
3,restful java jax edition,http://shop.oreilly.com/product/0636920028925.do?sortby=publicationDate,0.00267981,0.0347299,0.0410349,0.000908341,0.0331392,0.0140995,0.456726,0.380827,0.000285093,0.000250812,0.00212351,0.0328251
4,practical jboss seam projects,http://www.apress.com/9781590598634,0.000471907,0.00195127,0.0119059,0.00045169,0.00786115,0.0360113,0.562413,0.0964942,0.000408088,0.000352882,1.08441e-05,0.281497
5,information systems development,http://books.google.com/books/about/Information_Systems_Development.html?hl=&id=3e4qEFw-Qb0C,0.0737105,0.0135665,0.00259322,0.000312892,0.00228347,0.171163,0.578954,0.0867299,0.00261486,0.00390297,0.0535618,0.0102427
6,jboss deployment administration,http://www.apress.com/9781590592816,0.00220724,0.00152983,0.0178493,0.000151682,0.00215388,0.00245392,0.955179,0.00163482,0.000561591,5.53047e-05,0.00195283,0.0141839
7,distributed applications interoperable systems,http://www.springer.com/us/book/9780412823404,0.0504612,0.00659721,0.00329433,0.00125591,0.0120303,0.011606,0.167315,0.00873124,0.00054367,0.000589098,0.736043,0.00133996
8,meaningful internet systems otm workshops,http://www.springer.com/us/book/9783642052897,0.0209939,0.00417957,0.00410085,0.000243478,0.00352985,0.012495,0.534061,0.363802,0.00209078,0.00112013,0.0394815,0.0134558
9,expanding choice,http://books.google.com/books/about/Expanding_Choice.html?hl=&id=Pm6Dap6Bq7AC,0.00192605,0.0044126,0.00654002,0.000713,3.97239e-05,0.029472,0.800179,0.018838,0.000135821,0.0291497,0.0258049,0.0815027
