In [89]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from gensim.corpora import Dictionary


from gensim.sklearn_api.tfidf import TfIdfTransformer
from gensim.matutils import corpus2csc

In [90]:
cats=['sci.space', 'comp.graphics']

In [91]:
newsgroups = fetch_20newsgroups(subset='train',
                                categories=['sci.space', 'comp.graphics'])

In [92]:
def doc_len(x, corpus=True):
    if corpus:
        length = 0
        for i in x:
            length += len(i)
        return length
    else:
        return len(x)

In [93]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, random_state=49)

id2word = Dictionary([_.split() for _ in X_train])

train_corpus = [id2word.doc2bow(i.split()) for i in X_train]
test_corpus = [id2word.doc2bow(i.split()) for i in X_test]

tfidf_transformer = TfIdfTransformer(pivot_norm=True, slope=0.6).fit(train_corpus)
X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T
X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T
clf = LogisticRegression().fit(X_train_tfidf, y_train)
print clf.score(X_test_tfidf, y_test)

0.9728813559322034


In [94]:
doc_scores = clf.decision_function(X_test_tfidf)

In [95]:
def gettopk(doc_scores, X_test, k=None):
    if k is None:
        k = len(X_test)/10
    doc_scores = sorted(enumerate(doc_scores), key=lambda x: x[1])
    leng=0
    kleng=0
    for i,_ in enumerate(doc_scores):
        leng+=len(X_test[_[0]])
        if i==k:
           kleng=leng 
    print "top",k ,"documents have mean length of",kleng/k
    print "corpus has a mean length of",leng/float(len(X_test))

In [96]:
print ("With pivoted normalisation our top k docs have mean lenght closer to the global mean doc length")
gettopk(doc_scores, X_test)

top 29 documents have mean length of 1432
corpus has a mean length of 1686.73898305


In [97]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, random_state=49)

id2word = Dictionary([_.split() for _ in X_train])

train_corpus = [id2word.doc2bow(i.split()) for i in X_train]
test_corpus = [id2word.doc2bow(i.split()) for i in X_test]

tfidf_transformer = TfIdfTransformer(pivot_norm=True, slope=1).fit(train_corpus)
X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T
X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T
clf = LogisticRegression().fit(X_train_tfidf, y_train)
print clf.score(X_test_tfidf, y_test)

0.9661016949152542


In [98]:
doc_scores = clf.decision_function(X_test_tfidf)

In [99]:
print ("Normal cosine normalisation favors short documents as our top k docs have a smaller mean")
gettopk(doc_scores, X_test)

top 29 documents have mean length of 910
corpus has a mean length of 1686.73898305
