# 20Newsgroups dataset

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import scipy as sp
from scipy.io import mmread
import mpmath as mp
from scipy.optimize import minimize
import time
%matplotlib inline

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [5]:
target_names=['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# calculates the negative
def negloglik(param, mat):
    alpha = param[0]
    c = param[1]
    sigma = param[2]
    Z = mat
    n = Z.shape[0] # number of rows
    exponent_vec = np.zeros(n)
    for i in range(1, n+1): # need to say 1, n+1 to get 1:n
        exponent_vec[i-1] = (mp.gamma(1 + c) * mp.gamma(i - 1 + c + sigma)) / (mp.gamma(i + c) * mp.gamma(c + sigma))
    m = np.asarray(Z.sum(axis=0)).flatten().astype(int)  # sum of columns
    K = len(m)
    prod_vec = (sp.special.loggamma(m - sigma) + sp.special.loggamma(n - m + c + sigma) + sp.special.loggamma(1 + c)) - (sp.special.loggamma(1 - sigma) + sp.special.loggamma(c + sigma) + sp.special.loggamma(n + c))
    loglikelihood = (-alpha * sum(exponent_vec)) + sum(prod_vec) + K* np.log(alpha)
    return(np.real(-loglikelihood))

In [7]:
# constraints on param = (alpha, c, sigma)
cons = ({'type': 'ineq', 'fun': lambda x: x[1] + x[2]})

In [24]:
start = time.time()
results = {}
initial = np.array([135, 4, 0.5])
for name in target_names:
    newsgroups_train = fetch_20newsgroups(subset='train', categories=[name])
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    test_mat = vectors
    test_mat.data = np.ones(test_mat.data.size)
    result_min = minimize(negloglik, initial, constraints = cons, 
                          bounds=((0, None), (-1, None), (0, 1)), args=(test_mat))
    results[name]=result_min.x
elapsed = time.time() - start

  


ValueError: gamma function pole

In [9]:
sp.special.gamma(0)

inf

In [19]:
results

{'alt.atheism': array([135.02194122,   4.07062466,   0.50186607]),
 'comp.graphics': array([134.98689007,   3.79640476,   0.53419657]),
 'comp.os.ms-windows.misc': array([135.06401209,   4.99794911,   0.50014705]),
 'comp.sys.ibm.pc.hardware': array([134.90812173,   2.42461896,   0.536132  ]),
 'comp.sys.mac.hardware': array([134.90801614,   2.53364614,   0.5119352 ]),
 'comp.windows.x': array([135.11942325,   5.64897701,   0.48883374]),
 'misc.forsale': array([134.81017311,   2.09173238,   0.56024849]),
 'rec.autos': array([135.02805341,   4.74695094,   0.45907182]),
 'rec.motorcycles': array([135.03172669,   4.90538638,   0.46107375]),
 'rec.sport.baseball': array([135.06107267,   5.05585769,   0.40837893]),
 'rec.sport.hockey': array([176.96960424,   4.18530627,   0.42056266]),
 'sci.crypt': array([135.0956874 ,   5.16210435,   0.52163749]),
 'sci.electronics': array([134.99877406,   3.96828253,   0.48980101]),
 'sci.med': array([135.15995464,   6.13434138,   0.5073538 ]),
 'sci.spa

In [17]:
results

{'alt.atheism': array([292.82148128,  -2.69681198,   0.7389434 ]),
 'comp.graphics': array([292.81859876,  -2.69670819,   0.73907546]),
 'comp.os.ms-windows.misc': array([2.92918180e+02, 1.31562554e-01, 7.57205342e-01]),
 'comp.sys.ibm.pc.hardware': array([ 2.92914348e+02, -5.83919092e-02,  4.93323056e-01]),
 'comp.sys.mac.hardware': array([2.92921331e+02, 1.43906531e-01, 5.27901534e-01]),
 'comp.windows.x': array([292.81932603,  -2.69673454,   0.73908038]),
 'misc.forsale': array([292.84284553,  -1.9999775 ,   0.67340729]),
 'rec.autos': array([292.94509865,   0.82421851,   0.46874184]),
 'rec.motorcycles': array([292.94464438,   0.77593322,   0.47648937]),
 'rec.sport.baseball': array([ 292.87831269, -316.99322266,    0.73897987]),
 'rec.sport.hockey': array([292.82056589,  -2.69677923,   0.73912643]),
 'sci.crypt': array([292.82212489,  -2.69683495,   0.73905879]),
 'sci.electronics': array([ 2.92914007e+02, -7.03655525e-02,  4.92410146e-01]),
 'sci.med': array([292.82090464,  -2.69

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [45]:
test_mat = vectors

test_mat.data = np.ones(test_mat.data.size) # change all non-zero values to 1

col_sums = test_mat.sum(axis=0)

col_sums = np.asarray(col_sums)

col_sums = col_sums.flatten() # Col sums ie number of documents where a given word appears

col_sums = col_sums.astype(int) # careful as col_sums has float values not integers