# 20Newsgroups dataset

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import scipy as sp
from scipy.io import mmread
import mpmath as mp
from scipy.optimize import minimize
import time
%matplotlib inline

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [6]:
target_names=['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

what should alpha be for each of the 20 newsgroups?

In [10]:
alpha_est = {}
for name in target_names:
    newsgroups_train = fetch_20newsgroups(subset='train', categories=[name])
    vectors=vectorizer.fit_transform(newsgroups_train.data)
    test_mat=vectors
    test_mat.data=np.ones(test_mat.data.size)
    alpha_est[name]=sum(test_mat.data)/test_mat.shape[0]

In [21]:
np.std(list(alpha_est.values()))

34.411511544783735

In [26]:
a=list(alpha_est.values())
np.mean(a)

159.2761501183457

In [7]:
# calculates the negative loglikelihood
def negloglik(param, mat):
    alpha = param[0]
    c = param[1]
    sigma = param[2]
    Z = mat
    n = Z.shape[0] # number of rows
    exponent_vec = np.zeros(n)
    for i in range(1, n+1): # need to say 1, n+1 to get 1:n
        exponent_vec[i-1] = (mp.gamma(1 + c) * mp.gamma(i - 1 + c + sigma)) / (mp.gamma(i + c) * mp.gamma(c + sigma))
    m = np.asarray(Z.sum(axis=0)).flatten().astype(int)  # sum of columns
    K = len(m)
    prod_vec = (sp.special.loggamma(m - sigma) + sp.special.loggamma(n - m + c + sigma) + sp.special.loggamma(1 + c)) - (sp.special.loggamma(1 - sigma) + sp.special.loggamma(c + sigma) + sp.special.loggamma(n + c))
    loglikelihood = (-alpha * sum(exponent_vec)) + sum(prod_vec) + K* np.log(alpha)
    print(param)
    return(np.real(-loglikelihood)-200000)

In [8]:
# constraints on param = (alpha, c, sigma)
cons = ({'type': 'ineq', 'fun': lambda x: x[1] + x[2]})

In [9]:
start = time.time()
results = {}
initial = np.array([135, 4, 0.5])
for name in target_names:
    newsgroups_train = fetch_20newsgroups(subset='train', categories=[name])
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    test_mat = vectors
    test_mat.data = np.ones(test_mat.data.size)
    result_min = minimize(negloglik, initial, constraints = cons, 
                          bounds=((0, None), (0, None), (0, 1)), args=(test_mat), options={'disp':True})
    results[name]=result_min.x
elapsed = time.time() - start

[135.    4.    0.5]
[135.    4.    0.5]
[135.00000001   4.           0.5       ]
[135.           4.00000001   0.5       ]
[135.           4.           0.50000001]
[140.87695312  23.58984375   1.        ]
[135.58769531   5.95898438   0.55      ]
[135.05876953   4.19589844   0.505     ]
[135.02137298   4.07124327   0.50181837]
[135.02137298   4.07124327   0.50181837]
[135.021373     4.07124327   0.50181837]
[135.02137298   4.07124328   0.50181837]
[135.02137298   4.07124327   0.50181839]
[1.39033308e+02 1.44455115e-10 6.49645604e-01]
[135.42256648   3.66411894   0.5166011 ]
[135.14200623   3.9488267    0.50626333]
[135.14200623   3.9488267    0.50626333]
[135.14200624   3.9488267    0.50626333]
[135.14200623   3.94882672   0.50626333]
[135.14200623   3.9488267    0.50626334]
[4.58762752e+02 2.53748134e-11 9.62744757e-02]
[167.50408076   3.55394403   0.46526444]
[167.50408076   3.55394403   0.46526444]
[167.50408077   3.55394403   0.46526444]
[167.50408076   3.55394405   0.46526444]
[167.

[129.2160997   47.37460166   0.60047765]
[129.2160997   47.37460166   0.60047765]
[129.21609971  47.37460166   0.60047765]
[129.2160997   47.37460168   0.60047765]
[129.2160997   47.37460166   0.60047767]
[186.89581921  25.69363325   1.        ]
[134.98407165  45.20650482   0.64042989]
[134.98407165  45.20650482   0.64042989]
[134.98407167  45.20650482   0.64042989]
[134.98407165  45.20650484   0.64042989]
[134.98407165  45.20650482   0.6404299 ]
[114.76840943  41.92632681   0.72805238]
[125.11369098  43.60494434   0.68321193]
[125.11369098  43.60494434   0.68321193]
[125.11369099  43.60494434   0.68321193]
[125.11369098  43.60494436   0.68321193]
[125.11369098  43.60494434   0.68321195]
[126.2063072   40.88256169   0.70219225]
[126.2063072   40.88256169   0.70219225]
[126.20630721  40.88256169   0.70219225]
[126.2063072   40.8825617    0.70219225]
[126.2063072   40.88256169   0.70219226]
[132.42888694  30.09683709   0.73581986]
[132.42888694  30.09683709   0.73581986]
[132.42888696  3

[136.0336199    2.39657903   0.53492589]
[142.0238359    2.01023611   0.53965253]
[137.94401903   2.27336658   0.5364333 ]
[137.94401903   2.27336658   0.5364333 ]
[137.94401905   2.27336658   0.5364333 ]
[137.94401903   2.2733666    0.5364333 ]
[137.94401903   2.27336658   0.53643331]
[138.21072221   2.29441953   0.53569147]
[138.21072221   2.29441953   0.53569147]
[138.21072223   2.29441953   0.53569147]
[138.21072221   2.29441954   0.53569147]
[138.21072221   2.29441953   0.53569149]
[138.04505739   2.34868055   0.53434122]
[138.15925025   2.31127839   0.53527195]
[138.18736144   2.30207099   0.53550107]
[138.19872538   2.29834891   0.53559369]
[138.20421282   2.29655158   0.53563842]
[138.20421282   2.29655158   0.53563842]
[138.20421283   2.29655158   0.53563842]
[138.20421282   2.2965516    0.53563842]
[138.20421282   2.29655158   0.53563843]
[138.6876179    2.29525684   0.53517343]
[138.6876179    2.29525684   0.53517343]
[138.68761791   2.29525684   0.53517343]
[138.6876179    

  


[117.17505429   4.31307793   0.48894948]
[117.17505429   4.31307793   0.48894948]
[117.1750543    4.31307793   0.48894948]
[117.17505429   4.31307795   0.48894948]
[117.17505429   4.31307793   0.48894949]
[164.09327434   1.55334238   0.50311532]
[136.54666905   3.17363714   0.49479828]
[136.54666905   3.17363714   0.49479828]
[136.54666906   3.17363714   0.49479828]
[136.54666905   3.17363715   0.49479828]
[136.54666905   3.17363714   0.49479829]
[157.51865512   0.77333506   0.52543355]
[138.64386765   2.93360693   0.49786181]
[138.64386765   2.93360693   0.49786181]
[138.64386767   2.93360693   0.49786181]
[138.64386765   2.93360694   0.49786181]
[138.64386765   2.93360693   0.49786182]
[164.64583617   0.           0.53683382]
[141.2440645    2.64024624   0.50175901]
[141.2440645    2.64024624   0.50175901]
[141.24406452   2.64024624   0.50175901]
[141.2440645    2.64024625   0.50175901]
[141.2440645    2.64024624   0.50175902]
[147.38686156   1.51914623   0.52615256]
[142.62998367   

[133.64662079   3.54955286   0.51646609]
[133.64662079   3.54955286   0.51646609]
[133.64662081   3.54955286   0.51646609]
[133.64662079   3.54955288   0.51646609]
[133.64662079   3.54955286   0.51646611]
[137.74458078   0.           0.59445553]
[134.05641679   3.19459757   0.52426504]
[134.05641679   3.19459757   0.52426504]
[134.05641681   3.19459757   0.52426504]
[134.05641679   3.19459759   0.52426504]
[134.05641679   3.19459757   0.52426505]
[1.40945387e+02 2.57571742e-14 5.97724412e-01]
[134.74531384   2.87513782   0.53161097]
[134.74531384   2.87513782   0.53161097]
[134.74531386   2.87513782   0.53161097]
[134.74531384   2.87513783   0.53161097]
[134.74531384   2.87513782   0.53161099]
[1.91562322e-11 6.58013626e+00 7.30461906e-01]
[121.27078246   3.24563766   0.55149607]
[121.27078246   3.24563766   0.55149607]
[121.27078247   3.24563766   0.55149607]
[121.27078246   3.24563768   0.55149607]
[121.27078246   3.24563766   0.55149608]
[111.82319993   3.51042611   0.56006503]
[111

ValueError: gamma function pole

In [19]:
results

{'alt.atheism': array([135.02194122,   4.07062466,   0.50186607]),
 'comp.graphics': array([134.98689007,   3.79640476,   0.53419657]),
 'comp.os.ms-windows.misc': array([135.06401209,   4.99794911,   0.50014705]),
 'comp.sys.ibm.pc.hardware': array([134.90812173,   2.42461896,   0.536132  ]),
 'comp.sys.mac.hardware': array([134.90801614,   2.53364614,   0.5119352 ]),
 'comp.windows.x': array([135.11942325,   5.64897701,   0.48883374]),
 'misc.forsale': array([134.81017311,   2.09173238,   0.56024849]),
 'rec.autos': array([135.02805341,   4.74695094,   0.45907182]),
 'rec.motorcycles': array([135.03172669,   4.90538638,   0.46107375]),
 'rec.sport.baseball': array([135.06107267,   5.05585769,   0.40837893]),
 'rec.sport.hockey': array([176.96960424,   4.18530627,   0.42056266]),
 'sci.crypt': array([135.0956874 ,   5.16210435,   0.52163749]),
 'sci.electronics': array([134.99877406,   3.96828253,   0.48980101]),
 'sci.med': array([135.15995464,   6.13434138,   0.5073538 ]),
 'sci.spa

In [17]:
results

{'alt.atheism': array([292.82148128,  -2.69681198,   0.7389434 ]),
 'comp.graphics': array([292.81859876,  -2.69670819,   0.73907546]),
 'comp.os.ms-windows.misc': array([2.92918180e+02, 1.31562554e-01, 7.57205342e-01]),
 'comp.sys.ibm.pc.hardware': array([ 2.92914348e+02, -5.83919092e-02,  4.93323056e-01]),
 'comp.sys.mac.hardware': array([2.92921331e+02, 1.43906531e-01, 5.27901534e-01]),
 'comp.windows.x': array([292.81932603,  -2.69673454,   0.73908038]),
 'misc.forsale': array([292.84284553,  -1.9999775 ,   0.67340729]),
 'rec.autos': array([292.94509865,   0.82421851,   0.46874184]),
 'rec.motorcycles': array([292.94464438,   0.77593322,   0.47648937]),
 'rec.sport.baseball': array([ 292.87831269, -316.99322266,    0.73897987]),
 'rec.sport.hockey': array([292.82056589,  -2.69677923,   0.73912643]),
 'sci.crypt': array([292.82212489,  -2.69683495,   0.73905879]),
 'sci.electronics': array([ 2.92914007e+02, -7.03655525e-02,  4.92410146e-01]),
 'sci.med': array([292.82090464,  -2.69

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [45]:
test_mat = vectors

test_mat.data = np.ones(test_mat.data.size) # change all non-zero values to 1

col_sums = test_mat.sum(axis=0)

col_sums = np.asarray(col_sums)

col_sums = col_sums.flatten() # Col sums ie number of documents where a given word appears

col_sums = col_sums.astype(int) # careful as col_sums has float values not integers