# Imports

In [71]:
import sklearn as sk
from sklearn import datasets as data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import calm
import gensim as gs
import os

In [75]:
# Root of the project checkout; the relative config/ and data/ paths used later
# in the notebook are resolved against it. Set TEXTMINING_PROJECT_DIR to run
# from a different checkout; the original location remains the fallback.
project_dir = os.environ.get('TEXTMINING_PROJECT_DIR', '/home/matt/Git/TextMiningFinal/')
os.chdir(project_dir)

In [86]:
# Both paths are relative, so they resolve against the current working directory.
# Destination of the MALLET-format corpus dump written later in the notebook.
newsgroups_mallet_outfile = 'data/20newsgroups_mallet'
# Configuration file handed to the calm text processor.
newsgroups_processor_config = 'config/20_newsgroups.yml'

## The newsgroups data, cleaned and munged, all in one line

In [2]:
# Load the 20-newsgroups posts, asking sklearn to drop each post's header and
# footer blocks.
newsgroups = data.fetch_20newsgroups(
    remove=('headers', 'footers'),
)

In [None]:
# help() takes the object to document as a positional argument; it rejects
# arbitrary keyword arguments with a TypeError.
help(data.fetch_20newsgroups_vectorized)

In [3]:
# Check what kind of container fetch_20newsgroups handed back.
type(newsgroups)

sklearn.datasets.base.Bunch

In [4]:
# The Bunch is dict-like, so iterating it yields its keys: the available fields.
list(newsgroups)

['target_names', 'description', 'filenames', 'data', 'target', 'DESCR']

In [5]:
# Count the documents that were loaded.
len(newsgroups['data'])

11314

In [6]:
# Peek at the first document's text to sanity-check the loaded data.
newsgroups['data'][0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'


## The reuters data

In [None]:
# if you don't have the Reuters corpus installed, run: import nltk; nltk.download('reuters')
from nltk.corpus import reuters

## Instantiate a text processor object and a token counter

In [81]:
# Build the text processor from the config path stored in newsgroups_processor_config.
processor = calm.processor.Processor(newsgroups_processor_config)

In [82]:
# Exercise the processor on a sample containing the token shapes expected in the
# 20-newsgroups corpus: e-mail addresses, @handles, file names, numbers,
# elongated words and newsgroup names.
sample_text = 'here is a sentence me@site.com @whoever file_name 1123.1 23423kjl3 looooong worddd salt.atheism alt.atheism alt-some_gasdfasroup'
processor.process(sample_text)

['here',
 'is',
 'a',
 'sentence',
 '_email_',
 '_username_',
 'filename',
 '_num_',
 '_num_',
 'kjl',
 '_num_',
 'loong',
 'wordd',
 'salt',
 'atheism',
 '_group_',
 '_group_']

## The count vectorizer

In [83]:
# store the stopwords list to pass to the sklearn count vectorizer; the processor isn't configured to remove them
smart_stopwords = list(processor.stopwords)


def newsgroups_tokenize(string):
    """Tokenize one raw document with the shared ``processor``.

    This is the tokenizer callable handed to the count vectorizer. It is a
    named function rather than a lambda bound to a name, so it appears under
    its own name in reprs and tracebacks.

    Parameters
    ----------
    string : str
        Raw text of a single document.

    Returns
    -------
    Whatever ``processor.process(string)`` returns (a list of token strings
    in the sample shown earlier in the notebook).
    """
    return processor.process(string)

In [27]:
# Show the CountVectorizer documentation for reference before constructing one.
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(sklearn.base.BaseEstimator, VectorizerMixin)
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.coo_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw content to analyze.
 |  
 |      If 'file', the sequence items must have a 'read' method (file-like
 |      object) that is called to fetc

In [84]:
# instantiate the vectorizer: unigram word counts, tokenized by the custom
# processor-backed tokenizer, with the stopword list passed to sklearn
counter = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    tokenizer=newsgroups_tokenize,
    stop_words=smart_stopwords,
)

# Tokenize the corpus

In [43]:
# fit the counter (learn the vocab) on the raw newsgroup posts; fit() hands back
# the vectorizer, which is why its repr is displayed as the cell output
counter.fit(newsgroups['data'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['example', 'furthermore', 'were', 'different', 'as', 'aside', 'never', 'presumably', 'onto', 'despite', 'thus', 'meanwhile', 'under', 'himself', 'her', 'across', 'on', "we'll", 'hardly', 'ourselves', 'name', 'and', "hadn't", 'over', 'q', 'ie', 'next', 'formerly', 'will', 'seeming', 'e', ...', 'more', 'off', 'certainly', 'enough', 'below', "we'd", 'less', 'probably', 'help', 'wish', 'you'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function <lambda> at 0x7f00a4a02a60>, vocabulary=None)

In [52]:
# transform() expects an iterable of documents, so the single sentence is
# wrapped in a list; a bare string is not treated as one document.
type(counter.transform(['here is a sentence']))

scipy.sparse.csr.csr_matrix

## The Gensim way

In [62]:
print("\nBuilding corpus:")
# Snapshot the stopwords into a set once, instead of re-reading the processor
# attribute (and possibly scanning a list) for every token.
stopword_set = frozenset(processor.stopwords)
# One token list per document: process the raw text, then drop stopwords.
corpus = [
    [token for token in processor.process(text) if token not in stopword_set]
    for text in newsgroups['data']
]

print(str(len(corpus)) + " documents in initial corpus.")



Building corpus:
11314 documents in initial corpus.


In [63]:
print("\nGenerating corpus-wide dictionary:")
# Build the gensim dictionary over the tokenized corpus.
id2word = gs.corpora.Dictionary(corpus)
n_terms = len(id2word)
print(str(n_terms) + " initial terms in the dictionary.")



Generating corpus-wide dictionary:
78790 initial terms in the dictionary.


In [88]:
def write_MALLET_corpus(list_of_tokenized_docs,output_file):
    """Write tokenized documents to a file, one document per line.

    Each output line is ``<doc index> NA <space-joined tokens>``: the
    document's position in the input serves as its id and ``NA`` fills the
    label field of MALLET's ``[id] [label] [text]`` line format.

    Parameters
    ----------
    list_of_tokenized_docs : iterable of iterable of str
        One token sequence per document.
    output_file : str
        Path of the file to create; an existing file is overwritten.

    Notes
    -----
    The file is written as UTF-8 explicitly so that non-ASCII tokens do not
    depend on (or fail under) the platform's default encoding.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(
            str(doc_id) + ' NA ' + ' '.join(tokens) + '\n'
            for doc_id, tokens in enumerate(list_of_tokenized_docs)
        )

In [89]:
# Dump the stopword-filtered corpus to disk in MALLET format.
write_MALLET_corpus(
    list_of_tokenized_docs=corpus,
    output_file=newsgroups_mallet_outfile,
)

In [58]:
# gensim is imported under the alias `gs`, and help() should receive the class
# itself: MalletCorpus requires a file name, so calling it with no arguments
# would fail before any help text is shown.
help(gs.corpora.MalletCorpus)

Help on class MalletCorpus in module gensim.corpora.malletcorpus:

class MalletCorpus(gensim.corpora.lowcorpus.LowCorpus)
 |  Quoting http://mallet.cs.umass.edu/import.php:
 |  
 |      One file, one instance per line
 |      Assume the data is in the following format:
 |  
 |      [URL] [language] [text of the page...]
 |  
 |  Or, more generally,
 |      [document #1 id] [label] [text of the document...]
 |      [document #2 id] [label] [text of the document...]
 |      ...
 |      [document #N id] [label] [text of the document...]
 |  
 |  Note that language/label is *not* considered in Gensim.
 |  
 |  Method resolution order:
 |      MalletCorpus
 |      gensim.corpora.lowcorpus.LowCorpus
 |      gensim.corpora.indexedcorpus.IndexedCorpus
 |      gensim.interfaces.CorpusABC
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, fname, id2word=None, metadata=False)
 |      Initialize the corpus from a file.
 |      
 |      `id2