In [22]:
import logging
import warnings

# Turn off annoying warning messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
warnings.filterwarnings("ignore", category=DeprecationWarning)

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as PLT
import numpy as NP
import pandas as PD
import pyLDAvis
import pyLDAvis.gensim
import re
import spacy

# Turn off SpaCy's parser and named-entity-recognition since we only need its POS tagger
nlp = spacy.load('en', disable=['parser', 'ner'])

from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from pprint import pprint


In [12]:
# NLTK's English stop-word list plus a few extra tokens to filter out
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

# Read the newsgroups JSON file from its remote URL into a DataFrame
df = PD.read_json(
    'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
)
# Show the distinct values found in the `target_names` column
print(df.target_names.unique())


['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


In [13]:
# Convert to list
data = df.content.values.tolist()

# Patterns are raw strings: '\S' and '\s' are not valid escapes in an ordinary
# string literal, so the non-raw form triggers an "invalid escape sequence"
# warning at compile time. Compiling once also avoids re-resolving the pattern
# for every document.
email_re = re.compile(r'\S*@\S*\s?')   # any run of non-space chars containing '@', plus one trailing whitespace char
whitespace_re = re.compile(r'\s+')     # runs of whitespace, including '\n' and '\r'

# Clean every document in a single pass. The order matters and matches the
# original three passes: remove email addresses, collapse whitespace runs to a
# single space, then remove single quotes.
data = [
    whitespace_re.sub(' ', email_re.sub('', sent)).replace("'", "")
    for sent in data
]

pprint(data[:1])


['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [14]:
def sentToTokens(sents):
    """Lazily tokenize every item of `sents`.

    Each item is coerced with `str()` and passed to
    `gensim.utils.simple_preprocess(..., deacc=True)`; whatever that call
    returns is yielded, one result per input item, in input order.

    Note: per the gensim documentation, `deacc=True` removes accent marks from
    tokens; punctuation is discarded by the tokenizer itself regardless of
    this flag.
    """
    tokenized = (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sents
    )
    yield from tokenized


# Materialize the token generator so the corpus can be indexed and reused
data_tokens = [*sentToTokens(data)]
# Peek at the first tokenized document
print(data_tokens[:1])


[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [16]:
def removeStopwords(texts):
    """Return the tokens of each document in `texts` with stop words removed.

    Args:
        texts: iterable of documents. Each document is converted with `str()`
            and tokenized with `simple_preprocess` before filtering.

    Returns:
        A list with one list of tokens per document, in input order, keeping
        only tokens that are not in the module-level `stop_words`.

    NOTE(review): when a document is already a list of tokens, `str(doc)` is
    the list's repr, which is then tokenized again. That looks unintended but
    is kept so raw strings and token lists are both still accepted; confirm.
    """
    # Snapshot the stop words into a set once per call: set membership is
    # O(1), whereas testing against the `stop_words` list scans it for
    # every single token.
    blocked = frozenset(stop_words)
    return [
        [w for w in simple_preprocess(str(doc)) if w not in blocked]
        for doc in texts
    ]


# Filter the stop words out of every tokenized document
data_tokens_clean = removeStopwords(data_tokens)


In [17]:
# Bigram model: fit phrase detection on the tokenized corpus, then wrap the
# fitted model in a Phraser. A higher threshold results in fewer phrases.
bigrammer_init = gensim.models.Phrases(
    data_tokens,
    min_count=5,
    threshold=100,
)
bigrammer = gensim.models.phrases.Phraser(bigrammer_init)

# Trigram model: fit a second phrase model on the bigram-transformed corpus,
# then wrap it the same way.
trigrammer_init = gensim.models.Phrases(
    bigrammer_init[data_tokens],
    threshold=100,
)
trigrammer = gensim.models.phrases.Phraser(trigrammer_init)

def makeBigrams(texts):
    """Apply the module-level `bigrammer` to every document in `texts`.

    Returns a list holding `bigrammer[doc]` for each document, in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigrammer[tokens])
    return merged


def makeTrigrams(texts):
    """Apply `bigrammer` and then `trigrammer` to every document in `texts`.

    Returns a list holding `trigrammer[bigrammer[doc]]` for each document,
    in input order. Both models are read from module scope.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigrammer[tokens]
        merged.append(trigrammer[with_bigrams])
    return merged


# Merge detected bigrams in the stop-word-filtered documents and show the first one.
# NOTE(review): `bigrammer` was fit on `data_tokens` (stop words still present)
# but is applied here to `data_tokens_clean` — confirm this mismatch is intended.
data_bigrams = makeBigrams(data_tokens_clean)
print(data_bigrams[:1])


[['wheres', 'thing', 'car', 'nntp_posting', 'host', 'rac_wam', 'umd', 'organization', 'university', 'maryland_college', 'park', 'lines', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'door', 'sports', 'car', 'looked', 'late', 'early', 'called', 'bricklin', 'doors', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst']]


In [23]:
def lemmatize(texts, allowed_pos=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of documents, each an iterable of string tokens. The
            tokens of a document are joined with single spaces and run
            through the module-level spaCy pipeline `nlp`.
        allowed_pos: container of coarse POS tags (compared against
            `tok.pos_`) to keep. The default is an immutable tuple so it
            cannot be altered between calls; any container supporting `in`
            is accepted, as before.

    Returns:
        A list with one list of lemmas (`tok.lemma_`) per input document,
        in input order.
    """
    # Feed the joined documents as a lazy stream; `nlp.pipe` yields one Doc
    # per text, in order, and processes them in batches instead of invoking
    # the pipeline once per document.
    joined = (" ".join(sent) for sent in texts)
    return [
        [tok.lemma_ for tok in doc if tok.pos_ in allowed_pos]
        for doc in nlp.pipe(joined)
    ]


# Lemmatize the bigram-merged documents with the default POS filter and show the first one
data_lemmas = lemmatize(data_bigrams)
print(data_lemmas[:1])


[['where', 's', 'thing', 'car', 'nntp_poste', 'host', 'umd', 'organization', 'university', 'maryland_college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]
