## Tokenizing and vectorizing words, training NMF topic model

In [1]:
import numpy as np
 
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.decomposition import NMF
 
# Load the article corpus. Later cells read its 'contents' column (the text that is
# vectorized) and its 'title' column (printed when listing similar articles).
data = pd.read_csv('topics.csv')
import string
import unidecode
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import re

# Lazily-filled cache of the objects customtokenizer needs. The vectorizer calls the
# tokenizer once per document, so these are built on first use and then reused
# instead of being rebuilt for every article.
_TOKENIZER_RESOURCES = {}


def _get_tokenizer_resources():
    """Build the translation table, token regex, stop-word set and stemmer once; return the cache."""
    if not _TOKENIZER_RESOURCES:
        # All values are evaluated before update() runs, so a failure (e.g. a missing
        # NLTK corpus) leaves the cache empty and the next call retries.
        _TOKENIZER_RESOURCES.update(
            strip_table=str.maketrans('', '', string.punctuation + string.digits),
            token_re=re.compile(r'(?u)\b\w\w+\b'),
            # frozenset => O(1) membership test per token instead of scanning a list
            stop_words=frozenset(stopwords.words('english')),
            stemmer=SnowballStemmer('english'),
        )
    return _TOKENIZER_RESOURCES


def customtokenizer(article):
    """Turn raw article text into a list of stemmed, stop-word-free tokens.

    Steps, in order:
      1. delete ASCII punctuation and digits,
      2. transliterate the remaining text to ASCII with ``unidecode``,
      3. lowercase,
      4. keep runs of two or more word characters,
      5. drop NLTK English stop words,
      6. stem each surviving token with the English Snowball stemmer.

    Parameters
    ----------
    article : str
        Text of a single document.

    Returns
    -------
    list of str
        Stemmed tokens in document order; an empty list when nothing survives.
    """
    resources = _get_tokenizer_resources()
    stop_words = resources['stop_words']
    stemmer = resources['stemmer']

    # NOTE: punctuation is deleted rather than replaced by a space, so a hyphenated
    # word such as "illegally-imported" is fused into one token.
    cleaned = unidecode.unidecode(article.translate(resources['strip_table'])).lower()

    return [
        stemmer.stem(token)
        for token in resources['token_re'].findall(cleaned)
        if token not in stop_words
    ]

In [2]:
# TF-IDF over the article text, tokenized by customtokenizer and limited to the 2000
# most frequent terms that occur in at least 50 documents and in at most 75% of them.
# NOTE(review): customtokenizer already removes stop words before stemming.
# scikit-learn applies `stop_words` to the tokenizer's output, so here it only drops
# stems that happen to equal a stop word. It is also the likely source of the
# "stop_words may be inconsistent with your preprocessing" warning. It is kept so the
# fitted vocabulary stays the same.
vectorizer = TfidfVectorizer(tokenizer=customtokenizer,max_features=2000, min_df=50,max_df=.75, stop_words=stopwords.words('english'))

# Sparse document-term matrix: one row per article, one column per vocabulary term.
X = vectorizer.fit_transform(data['contents'])

# Column index -> term lookup. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2, so prefer get_feature_names_out() and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    idx_to_word = np.asarray(vectorizer.get_feature_names_out())
else:
    idx_to_word = np.array(vectorizer.get_feature_names())

  sorted(inconsistent))


In [3]:
# Factorize the TF-IDF matrix into 25 topics (X ~= W @ H) with multiplicative updates.
# random_state is pinned so the factorization, and therefore the topic numbering used
# by the cells below, is reproducible after a kernel restart.
nmf = NMF(n_components=25, solver="mu", random_state=0)

W = nmf.fit_transform(X)  # document-topic weights, one row per article

H = nmf.components_  # topic-term weights, one row per topic

# print the topics: the 20 highest-weighted stems of each topic, in ascending weight
# order (the strongest term is printed last). Topics are numbered from 1.
for topic_number, topic in enumerate(H, start=1):
    top_terms = idx_to_word[topic.argsort()[-20:]]
    print("Topic {}: {}".format(topic_number, ",".join(str(term) for term in top_terms)))

Topic 1: plea,pay,special,admit,also,fbi,judg,conspiraci,prosecut,count,agent,charg,plead,year,offic,prison,trial,guilti,crimin,sentenc
Topic 2: fraudul,client,websit,expens,deduct,complaint,credit,revenu,claim,refund,file,custom,busi,fals,injunct,incom,ir,return,prepar,tax
Topic 3: miami,clinic,florida,medicaid,therapi,servic,kickback,fraudul,hhs,home,strike,beneficiari,bill,patient,hhsoig,medic,fraud,care,health,medicar
Topic 4: said,equal,prohibit,tenant,harass,apart,alleg,race,complaint,act,settlement,disabl,famili,lawsuit,hud,civil,right,fair,discrimin,hous
Topic 5: launch,via,video,epidem,apprehend,rescu,internet,obscen,abus,minor,imag,ceo,safe,project,children,childhood,sexual,exploit,pornographi,child
Topic 6: complianc,million,natur,consent,site,reduc,compani,decre,requir,environ,facil,plant,water,clean,environment,emiss,settlement,pollut,air,epa
Topic 7: kill,join,syria,new,york,provid,attempt,isi,state,fbi,organ,secur,nation,travel,materi,terror,attack,support,isil,terrorist

## Transforming test text and investigating properties

In [4]:
# A one-document "corpus" holding the free-text query, run through the vectorizer
# that was fitted on the articles (transform only, no refit).
text = ['environment ships illegal trafficking sea']
test = vectorizer.transform(text)

In [5]:
# Project the query's TF-IDF vector onto the fitted NMF topics (no refit).
nm = nmf.transform(test)

In [6]:
# Position of the query's highest topic weight. This index is 0-based, whereas the
# topic listing prints "Topic i + 1", so index k corresponds to "Topic k + 1" there.
nm.argmax()

23

In [7]:
# Display the query's full topic-weight row (one value per NMF component).
nm

array([[7.44948350e-015, 0.00000000e+000, 0.00000000e+000,
        7.75194048e-039, 4.04475122e-092, 1.53458460e-004,
        1.84029265e-044, 0.00000000e+000, 9.29379432e-030,
        5.25763840e-037, 6.54008190e-019, 7.91130011e-018,
        0.00000000e+000, 0.00000000e+000, 6.31076301e-005,
        1.03212432e-138, 1.00559678e-022, 4.53067523e-029,
        1.39163509e-003, 4.00158395e-017, 3.22841249e-011,
        5.34503655e-002, 4.27736713e-100, 7.30765673e-002,
        3.45834323e-038]])

In [8]:
# Shape of the document-topic matrix returned by nmf.fit_transform(X): (rows of X, n_components).
W.shape

(13081, 25)

In [9]:
# Summary repr of the TF-IDF matrix produced by vectorizer.fit_transform (does not print its contents).
X

<13081x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1820303 stored elements in Compressed Sparse Row format>

## Printing articles most similar to test text by cosine similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the query's topic weights (nm) with every article's topic weights (rows of W).
similarities = cosine_similarity(nm, W)[0]

# argsort() is ascending, so the best matches sit at the END of the ranking.
# Use [-15:], not [-15:-1]: ending the slice at -1 would drop the single most similar
# article (the query is not a row of W, so there is no self-match to exclude).
# Titles are printed in ascending similarity; the closest article comes last.
for row in similarities.argsort()[-15:]:
    # Rows of W follow the order of data['contents'], so look the title up by position.
    print(data['title'].iloc[row])

California Man Sentenced to One Year in Prison for Illegal Sale of Black Rhinoceros Horns
Maine Fisherman Sentenced for Illegally Trafficking American Eels
New York Antiques Dealer Sentenced to 37 Months in Prison for Wildlife Smuggling
Statements of Associate Attorney General Tony West and Acting Assistant Attorney General of Enrd on the National Strategy for Combatting Wildlife Trafficking
Tennessee Men Plead Guilty to Illegally Trafficking Narwhal Tusks
Massachusetts Antique Dealer Sentenced to 33 Months in Prison for Trafficking in 
Illegally-Imported Narwhal Tusks and Sperm Whale Teeth
Irish National Sentenced to Serve 14 Months in Prison  for Trafficking of Endangered Rhinoceros Horns
Long Island Man Pleads Guilty to Trafficking in Rhinoceros Horns 
Antiques Dealer Sentenced in Manhattan to Two Years in Prison for Smuggling Cups Made from Rhinoceros Horns
Canadian Antiques Dealer Sentenced to 30 Months in Prison for Smuggling Rhinoceros Horns, Elephant Ivory and Coral
Foreign Nat