In [None]:
import pandas as pd
import numpy as np
import os
import re
import spacy as pac
from fuzzywuzzy.process import dedupe as fuzzed

#create new dataframe from preprocessed data (pipe-delimited export; only the
#columns used by the rest of the notebook are read)
df = pd.read_csv("C:\\datasources\\fullraw.csv", sep='|', usecols=['ArticleId', 'tags', 'Title', 'quote', 'description', 'Links', 'cpub', 'PublicationDate', 'note', 'wtkURL', 'Priority', 'Summary_ref_links', 'Note_links'], encoding='utf-8')
#newest articles first; later cells attach new columns positionally via .values
df = df.sort_values(by='PublicationDate', ascending=False)

#auto-load the SpaCy english lang statistical model
nlp = pac.load('en')

#testdoc = "This string tests if Willie Nelson could have been detected in Florida by a Washington D.C. lobbyist for Monsanto in 2003."
#doc = nlp(testdoc)

#combine summary and note text to make corpus for entity recognition.
#Missing values are blanked first: str(NaN) is the literal text "nan", and a NaN
#description would otherwise turn the whole concatenation into NaN, so the NER
#model would be fed the word "nan" instead of the real (or empty) text.
df['ents'] = (
    df['description'].fillna('').astype(str)
    + " "
    + df['note'].fillna('').astype(str)
).str.strip()

In [None]:
#Entity recognition pass: one spaCy parse per row of df['ents'], collecting the
#distinct (entity text, entity label) pairs of each row into its own list.
#Original author's timing note: 10-12min runtime with 9255 paragraphs; slow, but
#lambda apply strategies take much longer.
docs = []
for paragraph in df['ents'].astype(str).values:
    parsed = nlp(paragraph)
    #a set drops repeated mentions of the same (text, label) pair within a row
    distinct_pairs = {(ent.text, ent.label_) for ent in parsed.ents}
    docs.append(list(distinct_pairs))

#sanity check: there should be exactly one result list per dataframe row
len(docs)

In [None]:
#clean up NER results list with loops and regex, then apply fuzzy deduplication. takes a minute to run on 180K ents in 9255 items
#fuzzywuzzy dedupe is finicky about whitespace and special characters
def _tidy_label(text, fallback):
    """Return `text` reduced to letters, digits, spaces, commas, apostrophes and periods.

    Characters outside that set are removed first and whitespace is collapsed
    afterwards, so removing a character can never leave doubled, leading or
    trailing spaces behind. If fewer than two characters survive, `fallback`
    is returned instead.
    """
    text = re.sub("[^a-zA-Z0-9 ,'.]+", "", text)
    text = ' '.join(text.split())
    return text if len(text) > 1 else fallback


ThingsnKinds = []
for pairs in docs:
    things = []
    kinds = []
    for thing, kind in pairs:
        things.append(_tidy_label(thing, 'Nothing'))
        kinds.append(_tidy_label(kind, 'ofNote'))
    #entity text -> entity type; if the same text occurs twice the later type wins
    magic = dict(zip(things, kinds))
    #fuzzy-deduplicate the entity names of this row
    deduped = list(fuzzed(things))
    tnk = {k: magic[k] for k in deduped}
    #store the deduplicated mapping (previously the un-deduplicated `magic` was
    #appended, so the dedupe result was computed and then thrown away)
    ThingsnKinds.append(tnk)

#check to make sure results list is the same length as dataframe
len(ThingsnKinds)

In [None]:
#Attach both NER result sets to the dataframe (raw pairs and cleaned mapping),
#then write the selected columns to a pipe-delimited file.
nod = pd.Series(docs)
nodmod = pd.Series(ThingsnKinds)
#.values assigns by position, so the dataframe's own index is ignored
for column_name, results in (('NERsMod', nodmod), ('NERs', nod)):
    df[column_name] = results.values

ner_export_columns = [
    'Title', 'tags', 'quote', 'description', 'Links', 'cpub',
    'PublicationDate', 'wtkURL', 'ArticleId', 'Priority',
    'Summary_ref_links', 'Note_links', 'NERs', 'NERsMod',
]
df[ner_export_columns].to_csv(
    "C:\\datasources\\totsNER.csv",
    sep='|',
    index=False,
    encoding='utf-8',
)

In [None]:
#use SpaCy to tokenize and lemmatize
from collections import Counter as cntr

#tokenizer/lemmatizer-only pipeline.
#NOTE: this rebinds `nlp` to a pipeline with NER disabled, so re-running the
#entity recognition cell after this one would find no entities.
nlp = pac.load("en", disable=['parser', 'tagger', 'ner'])

#clean up text for processing
#missing titles/descriptions are blanked so they do not enter the corpus as the
#literal word "nan"
df['corpus'] = df['Title'].fillna('').astype(str) + " " + df['description'].fillna('').astype(str)

scrubbed = []
for text in df['corpus'].values:
    text = re.sub("-", " ", text)             #hyphenated words become separate words
    text = re.sub("'s", "", text)             #drop possessive endings
    text = re.sub("[^a-zA-Z ]+", "", text)    #keep ASCII letters and spaces only
    text = ' '.join(text.split()).lower()     #collapse whitespace, lowercase
    doc = nlp(text)
    #lemma of every token that is not a stop word
    scrubbed.append([token.lemma_ for token in doc if not token.is_stop])

lems = pd.Series(scrubbed)
df['lemmalist'] = lems.values

#space-separated lemma string per row, built directly from the token lists
#(no round trip through str(list) and regex scrubbing of brackets/quotes/commas)
ugrams = [' '.join(' '.join(lemmas).split()) for lemmas in scrubbed]
ugr = pd.Series(ugrams)
df['unigrams'] = ugr.values
df[['Title', 'corpus', 'lemmalist', 'unigrams', 'NERsMod']].head(3)

#send to file
#df.to_csv("C:\\datasources\\totsLEMS.csv", sep='|', encoding='utf-8')

In [None]:
#generate ngrams and their frequencies
def _ngrams(tokens, n):
    """Return every run of `n` consecutive tokens, each joined by a single space.

    A sequence of k tokens yields k - n + 1 n-grams (none when k < n).
    """
    return [' '.join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]


#per-row tallies (one dict per dataframe row)
utallies = []
btallies = []
ttallies = []
#corpus-wide totals, accumulated across all rows
utots = cntr()
btots = cntr()
ttots = cntr()
for text in df['unigrams'].values:
    un = text.split()
    #the previous ranges (len-2 / len-3) dropped the last bigram and trigram
    bi = _ngrams(un, 2)
    tri = _ngrams(un, 3)
    utally = cntr(un)
    btally = cntr(bi)
    ttally = cntr(tri)
    #add this row's counts to the running totals; the bigram/trigram totals used
    #to be reset to 1 whenever an n-gram first appeared in a new row
    utots.update(utally)
    btots.update(btally)
    ttots.update(ttally)
    #store plain dicts so the column contents keep their original type and repr
    ttallies.append(dict(ttally))
    btallies.append(dict(btally))
    utallies.append(dict(utally))

utots = dict(utots)
btots = dict(btots)
ttots = dict(ttots)

utal = pd.Series(utallies)
btal = pd.Series(btallies)
ttal = pd.Series(ttallies)
df['UGC'] = utal.values
df['BGC'] = btal.values
df['TGC'] = ttal.values
#one row per n-gram, single column holding its corpus-wide count
ugramTOTS = pd.DataFrame.from_dict(utots, orient='index')
bigramTOTS = pd.DataFrame.from_dict(btots, orient='index')
trigramTOTS = pd.DataFrame.from_dict(ttots, orient='index')

#send to file
#df.to_csv("C:\\datasources\\newsNLPready.csv", sep='|', encoding='utf-8')

In [None]:
#entities identified by SpaCy need to be cleaned up and their frequencies determined
#
#Each NERsMod value is handled in its text form, e.g. "{'Monsanto': 'ORG', 'Florida': 'GPE'}".
#Two dict items are separated by a comma+space that sits between a closing quote and
#an opening quote; splitting only there keeps entity names that themselves contain
#", " (e.g. 'Washington, D.C.') in one piece instead of cutting them in half.
_ITEM_SEP = re.compile(r"""(?<=['"]), (?=['"])""")

allents = []
allenttypes = []
entcounts = {}

for i in df['NERsMod'].astype(str).values:
    testk = []
    testv = []
    body = i.replace("{", "").replace("}", "")
    for d in _ITEM_SEP.split(body):
        #commas inside an entity name are dropped
        d = d.replace(",", "")
        #count how many rows mention this "'entity': 'type'" item
        entcounts[d] = entcounts.get(d, 0) + 1
        parts = d.split(":")
        testk.append(parts[::2])    #text before the colon (entity)
        testv.append(parts[1::2])   #text after the colon (type)
    allents.append(testk)
    allenttypes.append(testv)

tsk = pd.Series(allents)
ets = pd.Series(allenttypes)

df['EN'] = tsk.values
df['ETYPES'] = ets.values

#flattened text version of the entity lists: square brackets and double quotes
#removed; single quotes are left in place
new = []
for it in df['EN'].astype(str).values:
    it = it.replace("[", "").replace("]", "")
    it = it.replace("\"", "")
    new.append(it)

tst = pd.Series(new)
df['tst'] = tst.values

#Entity frequency table: one row per "'entity': 'type'" key of entcounts.
ENTdf = pd.DataFrame.from_dict(entcounts, orient='index')
ENTdf = ENTdf.rename(columns={ENTdf.columns[0]: "Count"})
ENTdf['raws'] = ENTdf.index
ENTdf[['Entity', 'E_Type']] = ENTdf['raws'].str.split(':', expand=True)


def _remove_fragments(value, fragments):
    """Delete every occurrence of each fragment from `value`, in the order given."""
    for fragment in fragments:
        value = value.replace(fragment, '')
    return value


#strip the quoting left over from the dict text, plus articles/demonstratives
ENTdf['E_Type'] = ENTdf['E_Type'].astype(str).map(lambda raw: _remove_fragments(raw, ('\'',)))
ENTdf['Entity'] = ENTdf['Entity'].astype(str).map(
    lambda raw: _remove_fragments(raw, ('\'', 'this ', 'the ', 'The ', '\"'))
)
ENTdf = ENTdf.sort_values(by='Count', ascending=False)

#save the entity types of interest and drop the rest
#(substring matching rather than isin: the type text still carries the space
#that followed the colon, so an exact comparison would not match)
droptype = ['CARDINAL', 'ORDINAL', 'DATE', 'LOC']
#find min max frequencies for best results and drop the rest
focustype = ['PERSON', 'ORG']

rows_to_keep = (
    ~ENTdf['E_Type'].str.contains('|'.join(droptype))
    & ~ENTdf['Entity'].str.contains('Nothing')
    & ENTdf['E_Type'].str.contains('|'.join(focustype))
    & ENTdf['Count'].gt(4)
    & ENTdf['Count'].lt(74)
)
ENTdf = ENTdf[rows_to_keep]

#a bit more cleaning and save orgs to file
orgDF = ENTdf[~ENTdf['E_Type'].str.contains('PERSON')]
#entities to exclude; any Entity that CONTAINS one of these terms is dropped
#NOTE(review): 'the ' / 'The ' are removed from Entity before this point, so
#'the state' and 'The Post' look like they can no longer match - confirm intent.
NAo = ['AP', 'CBS', 'UFO Information Center', 'Washington Post', 'Al Jazeera', 'ATM', 'AFP', 'Aftergood', 'Aftershock', 'mercola.com', 'banks', 'the state', 'cooperatives', 'pre911', 'post911', 'the state', 'Harvey', 'Capitol Hill', 'Fox News', 'ABC', 'Federal', 'Foundation', 'The Post', 'Standard Poors', 'Energy']
#escape each term so it is matched literally (the '.' in 'mercola.com' would
#otherwise act as a regex wildcard)
org_exclusions = '|'.join(re.escape(term) for term in NAo)
orgDF = orgDF[~orgDF['Entity'].str.contains(org_exclusions)]
#sort into a new frame; an in-place sort on a filtered slice can raise
#SettingWithCopyWarning
orgDF = orgDF.sort_values(by='Entity')

orgDF[['Entity', 'Count']].to_csv("C:\\datasources\\identorgs.csv", sep='|', encoding='utf-8')

#a bit more cleaning and save people to file
#entities to exclude; any Entity that CONTAINS one of these terms is dropped
NAp = ['Explore', 'Spies', 'Lords', 'clients', 'kids', 'Anonymous', 'Armageddon', 'WiFi', 'Julia', 'Young', 'Black', 'Kerry', 'mm', 'deception10pg']
pplDF = ENTdf[~ENTdf['E_Type'].str.contains('ORG')]
#escape each term so the alternation always matches the terms literally, even
#if a term containing regex metacharacters is added later
people_exclusions = '|'.join(re.escape(term) for term in NAp)
pplDF = pplDF[~pplDF['Entity'].str.contains(people_exclusions)]
#sort into a new frame; an in-place sort on a filtered slice can raise
#SettingWithCopyWarning
pplDF = pplDF.sort_values(by='Entity')

pplDF[['Entity', 'Count']].to_csv("C:\\datasources\\identpeople.csv", sep='|', encoding='utf-8')