# NLP: Final Project

### By: Matt Norgren

### Instructions

The news articles are related to either Chicago and / or Illinois.

    - Detect major topics
    - Identify top reasons for population decline (negative sentiment)
    - Suggest corrective actions
    - Demonstrate how the city / state can attract new businesses (positive sentiment)
    - Leverage appropriate NLP techniques to identify organizations and people and apply targeted sentiment
        - Why businesses should stay in IL or move into IL?
            - Create appropriate visualization to summarize your recommendations (i.e. word cloud chart or bubble chart)
        - Why residents should stay in IL or move into IL?
            - Create appropriate visualization to summarize your recommendations (i.e. word cloud chart or bubble chart)

### Import Packages

In [1]:
## Import standard packages for cleaning data/ preparation
import pandas as pd
import numpy as np
import json, glob, pickle, re, sys, string
from __future__ import division
## Standard logic for iteration
import itertools
from itertools import combinations
## Base computing and graphing
import matplotlib.pyplot as plt
import seaborn as sns
## Standard logic for iteration 
from collections.abc import Iterable
from collections import Counter
## Ability to confirm directories and establish random variables
from os import path, mkdir
import random, warnings, tqdm, math
#warnings.simplefilter('ignore')
## Setup for multiprocessing LDA / check for how many cores present
import multiprocessing
num_processors = multiprocessing.cpu_count()
print("Number of cores present:",num_processors)
print(sys.version)
warnings.filterwarnings("ignore", category=DeprecationWarning)

Number of cores present: 8
3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 15:50:57) 
[Clang 11.0.1 ]


In [2]:
## Summarization
import summa

In [3]:
## NLTK, TextBlob, gensim set for LDA
import nltk as nltk 
from nltk.text import Text
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
##
nltk.data.path.append('/Users/man/Python/MiniForge/nltk_data/')
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
## Keyword extraction
from rake_nltk import Rake
##
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.test.utils import common_texts, datapath
from gensim.corpora.dictionary import Dictionary
import pyLDAvis.gensim
##
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
#TextBlob(sentence).sentiment
##
from langdetect import detect

In [4]:
## Sentiment pre-classifier // Labeling
import snorkel
#from snorkel.labeling.model import LabelModel
#from snorkel.labeling import PandasLFApplier

  and should_run_async(code)


In [5]:
## 
from tqdm.auto import tqdm
tqdm.pandas()
from tqdm.notebook import tqdm, trange
import time  

  and should_run_async(code)


In [6]:
## Spacy based packages and methods -- related to 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import spacy 
from spacy.kb import KnowledgeBase
from spacy import displacy
from spacy.util import minibatch, compounding
from joblib import Parallel, delayed

  and should_run_async(code)


In [7]:
## Sentiment Analysis based packages
import sklearn, wordcloud, eli5
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from joblib import dump, load
from urllib.request import urlretrieve
import urllib

In [8]:
## Keras and Tensorflow
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Convolution1D, Flatten, LeakyReLU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras import callbacks
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import model_to_dot

## Executive Summary of Actions

### Import Data

In [10]:
## Import data utilizing JSON method
df = pd.read_json('/Users/man/OneDrive - The University of Chicago/Prior_Quarters/2021_Winter/NLP[MSCA_32018]/Final_Project/Archive/news_chicago_il.json', orient='records', lines=True)
df.head()

Unnamed: 0,crawled_date,language,text,title
0,1589155200000,english,\nGov. Jay “Fatso” Pritzker called on all Illi...,All In Illinois
1,1589155200000,english,"May 10, 2020 -The Illinois Department of Publi...",The Illinois Department of Public Health Annou...
2,1589155200000,english,"Gloria Lawrence said: May 10, 2020 at 1:31 AM\...","Foto Friday: Alton, Illinois"
3,1589155200000,english,NBA to follow German soccer league model with ...,Chris Broussard on Michael Jordan returning to...
4,1589155200000,english,"Search Minggu, 10 Mei 2020 Pork chops vs. peop...",Pork chops vs. people: Can Americans’ appetite...


 ### Summary Statistics for RAW

In [11]:
## Count of unique by language
print("By Language:",df.groupby(df['language']).count())

By Language:           crawled_date    text   title
language                              
english         373069  373069  373069


In [12]:
## Average character length per article title
df['len'] =  df['title'].map(lambda calc: len(calc))
article_len = sum(df['len']) / len(df['len'])
article_len

77.67314625444622

In [13]:
## Average character length per article text set
df['len2'] =  df['text'].map(lambda calc: len(calc))
article_len2 = sum(df['len2']) / len(df['len2'])
article_len2

2316.408085367586

In [14]:
## Drop added test columns
df.drop(['len','len2'],axis=1, inplace=True)

In [14]:
## Interesting that the average title is 77 characters long. It should lend to a 
## significant amount sentiment analysis // at minimum deeper overview of the topics

In [15]:
## Averages extrapolated
extrapolated = article_len + article_len2
extrapolated * len(df)

893157490.9999999

In [16]:
## Subset the df to just text and title
df = df[['text','title']]
df.shape

(373069, 2)

### Data Cleaning & Setup for Ingestion 

In [17]:
## Remove all null values and only return records where the new dataframe is not null or NaN
df = df[df.text.notnull()]
df = df[df.title.notnull()]
df.shape
## Clean already regarding nulls

(373069, 2)

In [18]:
## Data Prep for feed - concatenated
## Combine the title and text column for one unified record per article -- add space in between so that there is clear delineation
df['combo'] = df['title'] + " " +  df['text']
df['combo'].head(10)

0    All In Illinois \nGov. Jay “Fatso” Pritzker ca...
1    The Illinois Department of Public Health Annou...
2    Foto Friday: Alton, Illinois Gloria Lawrence s...
3    Chris Broussard on Michael Jordan returning to...
4    Pork chops vs. people: Can Americans’ appetite...
5    4 Dead, at Least 17 Injured in Shootings Acros...
6    CITY OF LASALLE: Restore Illinois City of Lasa...
7    Who Ya Got: 72-Win Chicago Bulls or 73-Win Gol...
8    You know I had to do it >:DI love Chicago so f...
9    Developer behind botched Chicago demolition fa...
Name: combo, dtype: object

In [19]:
## Create function that defines a langauge for the above
def lang_detect(x):
    try:
        return detect(x)
    except:
        return '--'

In [20]:
%time df['lang'] = df['combo'].apply(lambda x: lang_detect(x))

CPU times: user 29min 33s, sys: 6.97 s, total: 29min 40s
Wall time: 35min 17s


In [21]:
df = df[df['lang']=='en'].reset_index(drop=True)
df.shape

(370556, 4)

In [22]:
## Remove /n characters to avoid problems with analysis
df['combo'] = df['combo'].map(lambda x: re.sub(r'\n', '.  ', str(x)))
## Remove special characters to avoid problems with analysis -- utilizing the map function 
## Should it be a space?
df['combo'] = df['combo'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))
df['combo'].head(10)

0    All In Illinois .  Gov. Jay Fatso Pritzker cal...
1    The Illinois Department of Public Health Annou...
2    Foto Friday: Alton, Illinois Gloria Lawrence s...
3    Chris Broussard on Michael Jordan returning to...
4    Pork chops vs. people: Can Americans appetite ...
5    4 Dead, at Least 17 Injured in Shootings Acros...
6    CITY OF LASALLE: Restore Illinois City of Lasa...
7    Who Ya Got: 72Win Chicago Bulls or 73Win Golde...
8    You know I had to do it :DI love Chicago so fr...
9    Developer behind botched Chicago demolition fa...
Name: combo, dtype: object

In [23]:
## Save and set for reload of pandas df so that language detect doesnt need to re-run
#df.to_csv('Lang-check.txt', sep=',', header=None, mode='w')

In [24]:
## Load lang-check.csv as dataframe - no need to run before this line
#df = pd.read_csv('Lang-Check.csv')

In [26]:
## StopWord Removal
doc_clean = []
## Establish variables articulating stop words / exclusions for punctuation and a 
## lemmatizer -- then write a function called doc_clean with results of the 
## cleaning function

#stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    #stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

%time doc_clean = [clean(doc).split() for doc in df['combo']]
%time df['doc_clean'] = [clean(doc) for doc in df['combo']]
len(doc_clean)

NameError: name 'stop_free' is not defined

NameError: name 'stop_free' is not defined

0

In [None]:
## Reduce df to combo and clean set in order to reduce overall size 
df = df[['combo','doc_clean']]
df

In [27]:
##Save DF to reload for testing 
#df.to_csv('doc_clean.csv')

In [28]:
## Import doc_clean csv
#df = pd.read_csv('doc_clean.csv')
#df.drop(["Unnamed: 0"], axis=1)
#doc_clean = list(df['doc_clean'])

In [29]:
## Create a doc_clean to mimic earlier functions in order to serve as a load point
#combo = []

#for i in range(len(df['combo'])):
#    A = (df['combo'][i])
#    AA = A.split()
#    combo.append(AA)

In [24]:
## Create a doc_clean to mimic earlier functions in order to serve as a load point
doc_clean = []

for i in range(len(df['doc_clean'])):
    A = (df['doc_clean'][i])
    AA = A.split()
    doc_clean.append(AA)

SyntaxError: invalid syntax (<ipython-input-24-0ce748bb5ba7>, line 7)

In [None]:
#doc_clean

### Create sub-sample for testing

In [28]:
df = pd.read_csv('/Users/man/OneDrive - The University of Chicago/Prior_Quarters/2021_Winter/NLP[MSCA_32018]/Final_Project/Archive/doc_clean.csv') 
df.head()

Unnamed: 0.1,Unnamed: 0,combo,doc_clean
0,0,All In Illinois . Gov. Jay Fatso Pritzker cal...,illinois gov jay fatso pritzker called illinoi...
1,1,The Illinois Department of Public Health Annou...,illinois department public health announces 16...
2,2,"Foto Friday: Alton, Illinois Gloria Lawrence s...",foto friday alton illinois gloria lawrence sai...
3,3,Chris Broussard on Michael Jordan returning to...,chris broussard michael jordan returning chica...
4,4,Pork chops vs. people: Can Americans appetite ...,pork chop v people american appetite meat fill...


In [29]:
## Create a doc_clean to mimic earlier functions in order to serve as a load point
doc_clean = []

for i in range(len(df['doc_clean'])):
    A = (df['doc_clean'][i])
    AA = A.split()
    doc_clean.append(AA)

In [30]:
## Establish a 35% setup to run through LDA for topic generation
## Create variable called random sample to be callable
perc = int(len(doc_clean)*0.35)
rs1 = random.sample(doc_clean,perc)
len(rs1)

129704

In [32]:
df2 = pd.DataFrame(rs1)

In [34]:
df2.to_csv('doc_clean.txt', header=None, index=None, sep=' ', mode='a')

KeyboardInterrupt: 

In [35]:
df2min = df2.sample(frac=0.1, replace = True)

In [36]:
df2min.to_csv('doc_clean-min.txt', header=None, index=None, sep=' ', mode='a')

### Summa

In [102]:
## Write loop for summarizer to subset and run through / iterate and store in a dictionary or list
## Possible loop for entity isolation
article_sum = []

for i in range(len(doc_clean)):
            article2 = summa.summarizer.summarize(str(doc_clean))
            article_sum.append(article2)

#df_summa = pd.DataFrame({'Entities':entities,'Labels':labels,'Position':position})

KeyboardInterrupt: 

In [None]:
len(article_sum)

In [99]:
df['doc_clean'][250]

'first horizon advisor inc sell 1917 share illinois tool work inc nyseitw first horizon advisor inc trimmed holding illinois tool work inc nyseitw 67 1st quarter according company recent filing security exchange commission sec institutional investor owned 26827 share industrial product company stock selling 1917 share quarter first horizon advisor inc holding illinois tool work worth 3811000 recent filing security exchange commission sec several institutional investor also modified holding itw cfm wealth partner llc bought new position share illinois tool work 4th quarter worth 25000 stonebridge financial planning group llc bought new stake illinois tool work 4th quarter valued 27000 allred capital management llc bought new stake illinois tool work 4th quarter valued 49000 savior llc bought new stake illinois tool work 4th quarter valued 50000 finally golden state wealth management llc bought new stake illinois tool work 1st quarter valued 51000 institutional investor 8037 company stoc

In [None]:
article_sum[250]

### Topic Modeling | LDA (latent dirichlet allocation)

In [None]:
df = pd.read_csv()

In [None]:
## Create a doc_clean to mimic earlier functions in order to serve as a load point
doc_clean = []

for i in range(len(df['doc_clean'])):
    A = (df['doc_clean'][i])
    AA = A.split()
    doc_clean.append(AA)

In [33]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

In [36]:
## Creating specific list / train
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)
## Build high / low filter to remove incredibly high occuring words/ very low
# https://github.com/priya-dwivedi/Deep-Learning/blob/master/topic_modeling/LDA_Newsgroup.ipynb
## This will remove all words occuring less than 15 times or present in more than 10% of the articles
%time dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000, keep_tokens=None)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

CPU times: user 637 ms, sys: 44.5 ms, total: 682 ms
Wall time: 711 ms


In [37]:
## Set to multicore in order to take advantage of large set
N = 8
num_topics = N
chunksize = 10000
iterations = 100
passes = 20
workers = num_processors-1
eval_every = None

%time ldamodel = LdaMulticore(corpus=doc_term_matrix,\
                       id2word=dictionary,\
                       chunksize=chunksize,\
                       eta='auto',\
                       num_topics=num_topics,\
                       iterations=iterations,\
                       passes=passes,\
                       eval_every=eval_every,\
                       workers = workers)

CPU times: user 5min 20s, sys: 49.5 s, total: 6min 10s
Wall time: 7min 51s


In [39]:
## Load a potentially pretrained model from disk.
#ldamodel = LDA.load(G_LDA)

In [38]:
## Print an example of the topics per line
print(*ldamodel.print_topics(num_topics=N, num_words=10), sep='\n')
#B = list(ldamodel.print_topics(num_topics=N, num_words=10))

(0, '0.005*"loan" + 0.003*"customer" + 0.003*"product" + 0.003*"online" + 0.003*"experience" + 0.003*"market" + 0.003*"job" + 0.003*"offer" + 0.003*"system" + 0.003*"provide"')
(1, '0.009*"pritzker" + 0.008*"rate" + 0.007*"credit" + 0.006*"region" + 0.005*"virus" + 0.005*"debt" + 0.005*"hospital" + 0.005*"test" + 0.005*"male" + 0.005*"care"')
(2, '0.009*"bear" + 0.008*"0" + 0.006*"win" + 0.005*"real" + 0.005*"estate" + 0.005*"player" + 0.005*"coach" + 0.005*"sport" + 0.005*"play" + 0.005*"broker"')
(3, '0.004*"white" + 0.004*"im" + 0.004*"cub" + 0.004*"think" + 0.004*"sox" + 0.003*"photo" + 0.003*"hair" + 0.003*"got" + 0.003*"there" + 0.003*"run"')
(4, '0.009*"man" + 0.008*"shooting" + 0.006*"mayor" + 0.006*"lightfoot" + 0.005*"federal" + 0.004*"protest" + 0.004*"video" + 0.004*"violence" + 0.004*"street" + 0.004*"crime"')
(5, '0.027*"stock" + 0.023*"quarter" + 0.021*"rating" + 0.020*"tool" + 0.015*"price" + 0.013*"inc" + 0.011*"average" + 0.010*"research" + 0.010*"ratio" + 0.009*"llc"

In [39]:
## Test size of topics and potential overlap
## Graphical representation
lda_displayN = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_displayN)

In [42]:
## Save LDA model and be able to recall results
##https://radimrehurek.com/gensim/models/ldamodel.html
#G_LDA = datapath("Gensim-LDA-Model")
#ldamodel.save(G_LDA)

In [40]:
## Create new dataframe housing the top words
top_words_per_topic = []
for t in range(ldamodel.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in ldamodel.show_topic(t, topn = 25)])
top_df = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words.csv")

In [41]:
## subset
##Topic 0
T0 = top_df[top_df['Topic'] == 0]
T0 = list(T0['Word'])
##Topic 1
T1 = top_df[top_df['Topic'] == 1]
T1 = list(T1['Word'])
##Topic 2
T2 = top_df[top_df['Topic'] == 2]
T2 = list(T2['Word'])
##Topic 3
T3 = top_df[top_df['Topic'] == 3]
T3 = list(T3['Word'])
##Topic 4
T4 = top_df[top_df['Topic'] == 4]
T4 = list(T4['Word'])
##Topic 5
T5 = top_df[top_df['Topic'] == 5]
T5 = list(T5['Word'])
##Topic 6
T6 = top_df[top_df['Topic'] == 6]
T6 = list(T6['Word'])
##Topic 7
T7 = top_df[top_df['Topic'] == 7]
T7 = list(T7['Word'])

In [43]:
T7

['student',
 'district',
 'teacher',
 'park',
 'restaurant',
 'food',
 'union',
 'university',
 'class',
 'event',
 'open',
 'learning',
 'building',
 'lake',
 'st',
 'street',
 'art',
 'river',
 '2021',
 'north',
 'il',
 'inperson',
 'education',
 'hotel',
 'staff']

In [47]:
## VADER Sentiment analysis on NER- loop to assign polarity score / build lists / append lists and etner
## A
spot = []
neg = []
pos = []
neu = []
compound = []

for i in range(len(MEGA)):
    for ii in range(0,1):
        spot.append([i])
        article2 = sid.polarity_scores(MEGA[i])
        article_list = list(article2.values()) 
        #dc.append(article2)
        compound.append(article_list[3])
        neg.append(article_list[0])
        neu.append(article_list[1])
        pos.append(article_list[2])
MEGA_vader = pd.DataFrame({'Index':spot,'Ind. Compound':compound,'Ind. Negative':neg, 'Ind. Neutral':neu, 'Ind. Positive':pos})

In [48]:
MEGA_vader

Unnamed: 0,Index,Ind. Compound,Ind. Negative,Ind. Neutral,Ind. Positive
0,[0],0.0,0.0,1.0,0.0
1,[1],0.0,0.0,1.0,0.0
2,[2],0.0,0.0,1.0,0.0
3,[3],0.0,0.0,1.0,0.0
4,[4],0.0,0.0,1.0,0.0
5,[5],0.0,0.0,1.0,0.0
6,[6],0.0,0.0,1.0,0.0
7,[7],0.0,0.0,1.0,0.0


In [45]:
MEGA = [T0,T1,T2,T3,T4,T5,T6,T7]

In [46]:
T1

['black',
 'trump',
 'shooting',
 'federal',
 'mayor',
 'man',
 'election',
 'court',
 'lightfoot',
 'video',
 'protest',
 'law',
 'vote',
 'violence',
 'woman',
 'told',
 'attorney',
 'crime',
 'gun',
 'associated',
 'district',
 'charge',
 'child',
 'judge',
 'white']

### TF-IDF | textblob

In [47]:
## Utilizing TF-IDF will penalize the often used words / remove stop words in order to simplify
## could this be used in conjunciton with LDA? pre-feed
tfidfvectorizer = TfidfVectorizer(stop_words='english')

In [48]:
## Degining functions that will assist in the text blob analysis
# http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)
# tf(word, blob) computes "term frequency" which is the number of times a word appears in a document blob, 
# normalized by dividing by the total number of words in blob. We use TextBlob for breaking up the text into words 
# and getting the word counts.
def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)
# n_containing(word, bloblist) returns the number of documents containing word. 
# A generator expression is passed to the sum() function.
def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))
# idf(word, bloblist) computes "inverse document frequency" which measures how common a word is 
# among all documents in bloblist. The more common a word is, the lower its idf. 
# We take the ratio of the total number of documents to the number of documents containing word, 
# then take the log of that. Add 1 to the divisor to prevent division by zero
def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)
# tfidf(word, blob, bloblist) computes the TF-IDF score. It is simply the product of tf and idf.

In [49]:
## Create mini list and empty dictionary
bloblist = []
position1 = []
#del bloblist[:]

for i  in range(0,len(df)):
    bloblist.append(TextBlob(df['combo'].iloc[i]))
    #position1.append([i])
    
len(bloblist)
df_blob = pd.DataFrame({'Blob':bloblist}) #,'Position':position1})

In [50]:
df_blob

Unnamed: 0,Blob
0,"(A, l, l, , I, n, , I, l, l, i, n, o, i, s, ..."
1,"(T, h, e, , I, l, l, i, n, o, i, s, , D, e, ..."
2,"(F, o, t, o, , F, r, i, d, a, y, :, , A, l, ..."
3,"(C, h, r, i, s, , B, r, o, u, s, s, a, r, d, ..."
4,"(P, o, r, k, , c, h, o, p, s, , v, s, ., , ..."
...,...
370581,"(D, o, , I, l, l, i, n, o, i, s, , l, a, w, ..."
370582,"(Q, u, i, c, k, , h, i, t, s, :, , I, l, l, ..."
370583,"(B, a, i, l, , f, o, r, , I, l, l, i, n, o, ..."
370584,"(6, , D, a, y, , T, r, i, p, , t, o, , D, ..."


In [51]:
#for i, blob in enumerate(bloblist):
# Print top 5 values
#    if i == 5:
#        break
#    print("Top words in tweet {}".format(i + 1))
#    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
#    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
#    for word, score in sorted_words[:5]:
#        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

### Name Entity Recognition with SpaCy

In [159]:
## Load SpaCy model
#nlp = spacy.load("en_core_web_sm") ## 
#nlp = spacy.load("en_core_web_md") ## 47.1MB
nlp = spacy.load("en_core_web_lg") ## 
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7eefe8800>

In [53]:
## Create a table utilizing spacy to identify the entities and labels
doc = nlp(str(df['doc_clean']))
entities = []
labels = []
position = []

for outerloop in range(len(df['doc_clean'])):
    for ent in doc.ents:
        entities.append(ent.text)
        labels.append(ent.label_)
        position.append([outerloop])
        
df_spacy = pd.DataFrame({'Entities':entities,'Labels':labels,'Position':position})

In [54]:
#df_spacy.shape

In [55]:
## Group by for top names -- top 15
#spacy_l = df_spacy['Entities'].groupby(df_spacy['Entities']).agg(pd.Series.count)
#spacy_l

In [56]:
## Export spacy DF
#df_spacy.to_csv('Spacy.csv')

### Keyword Extraction via Rake

In [57]:
## Defines a variable and a function for use with rake
r = Rake() # Uses stopwords for english from NLTK, and all puntuation characters.
##
def rake_implement(x,r):
    r.extract_keywords_from_text(x) # r.extract_keywords_from_text(<text to process>)
    return r.get_ranked_phrases() # r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest.

In [58]:
## Create a new column in df2 based on the keywords removed
df['rake_phrases'] = df['doc_clean'].apply(lambda x: rake_implement(x,r))
df.head()

Unnamed: 0,combo,doc_clean,rake_phrases
0,All In Illinois . Gov. Jay Fatso Pritzker cal...,illinois gov jay fatso pritzker called illinoi...,[kayleen carlson executive director illinois r...
1,The Illinois Department of Public Health Annou...,illinois department public health announces 16...,[illinois department public health announces 1...
2,"Foto Friday: Alton, Illinois Gloria Lawrence s...",foto friday alton illinois gloria lawrence sai...,[army corp engineer raise lower water level us...
3,Chris Broussard on Michael Jordan returning to...,chris broussard michael jordan returning chica...,[chris broussard michael jordan returning chic...
4,Pork chops vs. people: Can Americans appetite ...,pork chop v people american appetite meat fill...,[pork chop v people american appetite meat fil...


In [59]:
## Import Raked data
#df = pd.read_csv('Rake+.csv')
#df.drop(['Unnamed: 0'],axis=1)

In [60]:
## Create a doc_clean to mimic earlier functions in order to serve as a load point
#combo = []

#for i in range(len(df['combo'])):
#    A = (df['combo'][i])
#    AA = A.split()
#    combo.append(AA)

In [61]:
## Create a doc_clean to mimic earlier functions in order to serve as a load point
#doc_clean = []

#for i in range(len(df['doc_clean'])):
#    A = (df['doc_clean'][i])
#    AA = A.split()
#    combo.append(AA)

### Sentiment Analysis

### VADER (Valence Aware Dictionary and sEntiment Reasoner)

In [62]:
## Testing of vader - subset with clean data but small sample
## Combined but unclean dataset
V0 = df['combo'][458]

print(sid.polarity_scores(V0))
%time a0 = sid.polarity_scores(V0)

## {'neg': 0.0, 'neu': 0.911, 'pos': 0.089, 'compound': 0.9868}

{'neg': 0.0, 'neu': 0.911, 'pos': 0.089, 'compound': 0.9868}
CPU times: user 3.3 ms, sys: 0 ns, total: 3.3 ms
Wall time: 3.3 ms


In [63]:
## Testing of vader - subset with clean data but small sample
## Cleaned dataset 
V0 = df['doc_clean'][458]

print(sid.polarity_scores(V0))
%time a0 = sid.polarity_scores(V0)

## {'neg': 0.0, 'neu': 0.88, 'pos': 0.12, 'compound': 0.9819}

{'neg': 0.0, 'neu': 0.88, 'pos': 0.12, 'compound': 0.9819}
CPU times: user 2.18 ms, sys: 0 ns, total: 2.18 ms
Wall time: 2.18 ms


In [64]:
## Testing of vader - subset with clean data but small sample
## Raked dataset
V0 = df['rake_phrases'][458]

print(sid.polarity_scores(V0))
%time a0 = sid.polarity_scores(V0)

## {'neg': 0.0, 'neu': 0.879, 'pos': 0.121, 'compound': 0.9819}

{'neg': 0.0, 'neu': 0.879, 'pos': 0.121, 'compound': 0.9819}
CPU times: user 2 ms, sys: 7 µs, total: 2 ms
Wall time: 2.02 ms


In [65]:
## Actual cleaned words that VADER is going to try and resolve
df['doc_clean'][458]

'lecturer physic department physic nontenure track loyola university chicago department physic loyola university chicago luc college art science department physic invite application lecturer position department physic beginning august 10 2020 20202021 academic year department physic extensive undergraduate teaching duty college art science well program physic major minor information department please visit httpsifttt2z0rgnh teaching responsibility shall include 44 teaching load includes introductory physic course lab nonphysics major well course physic major addition candidate must willing support mission luc goal jesuit catholic education experience wide variety different educational research work activity including industry plus loyola universitychicago job httpsifttt35wyng3 via ifttt source loyola university chicago loyola university chicago often referred loyola luc private catholic research university chicago illinois founded 1870 jesuit today loyola one largest catholic universit

In [66]:
## Note that there was no discernable difference in perforamnce between cleaned / raked / unchanged models driven by the lexicon
## and stopword removals

In [67]:
## VADER Sentiment analysis - loop to assign polarity score / build lists / append lists and etner
## A
spot = []
neg = []
pos = []
neu = []
compound = []

for i in range(len(df['doc_clean'])):
    for ii in range(0,1):
        spot.append([i])
        article2 = sid.polarity_scores(df['doc_clean'][i])
        article_list = list(article2.values()) 
        #dc.append(article2)
        compound.append(article_list[3])
        neg.append(article_list[0])
        neu.append(article_list[1])
        pos.append(article_list[2])
df_vader = pd.DataFrame({'Index':spot,'Compound':compound,'Negative':neg, 'Neutral':neu, 'Positive':pos})

In [68]:
## Validate - dataset length
df_vader.shape

(370586, 5)

In [69]:
## Save Vader set in order to reduce churn/re-running
#df_vader.to_csv('Vader.csv')

In [70]:
## Load VADER
#df_vader = pd.read_csv('Vader.csv')

### Dataset Consolidation for Analysis

In [71]:
## Combine Rake DF with df_vader 
df = df.join(df_vader, lsuffix='Unnamed: 0',rsuffix='Index')

In [74]:
#df.drop(['Unnamed: 0Unnamed: 0', 'Unnamed: 0.1','Unnamed: 0Index','Index'],axis=1, inplace=True)

In [73]:
df

Unnamed: 0,combo,doc_clean,rake_phrases,Index,Compound,Negative,Neutral,Positive
0,All In Illinois . Gov. Jay Fatso Pritzker cal...,illinois gov jay fatso pritzker called illinoi...,[kayleen carlson executive director illinois r...,[0],0.9893,0.069,0.796,0.135
1,The Illinois Department of Public Health Annou...,illinois department public health announces 16...,[illinois department public health announces 1...,[1],-0.9136,0.083,0.917,0.000
2,"Foto Friday: Alton, Illinois Gloria Lawrence s...",foto friday alton illinois gloria lawrence sai...,[army corp engineer raise lower water level us...,[2],0.9413,0.030,0.728,0.243
3,Chris Broussard on Michael Jordan returning to...,chris broussard michael jordan returning chica...,[chris broussard michael jordan returning chic...,[3],0.9948,0.015,0.649,0.336
4,Pork chops vs. people: Can Americans appetite ...,pork chop v people american appetite meat fill...,[pork chop v people american appetite meat fil...,[4],-0.4479,0.091,0.858,0.052
...,...,...,...,...,...,...,...,...
370581,Do Illinois laws encourage corruptionprone uti...,illinois law encourage corruptionprone utility...,[illinois law encourage corruptionprone utilit...,[370581],0.9870,0.046,0.783,0.171
370582,"Quick hits: Illinois news in brief for Monday,...",quick hit illinois news brief monday nov 2 202...,[quick hit illinois news brief monday nov 2 20...,[370582],0.9308,0.036,0.871,0.093
370583,Bail for Illinois teen charged in Kenosha kill...,bail illinois teen charged kenosha killing set...,[bail illinois teen charged kenosha killing se...,[370583],-0.9538,0.286,0.661,0.053
370584,"6 Day Trip to Detroit, Chicago On 4th Nov 202...",6 day trip detroit chicago 4th nov 2020 tripho...,[6 day trip detroit chicago 4th nov 2020 triph...,[370584],0.6369,0.000,0.819,0.181


In [75]:
## Adding column to act as a flag for the compound for easier sorting of pos/ neut/ neg
df['Vader'] = 0
df['Vader-i'] = 0

for i in range(len(df['Compound'])):
    #df['Vader-i'][i] = [i]
    if df['Compound'][i] >= .25:
        b = 1
    elif 0.25<=df['Compound'][i]>=-0.25:
        b= 0
    elif df['Compound'][i] <= (-0.25):
        b= -1
    else:
        b=5
    df['Vader'][i] = b

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Vader'][i] = b


In [76]:
df

Unnamed: 0,combo,doc_clean,rake_phrases,Index,Compound,Negative,Neutral,Positive,Vader,Vader-i
0,All In Illinois . Gov. Jay Fatso Pritzker cal...,illinois gov jay fatso pritzker called illinoi...,[kayleen carlson executive director illinois r...,[0],0.9893,0.069,0.796,0.135,1,0
1,The Illinois Department of Public Health Annou...,illinois department public health announces 16...,[illinois department public health announces 1...,[1],-0.9136,0.083,0.917,0.000,-1,0
2,"Foto Friday: Alton, Illinois Gloria Lawrence s...",foto friday alton illinois gloria lawrence sai...,[army corp engineer raise lower water level us...,[2],0.9413,0.030,0.728,0.243,1,0
3,Chris Broussard on Michael Jordan returning to...,chris broussard michael jordan returning chica...,[chris broussard michael jordan returning chic...,[3],0.9948,0.015,0.649,0.336,1,0
4,Pork chops vs. people: Can Americans appetite ...,pork chop v people american appetite meat fill...,[pork chop v people american appetite meat fil...,[4],-0.4479,0.091,0.858,0.052,-1,0
...,...,...,...,...,...,...,...,...,...,...
370581,Do Illinois laws encourage corruptionprone uti...,illinois law encourage corruptionprone utility...,[illinois law encourage corruptionprone utilit...,[370581],0.9870,0.046,0.783,0.171,1,0
370582,"Quick hits: Illinois news in brief for Monday,...",quick hit illinois news brief monday nov 2 202...,[quick hit illinois news brief monday nov 2 20...,[370582],0.9308,0.036,0.871,0.093,1,0
370583,Bail for Illinois teen charged in Kenosha kill...,bail illinois teen charged kenosha killing set...,[bail illinois teen charged kenosha killing se...,[370583],-0.9538,0.286,0.661,0.053,-1,0
370584,"6 Day Trip to Detroit, Chicago On 4th Nov 202...",6 day trip detroit chicago 4th nov 2020 tripho...,[6 day trip detroit chicago 4th nov 2020 triph...,[370584],0.6369,0.000,0.819,0.181,1,0


In [77]:
## Send final to DF CSV
#df.to_csv('DF+Vader.csv')

In [None]:
## Call forward Spacy_df - for reference
#Spacy_df = pd.read_csv('Spacy')

In [None]:
### 2:18:12 Runtime for this entire set

In [None]:
#####
####
###
##
#

#Start of POS / NEG NER SET based on lists this time...

In [9]:
df = pd.read_csv('DF+Vader.csv')

In [10]:
## Create positive and negative sentiment analysis datasets for NER 
pos_sent = df['doc_clean'][df['Vader'] == 1]
neg_sent = df['doc_clean'][df['Vader'] == -1]

In [13]:
## Load SpaCy model
#nlp = spacy.load("en_core_web_sm") ## 
#nlp = spacy.load("en_core_web_md") ## 47.1MB
nlp = spacy.load("en_core_web_lg") ## 
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x173a47c00>

In [14]:
pos_sent2 = list(pos_sent)
neg_sent2 = list(neg_sent)

In [15]:
## Reset threshold
nlp.max_length = 10000000

In [18]:
doc = nlp(str(pos_sent2[0:4500]))
entities = []
labels = []
#position_start = []
#position_end = []

for ent in doc.ents:
    entities.append(ent.text)
    labels.append(ent.label_)
    #position_start.append(ent.start_char)
    #position_end.append(ent.end_char)
    
%time df2 = pd.DataFrame({'Entities':entities,'Labels':labels}) #,'Position_Start':position_start, 'Position_End':position_end})

CPU times: user 7.16 ms, sys: 1.33 ms, total: 8.5 ms
Wall time: 11.5 ms


In [21]:
df2.groupby('Labels').count()

Unnamed: 0_level_0,Entities
Labels,Unnamed: 1_level_1
CARDINAL,45142
DATE,31843
EVENT,9
FAC,86
GPE,6242
LANGUAGE,5
LAW,58
LOC,102
MONEY,938
NORP,2229


In [22]:
df2.to_csv('Pos_NER.csv')

In [23]:
Entities = df2[df2.Labels == "ORG"].groupby('Entities')['Labels'].count()
organizations_p = Entities.sort_values(ascending=False)
organizations_p.head(20)

Entities
nbc                 259
shaw                 67
gop                  58
lincoln              54
mvp                  40
abc                  33
41f                  32
nba                  26
k9                   25
cnn                  23
104                  16
ecf                  14
mlb                  10
k12                  10
cbs                  10
york mellon corp     10
oakland               9
misericordia          8
909pm                 8
1248                  8
Name: Labels, dtype: int64

In [29]:
doc = nlp(str(neg_sent2[0:4500]))
entities = []
labels = []
#position_start = []
#position_end = []

for ent in doc.ents:
    entities.append(ent.text)
    labels.append(ent.label_)
    #position_start.append(ent.start_char)
    #position_end.append(ent.end_char)
    
%time df3 = pd.DataFrame({'Entities':entities,'Labels':labels}) #,'Position_Start':position_start, 'Position_End':position_end})

CPU times: user 6.13 ms, sys: 1.79 ms, total: 7.92 ms
Wall time: 12.3 ms


In [30]:
df3.groupby('Labels').count()

Unnamed: 0_level_0,Entities
Labels,Unnamed: 1_level_1
CARDINAL,34292
DATE,23885
EVENT,21
FAC,132
GPE,5814
LANGUAGE,3
LAW,17
LOC,49
MONEY,433
NORP,1786


In [31]:
Entities4 = df3[df3.Labels == "ORG"].groupby('Entities')['Labels'].count()
organizations_p4 = Entities4.sort_values(ascending=False)
organizations_p4.head(20)

Entities
nbc                        293
cnn                        126
ecf                        114
fbi                        111
gop                         86
abc                         79
lincoln                     79
faa                         56
52900 le 2018               38
mcsweeney                   31
cdc                         26
pa28                        22
charles missouri daniel     21
sr22                        21
chevrolet                   20
cpd                         20
shaw                        18
h3                          14
msnbc                       12
cbs                         11
Name: Labels, dtype: int64

In [32]:
df3.to_csv('Neg_NERSENT.csv')

In [None]:
## Export data for use in Analysis workbook
df2.to_csv('Pos_sent.csv')
df3.to_csv('Neg_sent.csv')
#rake_ner.to_csv('RAKE_NER.csv')

In [109]:
pos_ner.groupby('Entities')['Entities'].count()

Entities
201807             201807
370580             201807
370582             201807
370584             201807
370585             201807
42 percent         201807
4th                201807
6 day              201807
72win              201807
73win              201807
chicago            201807
detroit            201807
friday             201807
illinois gloria    201807
monday nov 2       201807
nov 2020           201807
Name: Entities, dtype: int64

In [108]:
pos_ner.columns

Index(['Entities', 'Labels', 'Position'], dtype='object')

In [80]:
## VADER Sentiment analysis on NER- loop to assign polarity score / build lists / append lists and etner
## A
spot = []
neg = []
pos = []
neu = []
compound = []

for i in range(len(df_spacy['Entities'])):
    for ii in range(0,1):
        spot.append([i])
        article2 = sid.polarity_scores(df_spacy['Entities'][i])
        article_list = list(article2.values()) 
        #dc.append(article2)
        compound.append(article_list[3])
        neg.append(article_list[0])
        neu.append(article_list[1])
        pos.append(article_list[2])
df_space_vader = pd.DataFrame({'Index':spot,'Ind. Compound':compound,'Ind. Negative':neg, 'Ind. Neutral':neu, 'Ind. Positive':pos})

In [82]:
df_space_vader

Unnamed: 0,Index,Compound,Negative,Neutral,Positive
0,[0],0.0,0.0,1.0,0.0
1,[1],0.0,0.0,1.0,0.0
2,[2],0.0,0.0,1.0,0.0
3,[3],0.0,0.0,1.0,0.0
4,[4],0.0,0.0,1.0,0.0
...,...,...,...,...,...
5558785,[5558785],0.0,0.0,1.0,0.0
5558786,[5558786],0.0,0.0,1.0,0.0
5558787,[5558787],0.0,0.0,1.0,0.0
5558788,[5558788],0.0,0.0,1.0,0.0


In [93]:
## Combine 
df_space_vader2 = df_spacy.join(df_space_vader, lsuffix='Index',rsuffix='Index')

In [94]:
df_space_vader

Unnamed: 0,Entities,Labels,Position,Index,Compound,Negative,Neutral,Positive
0,16,CARDINAL,[0],[0],0.0,0.0,1.0,0.0
1,friday,DATE,[0],[1],0.0,0.0,1.0,0.0
2,illinois gloria,PERSON,[0],[2],0.0,0.0,1.0,0.0
3,370581,CARDINAL,[0],[3],0.0,0.0,1.0,0.0
4,370582,CARDINAL,[0],[4],0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
5558785,chicago,GPE,[370585],[5558785],0.0,0.0,1.0,0.0
5558786,4th,ORDINAL,[370585],[5558786],0.0,0.0,1.0,0.0
5558787,nov 2020,DATE,[370585],[5558787],0.0,0.0,1.0,0.0
5558788,370585,CARDINAL,[370585],[5558788],0.0,0.0,1.0,0.0


In [96]:
## View group by
df_space_vader.groupby('Compound')['Compound'].value_counts()

Compound  Compound
0.0       0.0         5558790
Name: Compound, dtype: int64

In [103]:
## Export Space Vader
df_space_vader.to_csv('Space_Vader.csv')