## Sample Workflow

![Workflow](../docs/btap_0401.png)

In [37]:
# Set Project Root
PROJ_ROOT = os.path.join(os.pardir)
print(os.path.abspath(PROJ_ROOT))

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)


# %run "$BASE_DIR/settings.py"


%config InlineBackend.figure_format = 'png'

# import my method from the source code
# from features.build_features import remove_invalid_data
# from features.build_features import awesome_function

#TODO put tqdm progress bars in here


import os
import sys

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly as plty

import missingno as msno

from wordcloud import WordCloud
from pandas_profiling import ProfileReport

import html
import re

# Textacy
import textacy.preprocessing as tprep # Preprocesing of accents/normalization
from textacy.preprocessing.resources import RE_URL

import nltk
from nltk.tokenize import RegexpTokenizer

from spacy.lang.en.stop_words import STOP_WORDS as stopwords
import spacy

# Set to Reload all custom packages
%load_ext autoreload
%autoreload 2

from sklearn.feature_extraction.text import CountVectorizer

# Custom developed tools
import nlp_tools
import constants

nltk.download('stopwords')
nltk.download('wordnet')

%matplotlib inline

!python -m spacy download en_core_web_sm

from collections import Counter


g:\My Drive\Code\springboard_proj_nlp_nytimes
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


## Load up data

In [2]:
# Read the sampled pickles (paths are relative to this notebook's directory)
df_test = pd.read_pickle('..//data//processed//df_test_sample.pkl')
df_train = pd.read_pickle('..//data//processed//df_train_sample.pkl')
df_comments = pd.read_pickle('..//data//processed//df_comments_sample.pkl')
df_articles = pd.read_pickle('..//data//processed//df_articles_sample.pkl')

# # Load Real Data
# # df_test = pd.read_pickle('..//data//raw//df_test.pkl')
# # df_train = pd.read_pickle('..//data//raw//df_train.pkl')
# df_comments = pd.read_pickle('..//data//raw//df_comments.pkl')
# df_articles = pd.read_pickle('..//data//raw//df_articles.pkl')

# Discard columns noted as constant (userTitle is noted as ~99% empty)
df_comments = df_comments.drop(
    columns=['status', 'trusted', 'recommendedFlag', 'isAnonymous', 'userTitle']
)

# Parse the timestamp columns into pandas datetimes
for date_col in ('createDate', 'updateDate', 'approveDate'):
    df_comments[date_col] = pd.to_datetime(df_comments[date_col])
df_articles['pub_date'] = pd.to_datetime(df_articles['pub_date'])

**TODO:** convert the remaining date columns to datetime.

In [3]:
# Sort by n_comments so the most-commented article comes first
df_articles = df_articles.sort_values(by='n_comments', ascending=False)
df_articles.head(1)

Unnamed: 0,newsdesk,section,subsection,material,headline,abstract,keywords,word_count,pub_date,n_comments,uniqueID
2209,Washington,U.S.,Politics,News,"Trump Grants Clemency to Blagojevich, Milken a...",The president also pardoned or commuted the se...,"['Trump, Donald J', 'Amnesties, Commutations a...",1765,2020-02-18 17:51:52+00:00,3595,nyt://article/4a3415ef-4390-577c-9cde-438b8b10...


In [11]:
list(df_articles.head(1)['keywords'])

["['Trump, Donald J', 'Amnesties, Commutations and Pardons', 'Kerik, Bernard B', 'Debartolo, Edward J Jr', 'Blagojevich, Rod R', 'Milken, Michael R', 'United States Politics and Government']"]

In [144]:
df_articles['abstract'].loc[2209]

'The president also pardoned or commuted the sentences of eight others on Tuesday, including Edward DeBartolo, a former owner of the San Francisco 49ers.'

In [145]:
df_articles['keywords'].loc[2209]

"['Trump, Donald J', 'Amnesties, Commutations and Pardons', 'Kerik, Bernard B', 'Debartolo, Edward J Jr', 'Blagojevich, Rod R', 'Milken, Michael R', 'United States Politics and Government']"

### Join Articles to Comments (potentially useful)

In [146]:
df_articles.iloc[0]

newsdesk                                             Washington
section                                                    U.S.
subsection                                             Politics
material                                                   News
headline      Trump Grants Clemency to Blagojevich, Milken a...
abstract      The president also pardoned or commuted the se...
keywords      ['Trump, Donald J', 'Amnesties, Commutations a...
word_count                                                 1765
pub_date                              2020-02-18 17:51:52+00:00
n_comments                                                 3595
uniqueID      nyt://article/4a3415ef-4390-577c-9cde-438b8b10...
Name: 2209, dtype: object

In [147]:
# Grab the uniqueID of the first row of df_articles.
# NOTE(review): the name `id` shadows Python's builtin `id()` for the rest of
# the kernel session — consider renaming if nothing downstream depends on it.
id = df_articles.iloc[0]['uniqueID']
id

'nyt://article/4a3415ef-4390-577c-9cde-438b8b10bfd7'

In [148]:
# def getComments(uniqueid):
#     '''
#     Provide Unique ID of an article and receive dataframe of the comments for that article
#     '''
#     return df_comments[df_comments['articleID'] == uniqueid].sort_values(by='createDate')


In [12]:
df_articles.head(1)

Unnamed: 0,newsdesk,section,subsection,material,headline,abstract,keywords,word_count,pub_date,n_comments,uniqueID
2209,Washington,U.S.,Politics,News,"Trump Grants Clemency to Blagojevich, Milken a...",The president also pardoned or commuted the se...,"['Trump, Donald J', 'Amnesties, Commutations a...",1765,2020-02-18 17:51:52+00:00,3595,nyt://article/4a3415ef-4390-577c-9cde-438b8b10...


# Bag of Words from article keywords

In [45]:
# d = df_articles['keywords'].explode()
# Parse the raw keyword strings with the project helper
# (presumably yields one iterable of keywords per article — confirm in nlp_tools).
keyword_list = nlp_tools.getListfromKeyWordListStr(df_articles['keywords'])

# Flatten the per-article collections into one flat list of keywords
word_list = [word for article_keywords in keyword_list for word in article_keywords]



In [43]:
# Learn the vocabulary from the flat keyword list, then encode that same list
cv = CountVectorizer()
dt = cv.fit(word_list).transform(word_list)

# Densify the document-term matrix into a DataFrame with one column per token
token_columns = cv.get_feature_names_out()
feature_keywords = pd.DataFrame(dt.toarray(), columns=token_columns)

feature_keywords


Unnamed: 0,11,12,1580,1627,18,1800,1861,1863,1888,1914,...,zhejiang,zion,zoning,zoo,zoom,zoominfo,zovio,zscaler,zte,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF_IDF Skip Vectorizer

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

# scikit-learn documents `stop_words` as 'english', a list, or None. spaCy's
# STOP_WORDS is a set, which newer scikit-learn releases reject during
# parameter validation, so pass it as a (sorted, deterministic) list.
tfidf = TfidfVectorizer(stop_words=sorted(stopwords), min_df=2)

# Fit on the raw keyword strings; min_df=2 drops tokens seen in fewer than two rows
dt = tfidf.fit_transform(df_articles['keywords'])
df_feat_keywords = pd.DataFrame(dt.toarray(), columns=tfidf.get_feature_names_out())
df_feat_keywords




Unnamed: 0,11,12,1930,1935,1936,1938,1939,1954,1955,1957,...,wuhan,xi,yang,york,youth,youtube,zhang,zoning,zoom,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.162434,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.214750,0.0,0.0,0.0,0.0,0.0,0.0
835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


### df Comment Processing Section

In [150]:
# Compute an impurity score for every raw comment body via nlp_tools.impurity.
# No sort is applied here: assigning a Series to a DataFrame column aligns on
# the index, so sorting the Series beforehand would not change the stored column.
# To inspect the most impure comments, sort when viewing, e.g.
#   df_comments.sort_values('commentBody_Impurity', ascending=False).head()
df_comments['commentBody_Impurity'] = df_comments['commentBody'].apply(nlp_tools.impurity)


In [151]:
# Run nlp_tools.clean over each raw comment body and store the result
# (per the original note this removes non-printable content such as tags —
# behaviour of the helper is not visible from this notebook).
cleaned_bodies = df_comments['commentBody'].map(nlp_tools.clean)
df_comments['commentBody_CLEAN_1'] = cleaned_bodies
df_comments['commentBody_CLEAN_1'].head(1)

2893132    @Jo Williams Exactly I see a nightmarish Gotha...
Name: commentBody_CLEAN_1, dtype: object

In [152]:
# Second pass: feed the cleaned text through nlp_tools.normalize
normalized_bodies = df_comments['commentBody_CLEAN_1'].map(nlp_tools.normalize)
df_comments['commentBody_NORMALIZE_2'] = normalized_bodies
df_comments['commentBody_NORMALIZE_2'].head(1)

2893132    @Jo Williams Exactly I see a nightmarish Gotha...
Name: commentBody_NORMALIZE_2, dtype: object

In [153]:
df_comments['commentBody_LOSSCONTRACTIONS_3']

KeyError: 'commentBody_LOSSCONTRACTIONS_3'

# Spacy
https://spacy.io/usage/processing-pipelines

In [None]:
# Create 

In [None]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
df_comments['commentBody'].iloc[800]

In [None]:

doc = nlp(df_comments['commentBody'].iloc[800])

In [None]:
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
#print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

In [None]:
# Find named entities, phrases and concepts
#for entity in doc.ents:
#    print(entity.text, entity.label_)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
df_ = tfidf.fit_transform(df_comments["commentBody"])

In [None]:
df_.data

## Remove Stop words

In [None]:
df_comments['commentBody_clean2']

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
print(len(stopwords))

# scikit-learn documents `stop_words` as 'english', a list, or None; spaCy's
# STOP_WORDS is a set, so convert it to a (sorted, deterministic) list.
tfidf = TfidfVectorizer(stop_words=sorted(stopwords))

# NOTE(review): no visible cell creates 'commentBody_clean2' (only
# 'commentBody_CLEAN_1' and 'commentBody_NORMALIZE_2' are added above) —
# confirm the column exists in the loaded pickle or point this at the right one.
dt = tfidf.fit_transform(df_comments["commentBody_clean2"])

In [None]:
# TF-IDF over the raw comment bodies, dropping tokens seen in fewer than two comments.
# `stop_words` is documented as 'english', a list, or None, so the spaCy
# stop-word set is passed as a (sorted, deterministic) list.
tfidf = TfidfVectorizer(stop_words=sorted(stopwords), min_df=2)
dt = tfidf.fit_transform(df_comments['commentBody'])
dt

### Linguistic Analysis

In [None]:
# import spacy

# nlp = spacy.load('en_core_web_sm')
# nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]
# for i, row in df_comments.iterrows():
#     doc = nlp(str(row['commentBody']))
#     df_comments.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
#     df_comments.at[i, "nav"] = " ".join([token.lemma_ for token in doc
#                      if token.pos_ in nouns_adjectives_verbs])

# Features for Articles

### Build list of all keywords - Bag of Words

In [154]:
import ast

# Counter of all keywords
# Each 'keywords' value is the string repr of a Python list (e.g. "['Trump, Donald J', ...]").
# ast.literal_eval parses such literals without executing arbitrary code, unlike eval().
all_keywords = []
for article_keywords in df_articles['keywords'].apply(ast.literal_eval):
    # A named loop variable is used instead of `_`, which IPython reserves for the last output
    all_keywords.extend(article_keywords)
cnt = Counter(all_keywords)
cnt.most_common(10)

# TODO Barchart here

[('Coronavirus (2019-nCoV)', 256),
 ('Trump, Donald J', 141),
 ('United States Politics and Government', 130),
 ('Presidential Election of 2020', 107),
 ('Biden, Joseph R Jr', 59),
 ('New York City', 54),
 ('Democratic Party', 53),
 ('Quarantines', 51),
 ('United States', 50),
 ('Real Estate and Housing (Residential)', 45)]

In [165]:
nlp_tools.convertKeywordsToBag(df_articles['keywords'])


2209    [Trump, Donald J, Amnesties, Commutations and ...
7152    [Meat, Meatpacking Plants and Slaughterhouses,...
4559    [Kushner, Jared, Trump, Donald J, Coronavirus ...
3274    [Sanders, Bernard, Democratic Party, Primaries...
2421    [Nevada, Primaries and Caucuses, Presidential ...
                              ...                        
9631    [Hockey, Ice, Coronavirus Reopenings, National...
428     [Sexual Harassment, Sex Crimes, Evans, Lucia, ...
2141    [Theater, Jack (Brooklyn, NY, Performance Spac...
4695                                            [Ecuador]
2822    [Crossword Puzzles, Lucido, Aimee (Crossword C...
Name: keywords, Length: 839, dtype: object


NameError: name 'bag' is not defined

In [132]:
# length of all words
len(all_keywords)

5945

In [133]:
# Set of all those keywords
all_keywords_set = set(all_keywords)
len(all_keywords_set)

2377

In [None]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of each word in one document.

    Parameters
    ----------
    wordDict : dict
        Maps each word to its raw count in the document.
    bagOfWords : sized collection
        The document's tokens; only its length is used, as the denominator.

    Returns
    -------
    dict
        Maps each word in ``wordDict`` to ``count / len(bagOfWords)``.
        If ``bagOfWords`` is empty every word gets ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document has no term occurrences; avoid dividing by zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bagOfWordsCount) for word, count in wordDict.items()}

def computeIDF(documents):
    """Compute the inverse document frequency of each word across documents.

    Parameters
    ----------
    documents : sequence of dict
        One word -> count mapping per document.

    Returns
    -------
    dict
        Maps each word to ``log(N / df)``, where ``N`` is the number of
        documents and ``df`` is the number of documents in which the word's
        count is greater than zero. Words that occur in no document get
        ``0.0`` (instead of raising ``ZeroDivisionError``). An empty
        ``documents`` sequence yields ``{}`` (instead of ``IndexError``).
        Keys follow the first document's order; words that only appear in
        later documents are appended (instead of raising ``KeyError``).
    """
    import math

    N = len(documents)
    if N == 0:
        return {}

    # Seed with the first document's vocabulary to preserve its key order.
    docFreq = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                docFreq[word] = docFreq.get(word, 0) + 1
            else:
                # Keep the word in the vocabulary even if it never occurs.
                docFreq.setdefault(word, 0)

    return {
        word: (math.log(N / float(freq)) if freq > 0 else 0.0)
        for word, freq in docFreq.items()
    }

def computeTFIDF(tfBagOfWords, idfs):
    """Combine term frequencies with IDF weights for one document.

    Parameters
    ----------
    tfBagOfWords : dict
        Maps each word to its term frequency in the document.
    idfs : dict
        Maps each word to its inverse document frequency; it must contain
        every word in ``tfBagOfWords`` (a missing word raises ``KeyError``).

    Returns
    -------
    dict
        Maps each word in ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}
