### Set up libraries

In [44]:
import pandas as pd
import numpy as np

from textblob import TextBlob  # for sentiment analysis
import spacy 
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

stop_words = spacy.lang.en.stop_words.STOP_WORDS

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

### Import data

In [3]:
# Open the file
df = pd.read_csv('data/news_articles.csv', encoding='unicode_escape')
df = df.iloc[:,0:6]

# Save out the name of news publication
df['source'] = [url.split('/')[2] for url in df['article_source_link']]

### Preprocess data

In [4]:
# Drop rows with no article text
print('Shape before:', df.shape)
df = df.dropna(subset=['text'])
print('Shape after dropping rows with no article text:', df.shape)

Shape before: (3824, 7)
Shape after dropping rows with no article text: (3791, 7)


### (testing)

In [9]:
df = df.head()
df

# cluster the news articles to figure out what topics they generally are about
# calculate a sentiment score (polarity & subjectivity) for each -- use the mean and rms? (as magnitude measure)
# pull out  average polarity
# pull out average subjectivity
# pull out 3 most "intense sentneces



ValueError: too many values to unpack (expected 2)

In [47]:
# Preprocess text
#TODO: Make this more performant and not traverse everything each time? use textblob?
#TODO: Add pydoc comments

def preprocess_text(text):
    # Tokenize
    doc = nlp(text)
#     [token.text for token in doc]

    # Remove punctuation & stopwords
    doc = [t for t in doc if t not in stop_words and t.pos_ != 'PUNCT']

    # Normalize (convert non-text to text)
    
    # Stemming/Lemmatization (& lowercase & strip)
    doc = [t.lemma_.lower().strip() for t in doc]

    return doc

In [None]:
def cluster_text(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering

In [48]:
preprocess_text(df.text[0])

['michigan',
 'billionaire',
 'education',
 'activist',
 'betsy',
 'devos',
 'be',
 'confirm',
 'today',
 'to',
 'serve',
 'as',
 'the',
 'secretary',
 'of',
 'education',
 'in',
 'president',
 'trump',
 "'s",
 'administration',
 'after',
 'vice',
 'president',
 'mike',
 'pence',
 'cast',
 'a',
 'tie',
 'break',
 'vote',
 'in',
 'the',
 'senate',
 'the',
 'senate',
 'vote',
 'on',
 'devos"?highly',
 'contentious',
 'nomination',
 'this',
 'afternoon',
 'and',
 'the',
 'tally',
 'be',
 'split',
 'evenly',
 'require',
 'pence',
 'to',
 'use',
 '-pron-',
 'authority',
 'as',
 'president',
 'of',
 'the',
 'upper',
 'chamber',
 'of',
 'congress',
 'to',
 'break',
 'the',
 'impasse',
 'this',
 'be',
 'the',
 'first',
 'time',
 'that',
 'a',
 'vice',
 'president',
 'have',
 'break',
 'a',
 'tie',
 'to',
 'confirm',
 'a',
 'cabinet',
 'nominee',
 'pence',
 'read',
 'the',
 'vote',
 'count',
 '50',
 '-',
 '50',
 'and',
 'then',
 'vote',
 '-pron-',
 'render',
 'the',
 'tally',
 '51',
 '-',
 '50'

In [21]:
def get_sentiment(article):
    try:
        return TextBlob(article).sentiment
    except:
        return None

df['pol_sub'] = df.text.apply(get_sentiment)
df['polarity'] = df['pol_sub'].apply(lambda x: x[0])
df['subjectivity'] = df['pol_sub'].apply(lambda x: x[1])

#NOTE: these scores are for entire articles

df


Unnamed: 0,article_id,publish_date,article_source_link,title,subtitle,text,source,pol_sub,polarity,subjectivity
0,1,2017/2/7,http://abcnews.go.com/Politics/pence-break-tie...,"Betsy DeVos Confirmed as Education Secretary, ...",,Michigan billionaire education activist Betsy ...,abcnews.go.com,"(0.030378787878787884, 0.42757575757575755)",0.030379,0.427576
1,2,2017/2/7,http://abcnews.go.com/Politics/wireStory/melan...,Melania Trump Says White House Could Mean Mill...,,First lady Melania Trump has said little about...,abcnews.go.com,"(0.06409090909090906, 0.37349954771596555)",0.064091,0.3735
2,3,2017/2/7,http://abcnews.go.com/Politics/wireStory/trump...,"As Trump Fears Fraud, GOP Eliminates Election ...",,A House committee voted on Tuesday to eliminat...,abcnews.go.com,"(0.07278344671201815, 0.39367913832199536)",0.072783,0.393679
3,4,2017/2/7,http://abcnews.go.com/Politics/appeals-court-d...,Appeals Court to Decide on Challenge to Trump'...,,"This afternoon, three federal judges from the ...",abcnews.go.com,"(0.09401373988330514, 0.4382505175983437)",0.094014,0.438251
4,5,2017/2/7,http://abcnews.go.com/US/23-states-winter-weat...,At Least 4 Tornadoes Reported in Southeast Lou...,,At least four tornadoes touched down in Louisi...,abcnews.go.com,"(0.02194023569023569, 0.35596921596921605)",0.02194,0.355969


#### Possible areas for improvement   

* Using sentiment analysis for phrases instead of entire sentences (or even words)
* Improving how StanfordNLP is parsing sentences (some of them are clauses instead)--pulling out punctuation in preprocessing may help this?
* How good is Stanford NLP's dependency parser + pos tagger? (false positives? false negatives?)
* Pulling out quotes when someone is saying something negative

* What metrics am I trying to measure against/to improve?"