# Unsupervised learning Capstone (name TBA)
Author: Matthew Huh
    
## About the Data

Collection of 142,570 articles from 15 different publications...

## Research Question

...

## Overview

...

## Packages

In [1]:
# Basic imports
import os
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Clustering packages
import sklearn.cluster as cluster
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation

# Natural Language processing
import re
import spacy
import nltk
from nltk.corpus import stopwords, twitter_samples, gutenberg
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_rcv1

# Machine Learning packages
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Data Preview

In [2]:
# Create list of files from directory
filelist = os.listdir('articles')

# Import the files
df_list = [pd.read_csv(file) for file in filelist]

#concatenate them together
articles = pd.concat(df_list)

# Preview the data
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [3]:
articles.shape

(142570, 10)

In [4]:
articles = articles.sample(frac=0.1)

In [5]:
articles.select_dtypes(include=['object']).nunique()

title          14247
publication       15
author          3917
date            1027
url             8595
content        14238
dtype: int64

In [6]:
# Drop variables that have no impact on the outcome
articles = articles[['title', 'publication', 'author', 'content']]

In [7]:
articles.groupby(['author']).size().sort_values(ascending=False)

author
Breitbart News                                         147
Associated Press                                       127
Pam Key                                                127
Joel B. Pollak                                          82
Camila Domonoske                                        75
Jerome Hudson                                           74
Daniel Nussbaum                                         73
Charlie Spiering                                        71
Post Editorial Board                                    67
AWR Hawkins                                             66
John Hayward                                            66
Ian Hanchett                                            63
Alex Swoyer                                             62
NPR Staff                                               58
Warner Todd Huston                                      53
Katherine Rodriguez                                     52
Reuters                                          

Well, that partly explains how there are so many authors in this dataset. It seems as though there are over 15,000 authors, and many of them have only published one article, or have co-written multiple articles with other authors. This complicates the problem, so in order to best represent each author's writing style, let's see what happens if we simply remove all authors that only published one article as is.

## Feature Selection

In [8]:
# Drop author from the dataframe if they wrote less than 5 articles
vc = articles['author'].value_counts()
u  = [i not in set(vc[vc<=4].index) for i in articles['author']]
articles = articles[u]

In [9]:
# Reprint how many unique authors there are
articles.select_dtypes(include=['object']).nunique()

title          9342
publication      15
author          615
content        9334
dtype: int64

In [10]:
# View number of articles after feature selection
articles.shape

(9348, 4)

So after removing authors that composed fewer than 5 articles, we are left with 125k articles, or 87.8% of the data, and roughly 3k/15k of the authors. Now, we can create a better representation of each author since each author has at least 5 articles to evaluate from.

In [11]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [12]:
articles['content'] = articles.content.map(lambda x: text_cleaner(str(x)))
articles.head()

Unnamed: 0,title,publication,author,content
3583,Nice attack: French police arrest two more peo...,Guardian,Sam Jones,French police have arrested a man and a woman ...
15629,New Video Shows LaVoy Finicumâ€™s Last Moments...,Talking Points Memo,,Cell phone footage taken from the backseat of ...
7877,Fugitive Paris attacker Salah Abdeslam capture...,Fox News,,"Salah Abdeslam, the main fugitive from Islamic..."
33099,We’re sending too much junk into space,New York Post,"Mike Wehner, BGR",Space is big and the hardware that humans keep...
43518,Trump just received the lowest approval rating...,Business Insider,Brett LoGiurato,’ ’ ’ US President Donald Trump received the l...


In [13]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

for article in articles['content']:
    article = lemmatizer.lemmatize(article)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Identify predictor and target variables
X = articles['content']
y = articles['publication']

# Create training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [27]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.99, random_state=42)

In [23]:
X_train.shape

(7011,)

## Tf-idf Vectorization

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=5, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

#Applying the vectorizer
X_tfidf=vectorizer.fit_transform(X)
print("Number of features: %d" % X_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.25, random_state=42)

#Removes all zeros from the matrix
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]

#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

#List of features
terms = vectorizer.get_feature_names()

#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

Number of features: 26856


In [20]:
# Examining shapes 
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(7011, 26856)
(2337, 26856)


In [21]:
from sklearn.preprocessing import normalize
X_norm = normalize(X_train_tfidf)

In [31]:
# Instantiating spaCy
nlp = spacy.load('en')
X_train_words = []

for row in X_train:
    # Processing each row for tokens
    row_doc = nlp(row)
    # Calculating length of each sentence
    sent_len = len(row_doc) 
    # Initializing counts of different parts of speech
    advs = 0
    verb = 0
    noun = 0
    adj = 0
    for token in row_doc:
        # Identifying each part of speech and adding to counts
        if token.pos_ == 'ADV':
            advs +=1
        elif token.pos_ == 'VERB':
            verb +=1
        elif token.pos_ == 'NOUN':
            noun +=1
        elif token.pos_ == 'ADJ':
            adj +=1
    # Creating a list of all features for each sentence
    X_train_words.append([row_doc, advs, verb, noun, adj, sent_len])

IndexError: list index out of range

In [28]:
# Instantiating spaCy
nlp = spacy.load('en')
X_train_words = []

for row in X_train2:
    # Processing each row for tokens
    row_doc = nlp(row)
    # Calculating length of each sentence
    sent_len = len(row_doc) 
    # Initializing counts of different parts of speech
    advs = 0
    verb = 0
    noun = 0
    adj = 0
    for token in row_doc:
        # Identifying each part of speech and adding to counts
        if token.pos_ == 'ADV':
            advs +=1
        elif token.pos_ == 'VERB':
            verb +=1
        elif token.pos_ == 'NOUN':
            noun +=1
        elif token.pos_ == 'ADJ':
            adj +=1
    # Creating a list of all features for each sentence
    X_train_words.append([row_doc, advs, verb, noun, adj, sent_len])

In [29]:
X_counter = pd.DataFrame(data=X_train_words, columns=['BOW', 'ADV', 'VERB', 'NOUN', 'ADJ', 'sent_length'])

In [30]:
X_counter

Unnamed: 0,BOW,ADV,VERB,NOUN,ADJ,sent_length
0,"(America, ’s, central, bank, is, nudging, its,...",42,118,172,59,811
1,"(As, time, goes, by, MLB, ’s, home, run, leade...",30,61,96,40,505
2,"(A, foreign, government, has, revealed, anothe...",15,72,83,29,538
3,"(LAS, VEGAS, —, sensation, point, guard, Chass...",19,72,96,30,476
4,"(Imagine, if, ,, as, happens, every, thousand,...",94,334,461,181,2328
5,"(President, Donald, Trump, stands, by, his, be...",10,49,67,27,348
6,"(How, close, was, notorious, Mexican, drug, lo...",35,104,124,33,613
7,"((, CNN, ), For, the, second, time, in, a, wee...",34,101,98,52,607
8,"(”, Everyone, in, business, wants, to, know, w...",0,4,3,0,13
9,"(If, you, ’ve, ever, been, addicted, to, an, a...",14,50,60,21,279


In [None]:
X_normal  = pd.DataFrame(data=X_norm.toarray())

In [None]:
features = pd.concat([X_counter,X_normal])

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Instantiating and fitting the 150 best features
kbest = SelectKBest(chi2, k=150)
X2_train = kbest.fit_transform(X_, y_train)

# Clustering

### K-means

In [None]:
# Calulate predicted values
kmeans = KMeans(n_clusters=15, init='k-means++', random_state=42, n_init=20)
y_pred = kmeans.fit_predict(X_train)

pd.crosstab(y_train, y_pred)

In [None]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score

print('Adjusted Rand Score: {:0.7}'.format(adjusted_rand_score(y_train, y_pred)))
print('Silhouette Score: {:0.7}'.format(silhouette_score(X_train, y_pred, metric='euclidean')))

# Modelling

### Random Forest

In [None]:
tf_rfc = ensemble.RandomForestClassifier()
train = tf_rfc.fit(X_train_tfidf, y_train)

print('Training set score:', tf_rfc.score(X_train_tfidf, y_train))
print('\nTest set score:', tf_rfc.score(X_test_tfidf, y_test))

### Logistic Regression

In [None]:
tf_lr = LogisticRegression()
train = tf_lr.fit(X_train_tfidf, y_train)

print('Training set score:', tf_lr.score(X_train_tfidf, y_train))
print('\nTest set score:', tf_lr.score(X_test_tfidf, y_test))

# Source

https://www.kaggle.com/snapcrack/all-the-news