# Unsupervised learning Capstone (name TBA)
Author: Matthew Huh
    
# About the Data

Collection of 142,570 articles from 15 different publications...

# Research Question

...

# Overview

...

# Packages

In [33]:
# Basic imports
import os
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Clustering packages
import sklearn.cluster as cluster
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation

# Natural Language processing
import re
import spacy
import nltk
from nltk.corpus import stopwords, twitter_samples, gutenberg
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_rcv1

# Machine Learning packages
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Data Preview

In [2]:
# Create list of files from directory
filelist = os.listdir('articles')

# Import the files
df_list = [pd.read_csv(file) for file in filelist]

#concatenate them together
articles = pd.concat(df_list)

# Preview the data
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [3]:
articles.shape

(142570, 10)

In [4]:
articles.select_dtypes(include=['object']).nunique()

title          142132
publication        15
author          15647
date             1646
url             85559
content        142038
dtype: int64

In [5]:
# Drop variables that have no impact on the outcome
articles = articles[['title', 'publication', 'author', 'content']]

In [6]:
articles.groupby(['author']).size().sort_values(ascending=False)

author
Breitbart News                                                      1559
Pam Key                                                             1282
Associated Press                                                    1231
Charlie Spiering                                                     928
Jerome Hudson                                                        806
John Hayward                                                         747
Daniel Nussbaum                                                      735
AWR Hawkins                                                          720
Ian Hanchett                                                         647
Joel B. Pollak                                                       624
Post Editorial Board                                                 620
Alex Swoyer                                                          604
Camila Domonoske                                                     593
Warner Todd Huston                          

Well, that partly explains how there are so many authors in this dataset. It seems as though there are over 15,000 authors, and many of them have only published one article, or have co-written multiple articles with other authors. This complicates the problem, so in order to best represent each author's writing style, let's see what happens if we simply remove all authors that only published one article as is.

# Feature Selection

In [7]:
# Drop author from the dataframe if they wrote less than 5 articles
vc = articles['author'].value_counts()
u  = [i not in set(vc[vc<=4].index) for i in articles['author']]
articles = articles[u]

In [8]:
# Reprint how many unique authors there are
articles.select_dtypes(include=['object']).nunique()

title          124811
publication        15
author           3063
content        124724
dtype: int64

In [11]:
# View number of articles after feature selection
articles.shape

(125223, 4)

So after removing authors that composed fewer than 5 articles, we are left with 125k articles, or 87.8% of the data, and roughly 3k/15k of the authors. Now, we can create a better representation of each author since each author has at least 5 articles to evaluate from.

In [12]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [15]:
articles['content'] = articles.content.map(lambda x: text_cleaner(str(x)))
articles['content'].head()

0    WASHINGTON — Congressional Republicans have a ...
2    When Walt Disney’s “Bambi” opened in 1942, cri...
4    SEOUL, South Korea — North Korea’s leader, Kim...
5    LONDON — Queen Elizabeth II, who has been batt...
6    BEIJING — President Tsai of Taiwan sharply cri...
Name: content, dtype: object

In [16]:
# Identify predictor and target variables
X = articles['content']
y = articles['publication']

# Create training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Tf-idf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

#Applying the vectorizer
X_tfidf=vectorizer.fit_transform(X)
print("Number of features: %d" % X_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.25, random_state=42)

#Removes all zeros from the matrix
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]

#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

#List of features
terms = vectorizer.get_feature_names()

#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original article:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

Number of features: 143632
Original sentence: I have been battling depression and sleeplessness while thinking about how to fight Donald Trump and what his rise means about the United States of America. It is dispiriting that after both modest and substantial gains have been made during the Obama and Black Lives Matter years (on race, gender, political imagination, LGBT rights and healthcare) many stand to be rolled back with a vengeance in the coming months and years. The retrenchment is frightening. And it’s become clear that this fight is going to last for the rest of my life. For advice on how to deal with this, I turned to my late father, Bill Thrasher. He was an air force sergeant who went to night school as an adult to become a high school and community college teacher of US and “Afro American” history. My dad died suddenly, in 2003, while teaching a class full of students a blood clot killed him instantly while he was lecturing about the civil war and the reconstruction. His su

Tf_idf vector: {'wallowed': 0.06931321625421569, 'thorniest': 0.06363930065064267, 'amenable': 0.05434236466942158, 'imploded': 0.05677460991608222, 'centrists': 0.056243465874808024, 'rep': 0.028934347418638647, 'substantively': 0.054178288918359074, 'faltered': 0.05381004305094117, 'manifested': 0.052096535996925306, 'enrollees': 0.04972473410552688, 'chopping': 0.04995449643779567, 'recess': 0.04414323379094647, 'gop': 0.141725614999326, 'allotted': 0.052176811867940986, 'iteration': 0.047161854553466095, 'plague': 0.0484404949865122, 'fallout': 0.040083951182843566, 'seeded': 0.056853957721552634, 'waiver': 0.045572223204764624, 'opt': 0.08420036636852049, 'opting': 0.04741939014331366, 'departed': 0.043801208858714365, 'meadows': 0.04620175193016908, 'waivers': 0.04687623045282667, 'hash': 0.052258037771310537, 'mulvaney': 0.09193813666859636, '115': 0.04661996302583731, 'illustrative': 0.056541926320211336, 'firmly': 0.03903188165195264, 'dominate': 0.039712860337673205, 'inabili

In [38]:
# Examining shapes 
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(93917, 143632)
(31306, 143632)


In [30]:
from sklearn.preprocessing import normalize
X_norm = normalize(X_train_tfidf)

In [31]:
X_norm

<93917x143632 sparse matrix of type '<class 'numpy.float64'>'
	with 21890402 stored elements in Compressed Sparse Row format>

# Clustering

### K-means

In [39]:
num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(X_tfidf)

clusters = km.labels_.tolist()

KeyboardInterrupt: 

AttributeError: 'KMeans' object has no attribute 'labels_'

# Modelling

### Random Forest

In [34]:
tf_rfc = ensemble.RandomForestClassifier()
train = tf_rfc.fit(X_train_tfidf, y_train)

print('Training set score:', tf_rfc.score(X_train_tfidf, y_train))
print('\nTest set score:', tf_rfc.score(X_test_tfidf, y_test))

Training set score: 0.9943780146299392

Test set score: 0.44330160352648057


### Logistic Regression

In [35]:
tf_lr = LogisticRegression()
train = tf_lr.fit(X_train_tfidf, y_train)

print('Training set score:', tf_lr.score(X_train_tfidf, y_train))
print('\nTest set score:', tf_lr.score(X_test_tfidf, y_test))

Training set score: 0.8200538773597964

Test set score: 0.722545199003386
