In [1]:
# Basic imports
import os
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Clustering packages
import sklearn.cluster as cluster
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation

# Natural Language processing
import re
import spacy
import nltk
from nltk.corpus import stopwords, twitter_samples, gutenberg
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_rcv1

In [2]:
# Load every CSV file found in the data directory and stack them into one frame.
ARTICLES_DIR = 'articles'

# os.listdir() returns bare file names in arbitrary order. Keep only the CSV
# files (so stray entries such as hidden files or sub-directories are not
# passed to read_csv) and sort them so the row order is reproducible.
filelist = sorted(
    name for name in os.listdir(ARTICLES_DIR)
    if name.lower().endswith('.csv')
)

# Import the files. Each name is joined back onto the directory; without this
# read_csv would look for the file in the current working directory instead.
df_list = [pd.read_csv(os.path.join(ARTICLES_DIR, file)) for file in filelist]

# Concatenate them together. The default ignore_index=False is kept, so each
# file's own row labels are preserved in the combined frame.
articles = pd.concat(df_list)

# Preview the data
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [3]:
# Size of the combined frame as (rows, columns)
articles.shape

(142570, 10)

In [4]:
# Count the distinct values in each object-dtype column
object_columns = articles.select_dtypes(include=['object'])
object_columns.nunique()

title          142132
publication        15
author          15647
date             1646
url             85559
content        142038
dtype: int64

In [5]:
# Restrict the frame to the four columns kept for the analysis; the remaining
# columns are treated as having no impact on the outcome.
columns_to_keep = ['title', 'publication', 'author', 'content']
articles = articles[columns_to_keep]

In [6]:
# Number of articles per author, most prolific authors first
articles_per_author = articles.groupby(['author']).size()
articles_per_author.sort_values(ascending=False)

author
Breitbart News                                                      1559
Pam Key                                                             1282
Associated Press                                                    1231
Charlie Spiering                                                     928
Jerome Hudson                                                        806
John Hayward                                                         747
Daniel Nussbaum                                                      735
AWR Hawkins                                                          720
Ian Hanchett                                                         647
Joel B. Pollak                                                       624
Post Editorial Board                                                 620
Alex Swoyer                                                          604
Camila Domonoske                                                     593
Warner Todd Huston                          

In [7]:
# Drop every author who wrote fewer than MIN_ARTICLES_PER_AUTHOR articles
MIN_ARTICLES_PER_AUTHOR = 5

# Articles per author. value_counts() skips missing values by default, so a
# missing author never appears in these counts.
vc = articles['author'].value_counts()

# Authors below the threshold, computed once instead of rebuilding a set for
# every row of the frame.
rare_authors = vc[vc < MIN_ARTICLES_PER_AUTHOR].index

# Row mask: True for rows whose author is not in rare_authors. Rows with a
# missing author are therefore kept, exactly as before. The mask is converted
# to a plain array so the selection is positional and does not rely on index
# labels.
u = (~articles['author'].isin(rare_authors)).to_numpy()
articles = articles[u]

In [8]:
# Re-count the distinct values in each object-dtype column after filtering
object_columns_filtered = articles.select_dtypes(include=['object'])
object_columns_filtered.nunique()

title          124811
publication        15
author           3063
content        124724
dtype: int64

In [9]:
# Size of the frame as (rows, columns) at this point in the notebook
articles.shape

(125223, 4)

After removing authors who wrote fewer than 5 articles, we are left with about 125k articles (87.8% of the data) and roughly 3k of the original 15k authors. Because every remaining author now has at least 5 articles, we have enough text per author to build a more reliable representation of each one.

In [10]:
# Number of articles per remaining author, most prolific authors first
articles_per_author_filtered = articles.groupby(['author']).size()
articles_per_author_filtered.sort_values(ascending=False)

author
Breitbart News                                 1559
Pam Key                                        1282
Associated Press                               1231
Charlie Spiering                                928
Jerome Hudson                                   806
John Hayward                                    747
Daniel Nussbaum                                 735
AWR Hawkins                                     720
Ian Hanchett                                    647
Joel B. Pollak                                  624
Post Editorial Board                            620
Alex Swoyer                                     604
Camila Domonoske                                593
Warner Todd Huston                              545
NPR Staff                                       514
Jeff Poor                                       505
Merrit Kennedy                                  484
Trent Baker                                     457
Breitbart London                                447
Kathe