# EDA

In [2]:
# data munging
import pandas as pd
import numpy as np
from copy import deepcopy

# text preprocessing
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

## Importing data
### Loading dataframes

In [3]:
!ls cd data

ls: cd: No such file or directory
data:
all_articles_clean.csv    doc_topic.csv             time_df_count.csv
all_articles_topics.csv   movies.csv                time_df_norm.csv
archive_article_links.csv newspaper_data.xlsx       topic_by_year_df.csv
archive_article_soups.csv recent_articles_clean.csv topic_word.csv
archive_articles_raw.csv  recent_articles_raw.csv   user_movie_likes2.csv
archive_articles_raw2.csv ships.dta                 user_movie_likes_copy.csv
[34mbbc[m[m                       tableau_topic_dist.csv
bbc.zip                   time


In [4]:
# read the three raw scrapes; each file carries an 'Unnamed: 0' column
# (presumably a previously saved index) that is discarded on load
df1, df2, df3 = [
    pd.read_csv(csv_path).drop(columns='Unnamed: 0')
    for csv_path in (
        'data/recent_articles_raw.csv',
        'data/archive_articles_raw.csv',
        'data/archive_articles_raw2.csv',
    )
]

In [5]:
# recent articles
df1.shape

(7023, 6)

In [6]:
# archived articles, part 1
df2.shape

(74955, 6)

In [7]:
# keep only the first 29672 archive rows; per the original note, the rows
# after that point are missing body_text because of a scraping error
df2 = df2.head(29672)

In [8]:
df2.shape

(29672, 6)

In [9]:
# archived articles, part 2
df3.shape

(45280, 6)

### Concatenating dataframes

In [10]:
# stack the three frames and renumber the rows 0..n-1 in one step
df = pd.concat([df1, df2, df3], ignore_index=True)

In [11]:
df.shape

(81975, 6)

In [12]:
len(df)

81975

## Text preprocessing
### Removing rows with invalid body_text

Some rows either have no body_text at all or contain nothing but a YouTube link. This section drops those rows and removes URLs from the remaining rows.

In [13]:
df.tail(3)

Unnamed: 0,title,pub_date,author,source,body_text,url
81972,Scitex expands in Latin America p.,1993-02-27,Jim Rosenberg,,Jim RosenbergBy: Jim Rosenberg SCITEX EXPAN...,http://www.editorandpublisher.com//stories/sci...
81973,Child's death inspires Chicago Tribune contest...,1993-02-27,E&P Staff,,E&P StaffBy: Editorial Staff SEVENTY-ONE YE...,http://www.editorandpublisher.com//stories/chi...
81974,USPS nixes rule for small papers p.,1993-02-13,E&P Staff,,E&P StaffBy: Editorial Staff AFTER A MEETIN...,http://www.editorandpublisher.com//stories/usp...


In [14]:
# example of one row whose body_text is nothing but a YouTube link

df.iloc[7022]

title        New Albany Herald publisher talks print media'...
pub_date                                            2012-02-16
author                                               E&P Staff
source                                                     NaN
body_text                          http://youtu.be/fmLRTiUoxDQ
url          http://www.editorandpublisher.com//stories/new...
Name: 7022, dtype: object

In [15]:
# work on an independent copy so the concatenated frame `df` stays untouched
df_clean = df.copy(deep=True)

Dropping rows missing body_text

In [16]:
# discard every row whose body_text is NaN
df_clean = df_clean.dropna(subset=['body_text'])

In [17]:
df_clean.shape

(70257, 6)

Removing URLs, then dropping rows whose body text contained nothing other than a URL

In [18]:
# regex query: scheme (http/https/ftp/ftps), host, a 2-3 letter TLD and an
# optional path. Written as a raw string so the backslashes reach the regex
# engine as-is instead of being parsed as (invalid) Python string escapes.
# NOTE(review): the {2,3} TLD limit cuts matches short on longer TLDs such as
# ".info", leaving the URL tail in the text -- left unchanged here so results
# stay comparable; confirm whether it should be widened.

select_url_query = r'(http|https|ftp|ftps)\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(\/\S*)?'

In [19]:
# function to remove URLs

def cut_urls(x):
    """Return ``x`` with every match of ``select_url_query`` deleted."""
    return re.sub(select_url_query, '', x)

In [20]:
# strip URLs from every body; the untouched text stays in body_text

df_clean['body_text_clean'] = df_clean['body_text'].apply(cut_urls)

In [21]:
df_clean.tail(3)

Unnamed: 0,title,pub_date,author,source,body_text,url,body_text_clean
81972,Scitex expands in Latin America p.,1993-02-27,Jim Rosenberg,,Jim RosenbergBy: Jim Rosenberg SCITEX EXPAN...,http://www.editorandpublisher.com//stories/sci...,Jim RosenbergBy: Jim Rosenberg SCITEX EXPAN...
81973,Child's death inspires Chicago Tribune contest...,1993-02-27,E&P Staff,,E&P StaffBy: Editorial Staff SEVENTY-ONE YE...,http://www.editorandpublisher.com//stories/chi...,E&P StaffBy: Editorial Staff SEVENTY-ONE YE...
81974,USPS nixes rule for small papers p.,1993-02-13,E&P Staff,,E&P StaffBy: Editorial Staff AFTER A MEETIN...,http://www.editorandpublisher.com//stories/usp...,E&P StaffBy: Editorial Staff AFTER A MEETIN...


In [22]:
# converting empty or whitespace-only strings to None type

def conv(i):
    """Map blank values to None so a later dropna can remove them.

    A string made up only of whitespace (for example the spaces left behind
    when a body held nothing but URLs) counts as blank. Any other value is
    returned unchanged, except that falsy values become None.
    """
    if isinstance(i, str) and not i.strip():
        return None
    return i or None
df_clean['body_text_clean'] = list(map(conv, df_clean['body_text_clean']))

In [23]:
# discard rows whose cleaned body ended up as None/NaN
df_clean = df_clean.dropna(subset=['body_text_clean'])

In [24]:
df_clean.tail(3)

Unnamed: 0,title,pub_date,author,source,body_text,url,body_text_clean
81972,Scitex expands in Latin America p.,1993-02-27,Jim Rosenberg,,Jim RosenbergBy: Jim Rosenberg SCITEX EXPAN...,http://www.editorandpublisher.com//stories/sci...,Jim RosenbergBy: Jim Rosenberg SCITEX EXPAN...
81973,Child's death inspires Chicago Tribune contest...,1993-02-27,E&P Staff,,E&P StaffBy: Editorial Staff SEVENTY-ONE YE...,http://www.editorandpublisher.com//stories/chi...,E&P StaffBy: Editorial Staff SEVENTY-ONE YE...
81974,USPS nixes rule for small papers p.,1993-02-13,E&P Staff,,E&P StaffBy: Editorial Staff AFTER A MEETIN...,http://www.editorandpublisher.com//stories/usp...,E&P StaffBy: Editorial Staff AFTER A MEETIN...


In [25]:
df_clean.shape

(69848, 7)

### Removing unnecessary text

In [26]:
def remove_author(x):
    """Delete the boilerplate byline fragments from an article body.

    The fragments are removed one after another, in the order listed.
    """
    for fragment in ('E&P Staff', 'By:', 'Editorial Staff'):
        x = x.replace(fragment, '')
    return x

# overwrite the cleaned body with its byline-free version
df_clean['body_text_clean'] = df_clean['body_text_clean'].apply(remove_author)

### Adding titles to body text

Now that rows with missing/invalid body text are gone, I will append each remaining article's title to its body text.

In [27]:
# append each title to its cleaned body text; na_rep='' keeps the body when a
# title is missing instead of turning the whole row into NaN (which would then
# break the string cleaning applied to this column)
df_clean['full_text_clean'] = df_clean['body_text_clean'].str.cat(
    df_clean['title'], sep=' ', na_rep='')

### Cleaning strings

In [28]:
def clean_text_round1(text):
    '''Normalise raw article text ahead of tokenizing.

    Steps, applied in this order:
      1. lowercase everything
      2. remove any [bracketed] span that sits on a single line
      3. delete ASCII punctuation (deleted, not replaced by a space, so
         "seventy-one" becomes "seventyone")
      4. remove every word that contains a digit
      5. delete curly quotes, the ellipsis character and the en dash
      6. turn newlines into spaces so words on adjacent lines stay separate

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text; runs of spaces are not collapsed.
    '''
    text = text.lower()
    # raw strings keep the regex backslashes from being read as string escapes
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…–]', '', text)
    # a space (not '') so "end\nstart" does not collapse into "endstart"
    text = text.replace('\n', ' ')
    return text

In [29]:
# Text preprocessing steps - lowercase the text and strip numbers and punctuation

df_clean['full_text_clean'] = df_clean['full_text_clean'].apply(clean_text_round1)

In [30]:
df_clean.full_text_clean.tail()

81970     mark fitzgerald    traveling exhibit honors e...
81971     debra gersh    freedom forum media studies ce...
81972    jim rosenberg jim rosenberg    scitex expanded...
81973         seventyone years after sponsoring the int...
81974         after a meeting with representatives from...
Name: full_text_clean, dtype: object

### Tokenizing

In [31]:
# split each cleaned article into a list of word tokens
df_clean['tokens'] = df_clean['full_text_clean'].map(word_tokenize)

### Stemming

In [32]:
# Porter-stem every token, keeping one list of stems per article
stemmer = PorterStemmer()

df_clean['stemmed'] = df_clean['tokens'].map(
    lambda toks: list(map(stemmer.stem, toks)))

### Formatting for vectorizers

Joining the token lists in the new columns back into strings to feed into the count/TF-IDF vectorizers

In [33]:
# collapse each token list into one space-separated string
# (re-running this on already-joined strings would space out single characters)
df_clean['tokens'] = df_clean['tokens'].map(' '.join)

In [34]:
# collapse each stem list into one space-separated string
# (re-running this on already-joined strings would space out single characters)
df_clean['stemmed'] = df_clean['stemmed'].map(' '.join)

### Saving to CSV

Now it's time to move on to dimensionality reduction and topic modeling.

In [35]:
# write the cleaned frame to disk; the row index is written as the first column

df_clean.to_csv('data/all_articles_clean.csv', index=True)

In [36]:
df_clean.head()

Unnamed: 0,title,pub_date,author,source,body_text,url,body_text_clean,full_text_clean,tokens,stemmed
0,Four Politicians in Four Debates Isn’t Enough!,2020-08-09,Mike Blinder,,"As of today, The Commission on Presidential De...",http://www.editorandpublisher.com//stories/fou...,"As of today, The Commission on Presidential De...",as of today the commission on presidential deb...,as of today the commission on presidential deb...,as of today the commiss on presidenti debat ha...
1,Chatham Names Former Tribune Executive as New ...,2020-08-07,Kevin G. Hall,McClatchyDC,"Chatham Asset Management, the hedge fund that ...",http://www.editorandpublisher.com//stories/cha...,"Chatham Asset Management, the hedge fund that ...",chatham asset management the hedge fund that w...,chatham asset management the hedge fund that w...,chatham asset manag the hedg fund that won mcc...
2,News Corp Reports Fourth Quarter and Full Year...,2020-08-07,Press Release,News Corp,News Corporation (“News Corp” or the “Company”...,http://www.editorandpublisher.com//stories/new...,News Corporation (“News Corp” or the “Company”...,news corporation news corp or the company nasd...,news corporation news corp or the company nasd...,news corpor news corp or the compani nasdaq nw...
3,WAMU General Manager Resigns After Tumultuous ...,2020-08-07,Elahe Izadi and Paul Farhi,Washington Post,The head of Washington’s NPR affiliate resigne...,http://www.editorandpublisher.com//stories/wam...,The head of Washington’s NPR affiliate resigne...,the head of washingtons npr affiliate resigned...,the head of washingtons npr affiliate resigned...,the head of washington npr affili resign frida...
4,Misinformation is Everybody’s Problem Now,2020-08-07,Joan Donovan and Claire Wardle,Items,"In June 2020, The World Health Organization (W...",http://www.editorandpublisher.com//stories/mis...,"In June 2020, The World Health Organization (W...",in june the world health organization who beg...,in june the world health organization who bega...,in june the world health organ who began a web...
