# NLP: Initial Text Pre-Processing

- Remove stop words
- Remove numbers
- Remove punctuation
- Underscore entities with more than one word (implemented in `underscore_entities`, but currently disabled inside `clean_text`)
- Tidy up Reuters news article text

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer, normalize
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.utils.extmath import randomized_svd
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

from nltk.tag import StanfordNERTagger
from string import punctuation
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora, models, similarities, matutils, models
import spacy

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



## Config

In [2]:
# Path of the Java executable exported as JAVAHOME (presumably for NLTK's
# Stanford NER wrapper imported above — confirm).
# An already-exported JAVAHOME takes precedence, so the notebook is not tied to
# the Windows-specific default install location below.
java_path = os.environ.get('JAVAHOME', "C:/Program Files/Java/jdk1.8.0_144/bin/java.exe")
os.environ['JAVAHOME'] = java_path

In [3]:
# Raw strings: the Windows backslashes must not be parsed as escape sequences.
# The non-raw originals only worked because none of the backslash pairs happens
# to be a valid escape, and they trigger invalid-escape warnings.
stanford_ner_path = r'C:\ds\stanford-ner\stanford-ner.jar'
stanford_classifier = r'C:\ds\stanford-ner\classifiers\english.muc.7class.distsim.crf.ser.gz'

In [4]:
# Load spaCy's English pipeline. The 'en' shortcut link only exists on spaCy 2.x;
# spaCy 3 removed shortcut links and raises OSError for it, so fall back to the
# full package name in that case.
try:
    nlp = spacy.load(name='en')
except OSError:
    nlp = spacy.load(name='en_core_web_sm')

In [5]:
# Base stop-word list: NLTK's English stop words (a list; it is extended with
# corpus-specific terms and converted to a set further down).
stops = stopwords.words('english')

In [6]:
# Corpus-specific stop words that are merged into `stops` below.
# Several terms appear in more than one casing (e.g. 'Reuters'/'reuters') because
# clean_text tests exact, case-sensitive membership and never lower-cases the text.
# NOTE(review): 'reuters.com' can never match — clean_text replaces every
# non-letter (including '.') with a space before splitting into words.
extra_stops = ['pct', 'percent', 'cent', 'high', 'low', 'top', 'news', 'topnew', 'topnews', 'GMT', 'BST', 'AM', 'PM',
              'Reuters', 'reuters', 'reuters.com', 'plc', 'PLC', 'visit', 'click', 'thomson', 'Thomson', 'thomsonreuters',
              'suggest', 'feedback', 'alert', 'email', 'best', 'cms', 'CMS', 'pageid', 'livemarket', 'client', 'link',
              'net', 'change', 'chg', 'open', 'site', 'eikon', 'EIKON', 'yld', 'yr', 'say', 'year', 'close', 'performance',
              'perform', 'performs', 'id', 'ID', 'pa', 'report', 'reports', 'reporting', 'share', 'break', 'recent', 'past',
              'point', 'said', 'index', 'data', 'new', 'points', 'market', 'markets', 'bn', 'Bn', 'Mn', 'mn', 'avg', 'average',
              'fell', 'fall', 'rose', 'rise', 'time', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 
               'Sunday', 'people', 'higher', 'lower', 'likely', 'Reporting']

In [7]:
# Merge the base list with the corpus-specific extras into a set, which gives
# fast membership tests in clean_text.
# set(...) is applied before combining so that re-running this cell (when
# `stops` is already a set) no longer fails with TypeError on `set + list`.
stops = set(stops).union(extra_stops)

## Functions

In [8]:
def underscore_entities(document):
    """Join multi-word named entities with underscores.

    Runs the module-level spaCy pipeline ``nlp`` over ``document`` and replaces
    every space that lies strictly inside a recognised entity span with ``'_'``.

    Parameters
    ----------
    document : str
        Text to process.

    Returns
    -------
    list of str
        A one-element list holding the rewritten text.
    """
    # Named `doc`, not `spacy`, so the imported spaCy module is not shadowed.
    doc = nlp(document)
    chars = list(str(doc))

    # Visit only the characters inside each entity instead of testing every
    # character of the document against every entity.
    for ent in doc.ents:
        # The character at start_char itself is never rewritten: the original
        # test was start < index < end.
        first = max(ent.start_char + 1, 0)
        last = min(ent.end_char, len(chars))
        for pos in range(first, last):
            if chars[pos] == ' ':
                chars[pos] = '_'

    return [''.join(chars)]

In [9]:
def clean_reuters(article):
    """Strip the Reuters dateline prefix from an article.

    Everything up to and including the first ``'(Reuters) - '`` marker is
    removed. Text without the marker is returned unchanged, as is any
    non-string value (e.g. a missing cell).

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    str
        The text after the marker, or ``article`` itself when no marker exists.
    """
    text_to_find = '(Reuters) - '
    # An explicit type guard replaces the former bare `except:`, which also
    # swallowed KeyboardInterrupt and genuine programming errors.
    if not isinstance(article, str):
        return article
    _, marker, remainder = article.partition(text_to_find)
    # `marker` is empty when the separator was not found.
    return remainder if marker else article

In [11]:
def clean_text(raw_text, stop=True):
    """Reduce an article to space-separated alphabetic tokens.

    Steps: rewrite ``'U.S'`` as ``'US'``, strip the Reuters dateline with
    ``clean_reuters``, turn every non-letter character into a space and split
    on whitespace. When ``stop`` equals ``True`` the tokens found in the
    module-level ``stops`` set are dropped. Matching is case-sensitive; the
    text is never lower-cased.

    Parameters
    ----------
    raw_text : str
        Original article text.
    stop : bool, default True
        Whether to remove stop words.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = clean_reuters(raw_text.replace('U.S', 'US'))
    tokens = re.sub('[^a-zA-Z]', ' ', text).split()
    # Entity underscoring (underscore_entities) is currently not applied here.

    # Equality test kept on purpose: a truthy value that is not equal to True
    # (e.g. a non-empty string) leaves the stop words in place.
    if stop == True:
        tokens = [token for token in tokens if token not in stops]
    return " ".join(tokens)

## Load Data

In [12]:
# Load the raw news articles; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('news_articles.csv')

In [13]:
# Check column names, dtypes and non-null counts before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84809 entries, 0 to 84808
Data columns (total 5 columns):
article_text    84809 non-null object
datetime        84809 non-null object
source          84809 non-null object
title           84809 non-null object
url             84809 non-null object
dtypes: object(5)
memory usage: 3.2+ MB


## Clean Article Text

In [13]:
# Run clean_text (default stop=True, i.e. with stop-word removal) on every
# article and store the result in a new column next to the raw text.
df['cleaned_article_text'] = df['article_text'].apply(clean_text)

In [14]:
# Eyeball 10 random rows, raw vs. cleaned. No random_state is passed, so a
# different sample is shown on every run.
df.sample(10)

Unnamed: 0,article_text,datetime,source,title,url,cleaned_article_text
52206,"TOKYO, Jan 15 (Reuters) - Japan's Nikkei share...",2013-01-14,Reuters,Nikkei may test level not seen since April 2010,http://uk.reuters.com/article/markets-japan-st...,Japan Nikkei expected may test mark level seen...
22977,* FTSE 100 down 0.4 pct * Estate agents slump ...,2016-11-23 00:00:00,Reuters,Britain's FTSE hit by commodity reversal,http://uk.reuters.com/article/britain-stocks-i...,Britain hit reversal mining energy stocks Wall...
44127,"(Adds stocks, detail) By Aiko Hayashi TOK...",2007-11-07,Reuters,"Japan stocks rebound as Softbank jumps, Toyota...",http://uk.reuters.com/article/markets-japan-st...,The Nikkei rebounded little logging seven week...
7796,\nA rise in local authority borrowing pushed u...,2014-03-21 11:36:25,Guardian,Government borrowing edges up in February,https://www.theguardian.com/business/2014/mar/...,A local authority borrowing pushed government ...
2325,\nSeeking health care in Ethiopia can be a dif...,2007-07-19 17:08:11,Guardian,Brain drain still hurting world's poorest coun...,https://www.theguardian.com/world/2007/jul/19/...,Seeking health care Ethiopia difficult task Fo...
3832,\nThe reaction from the City today as both Gor...,2010-05-07 17:21:05,Guardian,City confidence drains at prospect of horse-tr...,https://www.theguardian.com/business/2010/may/...,The reaction City today Gordon Brown David Cam...
81822,The Asia strategy team at SEB has published it...,2016-12-20 12:25:00,FT,Momentum is now favourable for Chinese equities,https://www.ft.com/content/cec378e6-c68c-11e6-...,The Asia strategy team SEB published three tra...
30255,"LONDON, April 9 (Reuters) - European stocks we...",2014-04-09 00:00:00,Reuters,European Factors to Watch-Shares set to open m...,http://uk.reuters.com/article/markets-europe-f...,European stocks seen broadly steady two sessio...
33961,* FTSEurofirst 300 index up 0.04 pct * Insu...,2009-10-26 00:00:00,Reuters,European shares little changed; insurers slip,http://uk.reuters.com/article/markets-europe-s...,European shares little changed pharma stocks o...
60334,By Ambar Warrick\n April 5 (Reuters) - ...,2017-04-05 00:00:00,Reuters,SE Asia Stocks-Slightly down; all eyes on Trum...,http://uk.reuters.com/article/southeast-asia-s...,Southeast Asian stock barring Philippines Indo...


## Save Data

In [15]:
# Persist the frame including the new cleaned column. index=False is the
# documented boolean form; the previous index=None only worked because None is
# falsy. Either way the row index is left out of the file.
df.to_csv('news_articles_cleaned.csv', index=False, encoding='utf-8')

In [19]:
# Spot check: raw text of a single article (row position 35750).
df.iloc[35750]['article_text']

'* FTSEurofirst 300 up 0.9 pct * BNP Paribas rises on relief at U.S. sanctions settlement * Chinese data lifts mining sector * BES shares volatile after short-selling ban By Tricia Wright LONDON, July 1 (Reuters) - European shares began the month with a gain, as BNP Paribas rose on relief it had settled a U.S. sanctions case and mining companies rallied after encouraging economic data came out of China, the world\'s top metals consumer. The pan-European FTSEurofirst 300 index closed up 0.9 percent at 1,382.31 points - notching its biggest one-day percentage gain since May 8. BNP Paribas rose 3.6 percent in trading volume of almost twice its 90-day daily average. It had lost about 20 percent - or $21 billion of its market value - since Feb. 13 when it announced the provision for the fine. The French bank pleaded guilty to two criminal charges and agreed to pay almost $9 billion to resolve allegations that in many financial dealings it violated U.S. sanctions against Sudan, Cuba and Iran

In [17]:
# Spot check: cleaned text of the same row (position 35750), for comparison
# with its raw article_text.
df.iloc[35750]['cleaned_article_text']

'European shares began month gain BNP Paribas relief settled US sanctions case mining companies rallied encouraging economic came China world metals consumer The pan European FTSEurofirst closed notching biggest one day percentage gain since May BNP Paribas trading volume almost twice day daily It lost billion value since Feb announced provision fine The French bank pleaded guilty two criminal charges agreed pay almost billion resolve allegations many financial dealings violated US sanctions Sudan Cuba Iran Analysts investors stock could recover ground lost last months The size fine knew reaction BNP extremely reassuring comments efforts made protect dividend The bank keeping targets must mean enjoyed good first half Montsegur Finance fund manager Francois Chaulet Mining stocks also demand upbeat factory activity China reinforced signs stabilisation economy Rio Tinto among performers falling underperforming mining sector whole An upgrade buy BofA Merrill Lynch citing factors including 