In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import re
import string

# Scrape URLs of all articles

In [None]:
from scrape_urls import *

# Commented out because it starts a long process
# urls_2019 = get_months(2019)

# with open('url_19.txt', 'w') as fout:
#     fout.write('\n'.join(urls_2019))

Once the URLs were scraped, they were fed into Scrapy, which fetched the actual articles.

Article data was loaded into MongoDB.

# Pull text data from mongo

In [None]:
# Connect to MongoDB using pymongo's default connection settings.
client = MongoClient()

# Handle to the `proj4` database that the article data is read from.
db = client['proj4']

In [None]:
# Fetch every document in `all_19`, keeping only the listed fields and
# leaving out Mongo's `_id`.
wanted_fields = {'title': 1, 'text': 1, 'date': 1, 'url': 1, '_id': 0}
data = list(db.all_19.find({}, wanted_fields))

tds_data = pd.DataFrame(data)

# Clean text data

To clean the raw text, I removed all non-ASCII characters (such as emojis), made all words lowercase, replaced punctuation with spaces, and removed every token that contains a digit.

This left the documents in a much more standardized state where they were easier to work with.

## Cleaning Helper Functions

In [None]:
# Compiled once so they are not rebuilt for every article.
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
# Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')


def del_emoji(text):
    """Return `text` with every non-ASCII character (emoji included) removed."""
    return text.encode('ascii', 'ignore').decode('ascii')


def clean_text(text):
    """Normalise one article's raw text.

    Steps, in order:
      1. drop non-ASCII characters and lowercase the result,
      2. replace each punctuation character with a space,
      3. replace every token that contains a digit with a space.

    Whitespace is not collapsed, so the result may contain runs of spaces.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (e.g. the NaN pandas uses for
        a missing field, or None) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text, or '' when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''

    cleaned = del_emoji(text).lower()
    cleaned = _PUNCT_RE.sub(' ', cleaned)
    return _DIGIT_TOKEN_RE.sub(' ', cleaned)


# Overwrite the raw article text with its cleaned form.
tds_data['text'] = tds_data['text'].map(clean_text)

In addition to cleaning each article's text, I dropped all articles shorter than 500 words. This eliminated a number of articles that were not parsed properly during scraping and left me with slightly longer articles that had more well-defined topics.

The URLs of articles all had a '?' followed by some kind of hex key. I needed to remove this key from the end to be able to merge with the claps data which I scraped later on.

In [None]:
# Drop articles with fewer than MIN_WORDS words.
# `.str.split()` with no separator splits on any run of whitespace, so the
# extra blanks left behind by cleaning are not counted as words (splitting on
# ' ' counts every empty string between consecutive spaces and ignores newlines).
MIN_WORDS = 500
word_counts = tds_data['text'].str.split().str.len()

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
tds_data = tds_data[word_counts >= MIN_WORDS].copy()

# Keep only the part of each URL before the first '?'.
tds_data['url'] = tds_data['url'].str.split('?').str[0]

# Stemming and lemmatization

I stemmed and lemmatized all of my documents to standardize the forms of all the words and find more commonality between documents.

I was hoping to use spaCy for lemmatization because I find its lemmatizer to be more consistent than NLTK's, but I did not have time to implement that.

In [None]:
from nltk import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

# Module-level NLTK helpers: an English Snowball stemmer and a WordNet lemmatizer.
# LancasterStemmer is imported but not used in the cells visible here.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

In [None]:
def stemmify(text):
    """Return `text` with each word replaced by its stem.

    Words are separated on any run of whitespace (spaces, tabs, newlines), so
    words on either side of a line break are stemmed individually and no empty
    strings are passed to the stemmer. The stems are joined with single spaces.
    Uses the module-level `stemmer`.
    """
    return ' '.join(stemmer.stem(word) for word in text.split())

def lemmafy(text):
    """Return `text` with each word replaced by its lemma.

    Words are separated on any run of whitespace (spaces, tabs, newlines), so
    words on either side of a line break are lemmatized individually and no
    empty strings are passed to the lemmatizer. The lemmas are joined with
    single spaces. Uses the module-level `lemmatizer`.
    """
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

In [None]:
# Add stemmed and lemmatized versions of the text as new columns.
tds_data['stemmed'] = tds_data['text'].map(stemmify)
tds_data['lemmad'] = tds_data['text'].map(lemmafy)


# Pull claps data

I had to go back and scrape claps (likes) for each of my articles after the fact, so I pulled them in from a separate JSON and merged it with the text data.

In [None]:
import json

# Parse the claps JSON file straight from the open handle.
with open('./scrapy/tds/tds/claps.json', 'r') as j:
    clap_json = json.load(j)

# Put the parsed records in a DataFrame.
claps = pd.DataFrame(clap_json)

## Cleaning claps

I did a bit of work cleaning up the clap data, filling Nulls and trimming the URLs to be able to merge with the text.

Articles with >1000 claps are listed as having #.#K claps, so I converted the K to a multiple of 1000 with a short function.

In [None]:
# Missing clap counts become the string '0'; surrounding whitespace is removed.
claps['claps'] = claps['claps'].fillna('0').apply(lambda raw: raw.strip())

# Keep only the part of each URL before the first '?'.
claps['url'] = claps['url'].apply(lambda link: link.split('?')[0])

def convert_claps(clap_str):
    """Convert a scraped clap count to an integer.

    Accepts plain integers ('42') and the abbreviated thousands form
    ('1.2K' -> 1200). Anything that cannot be interpreted yields 0.

    Parameters
    ----------
    clap_str : str
        The scraped clap count. Values `int()` already accepts (e.g. an int)
        are passed through; other non-string values yield 0.

    Returns
    -------
    int
        The clap count, or 0 when the value cannot be parsed.
    """
    try:
        return int(clap_str)
    except (TypeError, ValueError, OverflowError):
        pass  # not a plain integer; try the 'K' form below

    if not isinstance(clap_str, str):
        return 0

    trimmed = clap_str.strip()
    if not trimmed.endswith('K'):
        return 0

    try:
        # round(), not int(): int() truncates, and float error can leave the
        # product just under the intended value (32.3 * 1000 -> 32299.999...).
        return round(float(trimmed[:-1]) * 1000)
    except (ValueError, OverflowError):
        return 0

# Replace the scraped clap strings with integer counts.
claps['claps'] = claps['claps'].map(convert_claps)

# Merge text and claps

In [None]:
# Inner-join the article frame with the claps frame on the shared `url` column.
tds_data = pd.merge(tds_data, claps, how='inner', on='url')

# Send data back to mongo

The DataFrame needs to be converted to a list of dicts before it can be inserted into MongoDB.

In [None]:
# One dict per DataFrame row, keyed by column name.
tds_dicts = tds_data.to_dict(orient='records')

In [None]:
# db.final_cleaned.insert_many(tds_dicts)