# Natural Language Processing

1. Topic Modeling of Articles
2. Clustering of sensational websites with news outlets

In [14]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
from pymongo import MongoClient
import pandas as pd
import pickle
from pymongo import MongoClient
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords 
import re # Regular expression library
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from gensim import corpora, models, similarities, matutils


# Path to the chromedriver executable. The CHROMEDRIVER_PATH environment
# variable overrides the default so the notebook is not tied to one machine.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

In [2]:
# Home page of the site to scrape. Only one assignment is active at a time;
# the commented-out alternatives were presumably swapped in by hand between
# runs (the notebook later stores results under per-site names) -- TODO confirm.
#url = 'https://www.nickiswift.com/'
#url = 'https://www.grunge.com/'
url = 'http://www.giveitlove.com/'

In [317]:
driver = webdriver.Chrome(chromedriver)
driver.get(url)

In [315]:
def scroll(max_seconds=120, pause_seconds=4, browser=None):
    '''Scroll to the bottom of a page repeatedly until it stops growing.

    After each scroll the function waits ``pause_seconds`` for new content to
    load, then compares the document height with the previous one. It stops as
    soon as the height is unchanged or ``max_seconds`` have elapsed.

    Args:
        max_seconds: upper bound, in seconds, on the total time spent scrolling.
        pause_seconds: wait after each scroll before re-measuring the page.
        browser: object exposing ``execute_script``; defaults to the
            notebook-level ``driver``.

    Returns:
        None.
    '''
    if browser is None:
        browser = driver  # Selenium driver created earlier in the notebook

    height_script = "return document.body.scrollHeight"
    last_height = browser.execute_script(height_script)

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + max_seconds

    while time.monotonic() < deadline:
        # Jump to the current bottom, then give the page time to load more.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)

        # An unchanged height means nothing new was loaded: we are done.
        new_height = browser.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height

In [113]:
def create_article_list(soup, base_url=None):
    '''Collect the links on a parsed page that point back to the site.

    Args:
        soup: parsed page exposing ``find_all('a')``; each returned element
            must support ``.get('href')``.
        base_url: only hrefs containing this string are kept. Defaults to the
            notebook-level ``url``.

    Returns:
        List of matching href strings in document order (duplicates kept).
    '''
    if base_url is None:
        base_url = url  # site URL selected earlier in the notebook

    article_list = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href are skipped; previously the first such anchor
        # ended the scan and every later link on the page was lost.
        if href and base_url in href:
            article_list.append(href)
    return article_list

In [170]:
def create_article_dict(article_list, timeout=10):
    '''Download each article URL and collect its title and body elements.

    Args:
        article_list: iterable of article URLs. Repeated URLs are fetched once.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        Dict mapping the page's first ``<h1>`` element (``None`` when the page
        has no ``<h1>``) to the list of its ``<h2>``/``<p>`` elements. Pages
        with three or fewer such elements are left out, as are URLs whose
        request fails or returns an HTTP error status.
    '''
    article_dict = {}

    # dict.fromkeys removes repeated URLs while keeping first-seen order.
    for article in dict.fromkeys(article_list):
        try:
            response = requests.get(article, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print("Skipping %s: %s" % (article, exc))
            continue

        soup = BeautifulSoup(response.text, "lxml")
        article_title = soup.find('h1')
        article_text = soup.find_all(['h2', 'p'])

        # Keep only pages with more than three heading/paragraph elements.
        if len(article_text) > 3:
            article_dict[article_title] = article_text

    return article_dict

In [100]:
# Build the link list explicitly so this cell does not depend on an
# `article_list` variable left behind in the kernel by an earlier run.
article_list = create_article_list(BeautifulSoup(driver.page_source, "lxml"))
article_dict = create_article_dict(article_list)

In [179]:
def create_article_dictionary_list(url):
    '''Scrape the articles linked from the page currently loaded in the browser.

    Parses the page source of the notebook-level ``driver``, extracts article
    links with ``create_article_list`` and returns what ``create_article_dict``
    produces for those links.

    NOTE(review): the ``url`` argument is not read in the body; the page parsed
    is whatever ``driver`` currently has loaded -- confirm callers navigate first.
    '''
    # scroll() stays disabled, so only the content present in the current
    # page source is parsed.
    home_page_soup = BeautifulSoup(driver.page_source, "lxml")
    return create_article_dict(create_article_list(home_page_soup))

In [181]:
data = create_article_dictionary_list(url)

In [287]:
grunge_data = create_article_dictionary_list(url)

In [318]:
giveitlove_data = create_article_dictionary_list(url)

## Create MongoDB

In [3]:
client = MongoClient()

In [4]:
mydb = client["junk_website_data"]

Insert into MongoDB

In [7]:
# Store one document per scraped article: the site URL plus the string forms
# of the title element and of the body element list.
for key, value in giveitlove_data.items():
    data_dict = {
        'website': url,
        'title': str(key),
        'text': str(value),
    }
    mydb.junk_website_data.insert_one(data_dict)

NameError: name 'giveitlove_data' is not defined

In [8]:
print(client.list_database_names())

['admin', 'config', 'junk_website_data', 'local']


In [9]:
mydb.list_collection_names()

['junk_website_data']

## Clean MongoDB Data

### Titles

In [42]:
# Pull a single stored document, projecting only its 'title' field, to inspect
# what a stored title looks like.
title_cursor = mydb.junk_website_data.find({}, {'_id':0, 'title': 1}).limit(1)
title_list = list(title_cursor)
title = title_list[0]['title']  # IndexError here means the query returned no documents

In [33]:
soup = BeautifulSoup(title, "lxml")

In [34]:
soup.get_text()

'The real reason these contestants were kicked off The Bachelor'

### Text

In [19]:
client = MongoClient()
mydb = client["junk_website_data"]

In [20]:
text_cursor = mydb.junk_website_data.find({}, {'_id':0, 'title': 1, 'text': 1})
articles = list(text_cursor)

In [54]:
class NLP_Pipeline:
    '''Clean article documents and vectorize them into a document-term matrix.

    Each article is a mapping whose 'text' entry holds markup. ``fit`` strips
    the markup, removes punctuation and digit-bearing tokens, lower-cases,
    lemmatizes every token as a verb, and passes the cleaned strings to the
    vectorizer.

    Attributes:
        nltk_stop_words: set of NLTK English stop words.
        vectorizer: vectorizer used by ``fit``.
        model: result of ``vectorizer.fit_transform`` (``None`` until ``fit``
            runs). Despite the name, this is the vectorized corpus rather than
            a topic model.
        cleaned_article_corpus: cleaned article strings from the most recent
            ``fit`` call (``None`` until then).
    '''

    # Compiled once for all instances; the raw string avoids invalid-escape
    # warnings for \w and \d.
    _PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
    _DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')

    def __init__(self, vectorizer=None):
        '''Store the vectorizer, creating a default CountVectorizer if none is given.

        Args:
            vectorizer: object exposing ``fit_transform``. When omitted, a
                CountVectorizer over 1- to 3-grams with NLTK English stop
                words, ``min_df=15`` and ``max_df=0.25`` is used.
        '''
        self.nltk_stop_words = set(stopwords.words('english'))
        if vectorizer is None:
            # scikit-learn documents stop_words as a list, so hand it a list
            # (sorted for a stable order) instead of the set.
            vectorizer = CountVectorizer(
                stop_words=sorted(self.nltk_stop_words),
                min_df=15,
                max_df=0.25,
                ngram_range=(1, 3),
            )
        self.model = None
        self.cleaned_article_corpus = None
        self.vectorizer = vectorizer

    def remove_html(self, article):
        '''Return the plain text of ``article['text']`` with markup removed.

        The first and last characters of the extracted text are dropped --
        presumably the enclosing "[" and "]" left by storing the string form
        of a list of elements; verify against how the text was stored.
        '''
        soup = BeautifulSoup(article['text'], "lxml")
        return soup.get_text()[1:-1]

    def text_cleaning(self, article):
        '''Replace punctuation and digit-bearing tokens with spaces, then lower-case.

        Args:
            article: plain-text string.

        Returns:
            The cleaned string; whitespace runs are not collapsed.
        '''
        without_punctuation = self._PUNCTUATION_RE.sub(' ', article)
        without_digit_tokens = self._DIGIT_TOKEN_RE.sub(' ', without_punctuation)
        return without_digit_tokens.lower()

    def text_lemmatizing(self, article):
        '''Tokenize ``article`` and lemmatize every token as a verb.

        Returns:
            The lemmatized tokens joined by single spaces.
        '''
        lemmatizer = WordNetLemmatizer()
        return ' '.join(
            lemmatizer.lemmatize(word, pos='v') for word in word_tokenize(article)
        )

    def fit(self, articles):
        '''Clean every article and fit the vectorizer on the cleaned corpus.

        Args:
            articles: iterable of mappings with a 'text' entry.

        Returns:
            The matrix produced by ``vectorizer.fit_transform``; it is also
            stored on ``self.model``. The cleaned strings are kept on
            ``self.cleaned_article_corpus``.
        '''
        self.cleaned_article_corpus = [
            self.text_lemmatizing(self.text_cleaning(self.remove_html(article)))
            for article in articles
        ]
        self.model = self.vectorizer.fit_transform(self.cleaned_article_corpus)
        return self.model
         

## Model the Data

In [39]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    '''Print the highest-weighted features of every topic in ``model``.

    Args:
        model: object whose ``components_`` yields one weight vector per topic;
            each vector must support ``argsort()``.
        feature_names: sequence mapping a feature index to its name.
        no_top_words: number of top features to print per topic.
        topic_names: optional sequence of labels indexed by topic number; a
            missing or empty label falls back to the topic index.
    '''
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)

        # argsort is ascending, so walk the tail backwards for the largest weights.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        print(", ".join(feature_names[i] for i in top_indices))

## SVD - Count Vectorizer

In [72]:
cv = NLP_Pipeline()
cv.fit(articles)

<3094x20521 sparse matrix of type '<class 'numpy.int64'>'
	with 1264814 stored elements in Compressed Sparse Row format>

In [73]:
# TruncatedSVD uses a randomized solver by default; fixing the seed keeps the
# printed topics the same from run to run.
lsa = TruncatedSVD(n_components=5, random_state=42)
doc_topic = lsa.fit_transform(cv.model)
display_topics(lsa, cv.vectorizer.get_feature_names(), 20)


Topic  0
game, dog, movie, band, team, cat, war, rock, character, song, video, white, school, season, photo, actor, john, album, human, fire

Topic  1
dog, cat, photo, rescue, animals, save, credit, owner, pet, animal, humans, water, look like, cub, pup, human, photo credit, food, adopt, boat

Topic  2
cat, myth, snapchat, food, human, milk, image, kitty, kitten, credit, researchers, space, feral, ancient, paw, study, hunt, litter, domesticate, color

Topic  3
game, team, sport, season, bowl, super, players, coach, nba, super bowl, player, nfl, league, football, ball, field, basketball, cup, olympics, championship

Topic  4
game, band, song, winner, album, team, rock, dog, cat, songs, sport, bowl, player, super bowl, players, tour, nba, coach, outstanding, super


## SVD - TFIDF

In [74]:
# `nltk_stop_words` is not defined at notebook level (NLP_Pipeline only keeps
# its own copy as an attribute), so build the stop-word list here.
nltk_stop_words = stopwords.words('english')
tfidf_vectorizer = TfidfVectorizer(stop_words=nltk_stop_words, min_df=15, max_df=0.25, ngram_range=(1,3))
tfidf = NLP_Pipeline(vectorizer = tfidf_vectorizer)
tfidf.fit(articles)

In [75]:
# TruncatedSVD uses a randomized solver by default; fixing the seed keeps the
# printed topics the same from run to run.
lsa = TruncatedSVD(n_components=5, random_state=42)
doc_topic = lsa.fit_transform(tfidf.model)
display_topics(lsa, tfidf.vectorizer.get_feature_names(), 20)


Topic  0
game, movie, band, actor, wed, dog, character, song, war, season, team, video, tweet, award, photo, rock, royal, grande, school, police

Topic  1
markle, meghan, royal, harry, prince, wed, prince harry, duchess, meghan markle, royal family, thomas, swift previously report, swift previously, nicki swift previously, grande, engagement, palace, middleton, kate, davidson

Topic  2
markle, royal, meghan, harry, prince, prince harry, duchess, meghan markle, thomas, royal family, queen, palace, dog, middleton, game, william, war, george, princess, sussex

Topic  3
grande, davidson, ariana, pete, miller, ariana grande, pete davidson, band, album, song, mac, mac miller, night live, pop star, comedian, saturday night live, bieber, saturday night, baldwin, grande davidson

Topic  4
lovato, sexual, drug, sobriety, arrest, abuse, band, addiction, overdose, charge, demi, assault, allege, police, weinstein, tmz, argento, rehab, accuse, allegedly


## NMF - Count Vectorizer

In [78]:
# NMF on the count-vectorized corpus; a fixed random_state makes reruns
# produce the same factorization.
nmf_model = NMF(n_components=5, random_state=42)
doc_topic = nmf_model.fit_transform(cv.model)
display_topics(nmf_model, cv.vectorizer.get_feature_names(), 20)


Topic  0
movie, war, character, president, white, actor, trump, school, men, build, job, fire, human, role, america, movies, party, stuff, murder, allegedly

Topic  1
dog, photo, rescue, save, look like, pup, cub, animals, adopt, credit, boat, animal, owner, water, photo credit, breed, pant, khan, pet, taco

Topic  2
cat, credit, food, myth, human, snapchat, image, pet, animals, owner, humans, paw, eat, milk, kitten, animal, water, kitty, sleep, fish

Topic  3
game, team, sport, season, super, bowl, players, player, coach, nba, super bowl, league, nfl, football, ball, field, score, basketball, title, cup

Topic  4
band, song, rock, album, winner, songs, roll, tour, group, stone, sing, video, john, mercury, roll stone, single, lyric, queen, albums, award


## NMF - TFIDF

In [76]:
# NMF on the TF-IDF weighted corpus; a fixed random_state makes reruns
# produce the same factorization.
nmf_model = NMF(n_components=5, random_state=42)
doc_topic = nmf_model.fit_transform(tfidf.model)
display_topics(nmf_model, tfidf.vectorizer.get_feature_names(), 20)


Topic  0
game, movie, band, dog, war, character, team, song, rock, season, school, president, sport, actor, trump, white, album, men, human, murder

Topic  1
markle, meghan, royal, harry, prince, prince harry, wed, duchess, meghan markle, thomas, royal family, palace, middleton, queen, sussex, windsor, kate, princess, duchess sussex, harry meghan

Topic  2
kardashian, jenner, welcome, first child, caption, swift previously, swift previously report, nicki swift previously, mom, divorce, child together, marriage, thompson, excite, june, us weekly, girl, wed, social media, weekly

Topic  3
grande, davidson, ariana, pete, miller, ariana grande, pete davidson, engagement, comedian, night live, pop star, saturday, saturday night live, bieber, mac miller, saturday night, mac, baldwin, grande davidson, snl

Topic  4
lovato, sobriety, overdose, demi, addiction, drug, rehab, abuse, sober, health, demi lovato, substance abuse, substance, relapse, struggle, help privacy, help privacy policy, reco

## LDA - Count Vectorizer

In [216]:
# `count_vectorizer` and `cleaned_article_corpus` are not defined at notebook
# level, so reuse the fitted pipeline: cv.vectorizer is the fitted
# CountVectorizer and cv.model is the matrix it produced for the cleaned
# corpus. Transposed, rows are terms and columns are documents.
count_vectorizer = cv.vectorizer
doc_word = cv.model.transpose()

In [217]:
pd.DataFrame(doc_word.toarray(), count_vectorizer.get_feature_names()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3084,3085,3086,3087,3088,3089,3090,3091,3092,3093
aaron,0,0,0,0,0,0,0,0,0,0,...,0,0,4,0,0,0,0,0,0,0
aback,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abbey,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abbott,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [220]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word)

In [221]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [223]:
# Create lda model (equivalent to "fit" in sklearn); seeded so the topics are repeatable
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5, random_state=42)

In [224]:
lda.print_topics()

[(0,
  '0.003*"band" + 0.002*"trump" + 0.002*"movies" + 0.002*"murder" + 0.001*"arrest" + 0.001*"director" + 0.001*"album" + 0.001*"mercury" + 0.001*"comedy" + 0.001*"roll stone"'),
 (1,
  '0.004*"dog" + 0.002*"cat" + 0.002*"band" + 0.002*"credit" + 0.002*"sport" + 0.001*"image" + 0.001*"animals" + 0.001*"century" + 0.001*"animal" + 0.001*"space"'),
 (2,
  '0.002*"divorce" + 0.002*"winner" + 0.002*"harry" + 0.002*"tmz" + 0.002*"grande" + 0.001*"royal" + 0.001*"rapper" + 0.001*"kardashian" + 0.001*"meghan" + 0.001*"prince"')]

In [225]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[corpus]

<gensim.interfaces.TransformedCorpus at 0x1a38e6deb8>

In [226]:
# Store the documents' topic vectors in a list so we can take a peek
lda_docs = [doc for doc in lda_corpus]

In [227]:
lda_docs[0:5]

[[(0, 0.37225583), (2, 0.62675345)],
 [(0, 0.2294572), (1, 0.2791612), (2, 0.4913816)],
 [(0, 0.7855218), (2, 0.21371226)],
 [(2, 0.99563247)],
 [(0, 0.3221556), (2, 0.67626154)]]

## LDA - TFIDF

In [38]:
# `nltk_stop_words` and `cleaned_article_corpus` are not defined at notebook
# level, so reuse the TF-IDF pipeline built in the "SVD - TFIDF" section: its
# vectorizer has the same settings and tfidf.model is its fit_transform output.
# Transposed, rows are terms and columns are documents.
tfidf_vectorizer = tfidf.vectorizer
doc_word_tfidf = tfidf.model.transpose()

In [39]:
# Convert sparse matrix of TF-IDF weights to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word_tfidf)

In [40]:
id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())

In [41]:
# Create lda model (equivalent to "fit" in sklearn); seeded so the topics are repeatable
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=5, random_state=42)

In [42]:
lda.print_topics()

[(0,
  '0.001*"game" + 0.001*"dog" + 0.001*"band" + 0.001*"movie" + 0.001*"war" + 0.001*"team" + 0.001*"character" + 0.001*"song" + 0.001*"photo" + 0.001*"rock"'),
 (1,
  '0.000*"moonves" + 0.000*"frankel" + 0.000*"shield" + 0.000*"mercury" + 0.000*"chen" + 0.000*"farrow" + 0.000*"sexual" + 0.000*"freddie" + 0.000*"brolin" + 0.000*"freddie mercury"'),
 (2,
  '0.002*"grande" + 0.002*"davidson" + 0.001*"kardashian" + 0.001*"engagement" + 0.001*"thompson" + 0.001*"khloé" + 0.001*"swift previously report" + 0.001*"swift previously" + 0.001*"nicki swift previously" + 0.001*"pete davidson"'),
 (3,
  '0.001*"sorrentino" + 0.001*"child together" + 0.001*"chopra" + 0.001*"jonas" + 0.001*"wilkinson" + 0.001*"first child" + 0.001*"baby news" + 0.001*"excite baby" + 0.001*"second child" + 0.001*"kendra"'),
 (4,
  '0.003*"markle" + 0.002*"meghan" + 0.002*"lovato" + 0.002*"royal" + 0.001*"harry" + 0.001*"prince harry" + 0.001*"duchess" + 0.001*"prince" + 0.001*"meghan markle" + 0.001*"sobriety"')]

## Cluster Topics