In [22]:
import os
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import json
import re

In [3]:
import pandas as pd
import numpy as np

In [None]:
import os
import nltk
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## Import a zipfile from web and reading it into pandas

In [4]:
news_uci = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip'

In [5]:
file = requests.get(news_uci).content

In [7]:
with zipfile.ZipFile(io.BytesIO(file)) as z:
    with z.open('newsCorpora.csv', mode = 'r') as f:
        train = pd.read_csv(f, header = 0, delimiter="\t")
        print(train.head())    # print the fir

   1 Fed official says weak data caused by weather, should not slow taper  \
0  2  Fed's Charles Plosser sees high bar for change...                     
1  3  US open: Stocks fall after Fed official hints ...                     
2  4  Fed risks falling 'behind the curve', Charles ...                     
3  5  Fed's Plosser: Nasty Weather Has Curbed Job Gr...                     
4  6  Plosser: Fed May Have to Accelerate Tapering Pace                     

  http://www.latimes.com/business/money/la-fi-mo-federal-reserve-plosser-stimulus-economy-20140310,0,1312750.story\?track=rss  \
0  http://www.livemint.com/Politics/H2EvwJSK2VE6O...                                                                            
1  http://www.ifamagazine.com/news/us-open-stocks...                                                                            
2  http://www.ifamagazine.com/news/fed-risks-fall...                                                                            
3  http://www.moneyne

## Online Resources

In [9]:
r = requests.get("https://quotes.rest/qod.json").json()

In [11]:
print(json.dumps(r, indent=4))

{
    "success": {
        "total": 1
    },
    "contents": {
        "quotes": [
            {
                "quote": "Great things are done by a series of small things brought together.",
                "author": "Vincent Van Gogh",
                "length": "67",
                "tags": [
                    "inspire",
                    "small-things",
                    "tso-art"
                ],
                "category": "inspire",
                "title": "Inspiring Quote of the day",
                "date": "2019-02-24",
                "id": null
            }
        ],
        "copyright": "2017-19 theysaidso.com"
    }
}


In [18]:
q = r['contents']['quotes'][0]['author']

In [19]:
q

'Vincent Van Gogh'

## Cleaning

In [20]:
r = requests.get("https://news.ycombinator.com").text

In [21]:
print(r)

<html op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?Swdnfjd2lvQXPAqH2Hs6">
            <link rel="shortcut icon" href="favicon.ico">
          <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>
                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
              <a href="newest">new</a> | <a href="front">past</a> | <a href="newco

In [23]:
pattern = re.compile(r'<.*?>')  # tags look like <...>
print(pattern.sub('', r))  # replace them with blank


            
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      CSS Powered 3D Engine (keithclark.co.uk)
        59 points by Zagitta 1 hour ago  | hide | 4&nbsp;comments              
      
                
      2.      Discord Scaled Elixir to 5M Concurrent Users (2017) (discordapp.com)
        43 points by lelf 1 hour ago  | hide | 3&nbsp;comments              
      
                
      3.      AMA: Steven Pruitt, Wikipedian with over 3 million edits (reddit.com)
        131 points by aboutruby 5 hours ago  | hide | 19&nbsp;comments              
      
                
      4.      Tracking my phone's silent connections (kushaldas.in)
        56 points by jaclaz 3 hours ago  | hide | 26&nbsp;comments              
      
                
      5.      Linux Fsync Issue for 

### Using BeautifulSoup

In [24]:
soup = BeautifulSoup(r,'lxml')

In [28]:
print(soup.get_text())




Hacker News

Hacker News
new | past | comments | ask | show | jobs | submit 
login




1. CSS Powered 3D Engine (keithclark.co.uk)
59 points by Zagitta 1 hour ago  | hide | 4 comments 


2. Discord Scaled Elixir to 5M Concurrent Users (2017) (discordapp.com)
43 points by lelf 1 hour ago  | hide | 3 comments 


3. AMA: Steven Pruitt, Wikipedian with over 3 million edits (reddit.com)
131 points by aboutruby 5 hours ago  | hide | 19 comments 


4. Tracking my phone's silent connections (kushaldas.in)
56 points by jaclaz 3 hours ago  | hide | 26 comments 


5. Linux Fsync Issue for Buffered IO and Its Preliminary Fix for PostgreSQL (percona.com)
17 points by avivallssa 2 hours ago  | hide | 1 comment 


6. HyperCard Adventures (hypercardadventures.com)
146 points by vmbrasseur 9 hours ago  | hide | 22 comments 


7. Enquire: Everything you wanted to know about your C Compiler and Machine (cwi.nl)
57 points by Tomte 6 hours ago  | hide | 17 comments 


8. Prominent D.C. media firm implic

In [29]:
summaries = soup.find_all("tr", class_="athing")
summaries[0]

<tr class="athing" id="19238340">
<td align="right" class="title" valign="top"><span class="rank">1.</span></td> <td class="votelinks" valign="top"><center><a href="vote?id=19238340&amp;how=up&amp;goto=news" id="up_19238340"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://keithclark.co.uk/labs/css-fps/">CSS Powered 3D Engine</a><span class="sitebit comhead"> (<a href="from?site=keithclark.co.uk"><span class="sitestr">keithclark.co.uk</span></a>)</span></td></tr>

In [30]:
# Find all articles, extract titles
articles = []
summaries = soup.find_all("tr", class_="athing")
for summary in summaries:
    title = summary.find("a", class_="storylink").get_text().strip()
    articles.append((title))

print(len(articles), "Article summaries found. Sample:")
print(articles[0])

30 Article summaries found. Sample:
CSS Powered 3D Engine


## Normalization

In [31]:
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"
print(text)

The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?


In [33]:
text.lower()

'the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?'

In [36]:
text = re.sub(r"[^a-zA-Z0-9]", " ", text) 
text

'The first time you see The Second Renaissance it may look boring  Look at it at least twice and definitely watch part 2  It will change your view of the matrix  Are the human people the ones who started the war   Is AI a bad thing  '

## Tokenization

In [39]:
word_tokenize(text)

['The',
 'first',
 'time',
 'you',
 'see',
 'The',
 'Second',
 'Renaissance',
 'it',
 'may',
 'look',
 'boring',
 'Look',
 'at',
 'it',
 'at',
 'least',
 'twice',
 'and',
 'definitely',
 'watch',
 'part',
 '2',
 'It',
 'will',
 'change',
 'your',
 'view',
 'of',
 'the',
 'matrix',
 'Are',
 'the',
 'human',
 'people',
 'the',
 'ones',
 'who',
 'started',
 'the',
 'war',
 'Is',
 'AI',
 'a',
 'bad',
 'thing']

In [41]:
sent_tokenize(text)

['The first time you see The Second Renaissance it may look boring  Look at it at least twice and definitely watch part 2  It will change your view of the matrix  Are the human people the ones who started the war   Is AI a bad thing']

In [45]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [47]:
# Remove stop words
words = [w for w in word_tokenize(text) if w not in stopwords.words("english")]
print(words)

['The', 'first', 'time', 'see', 'The', 'Second', 'Renaissance', 'may', 'look', 'boring', 'Look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'It', 'change', 'view', 'matrix', 'Are', 'human', 'people', 'ones', 'started', 'war', 'Is', 'AI', 'bad', 'thing']


## Stemming and Lemmatization

### Stemming

In [49]:
stemmed = [PorterStemmer().stem(w) for w in words]
print(stemmed)

['the', 'first', 'time', 'see', 'the', 'second', 'renaiss', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definit', 'watch', 'part', '2', 'It', 'chang', 'view', 'matrix', 'are', 'human', 'peopl', 'one', 'start', 'war', 'Is', 'AI', 'bad', 'thing']


### Lemmatization

In [50]:
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form
lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmed)

['The', 'first', 'time', 'see', 'The', 'Second', 'Renaissance', 'may', 'look', 'boring', 'Look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'It', 'change', 'view', 'matrix', 'Are', 'human', 'people', 'one', 'started', 'war', 'Is', 'AI', 'bad', 'thing']


In [51]:
# Lemmatize verbs by specifying pos
lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in lemmed]
print(lemmed)

['The', 'first', 'time', 'see', 'The', 'Second', 'Renaissance', 'may', 'look', 'bore', 'Look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'It', 'change', 'view', 'matrix', 'Are', 'human', 'people', 'one', 'start', 'war', 'Is', 'AI', 'bad', 'thing']
