In [38]:
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from functools import reduce
import operator

In [83]:
def get_media_list():
    """Fetch the mobile newspaper index page and list the presses found on it.

    Returns:
        A list of ``{'id': ..., 'title': ...}`` dicts, one per ``<li>`` inside
        the first ``<ul>`` matched by the ``offc_lst`` / ``_headline_list``
        class filter. ``id`` is the ``<li>``'s ``id`` attribute (``None`` when
        absent) and ``title`` is the ``alt`` text of its first ``<img>``.
    """
    page = requests.get('https://m.news.naver.com/newspaper/home.nhn')
    soup = BeautifulSoup(page.text, 'html.parser')
    press_ul = soup.find('ul', {'class': ['offc_lst', '_headline_list']})

    presses = []
    for item in press_ul.find_all('li'):
        press_id = item.get('id')
        press_title = item.find('img')['alt']
        presses.append({'id': press_id, 'title': press_title})
    return presses

def get_article_list(media, ymd):
    """Collect the articles listed on one press's newspaper page for one date.

    Args:
        media: Press id substituted into the URL path.
        ymd: Date string substituted into the ``date`` query parameter.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per ``<li>`` found
        under every ``<ul class="newspaper_article_lst">`` on the page, in
        document order. The list is empty when no such ``<ul>`` exists.
    """
    page_url = 'https://media.naver.com/press/{}/newspaper?date={}'.format(media, ymd)
    soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')

    # Gather the <li> nodes of every article list on the page first ...
    items = []
    for page in soup.find_all('ul', {'class': 'newspaper_article_lst'}):
        items.extend(page.find_all('li'))

    # ... then turn each node into a title/link record.
    articles = []
    for item in items:
        articles.append({
            'title': item.find('strong').text,
            'link': item.find('a')['href'],
        })
    return articles

In [85]:
# Print, for every press on the index page, how many articles its page lists for today.
media_list = get_media_list()
ymd = '{:%Y%m%d}'.format(datetime.now())
for media in media_list:
    article_list = get_article_list(media['id'], ymd)
    print('{} {}'.format(media['title'], len(article_list)))

조선일보 129
전자신문 129
세계일보 90
디지털타임스 98
파이낸셜뉴스 105
헤럴드경제 105
한국일보 78
매일경제 161
한국경제 154
서울신문 117
머니투데이 83
코리아헤럴드 16
서울경제 140
한겨레 78
동아일보 107
국민일보 95
중앙일보 75
경향신문 92
이데일리 86
문화일보 0
아시아경제 0


In [99]:
# Render a standalone Vuetify list demo inside the notebook output.
# - display/HTML are imported from the public IPython.display module
#   (importing them from IPython.core.display is deprecated).
# - The CDN URLs are pinned to Vue 2.x / Vuetify 1.x, the major versions this
#   markup (new Vue(...), v-list-tile, v-content) is written for; unpinned URLs
#   follow whatever the latest release is.
# - List rows are keyed by index because several demo items share a title.
from IPython.display import display, HTML
display(HTML('''
<!DOCTYPE html>
<html>
<head>
  <link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700,900|Material+Icons" rel="stylesheet">
  <link href="https://cdn.jsdelivr.net/npm/vuetify@1.x/dist/vuetify.min.css" rel="stylesheet">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no, minimal-ui">
</head>
<body>
  <div id="app">
    <v-app>
      <v-content>
          <v-layout row>
            <v-flex xs12 sm2 offset-sm3>
              <v-card>
                <v-toolbar color="pink" dark>
                  <v-toolbar-title>Inbox</v-toolbar-title>
                </v-toolbar>

                <v-list>
                  <template v-for="(item, index) in items">
                    <v-list-tile
                      :key="index"
                      @click="toggle(index)"
                    >
                      <v-list-tile-content>
                        <v-list-tile-title>{{ item.title }}</v-list-tile-title>
                      </v-list-tile-content>
                    </v-list-tile>
                  </template>
                </v-list>
              </v-card>
            </v-flex>
          </v-layout>
      </v-content>
    </v-app>
  </div>

  <script src="https://cdn.jsdelivr.net/npm/vue@2.x/dist/vue.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/vuetify@1.x/dist/vuetify.js"></script>
  <script>
    new Vue({ 
        el: '#app',
        data: {
            selected: [2],
            items: [
              {
                action: '15 min',
                headline: 'Brunch this weekend?',
                title: 'Ali Connors',
                subtitle: "I'll be in your neighborhood doing errands this weekend. Do you want to hang out?"
              },
              {
                action: '2 hr',
                headline: 'Summer BBQ',
                title: 'me, Scrott, Jennifer',
                subtitle: "Wish I could come, but I'm out of town this weekend."
              },
              {
                action: '6 hr',
                headline: 'Oui oui',
                title: 'Sandra Adams',
                subtitle: 'Do you have Paris recommendations? Have you ever been?'
              },
              {
                action: '12 hr',
                headline: 'Birthday gift',
                title: 'Trevor Hansen',
                subtitle: 'Have any ideas about what we should get Heidi for her birthday?'
              },
              {
                action: '12 hr',
                headline: 'Birthday gift',
                title: 'Trevor Hansen',
                subtitle: 'Have any ideas about what we should get Heidi for her birthday?'
              },
              {
                action: '12 hr',
                headline: 'Birthday gift',
                title: 'Trevor Hansen',
                subtitle: 'Have any ideas about what we should get Heidi for her birthday?'
              },
              {
                action: '12 hr',
                headline: 'Birthday gift',
                title: 'Trevor Hansen',
                subtitle: 'Have any ideas about what we should get Heidi for her birthday?'
              },
              {
                action: '18hr',
                headline: 'Recipe to try',
                title: 'Britta Holt',
                subtitle: 'We should eat this: Grate, Squash, Corn, and tomatillo Tacos.'
              }
            ]        
        },
        methods: {
         toggle (index) {
            const i = this.selected.indexOf(index)

            if (i > -1) {
              this.selected.splice(i, 1)
            } else {
              this.selected.push(index)
            }
          }        
        }
    })
  </script>
</body>
</html>
'''))

In [1]:
from requests_html import HTMLSession

In [2]:
# Shared requests_html session; the later cells that call session.get(...) rely on this global.
session = HTMLSession()

[{'id': '015', 'title': '한국경제'},
 {'id': '008', 'title': '머니투데이'},
 {'id': '020', 'title': '동아일보'},
 {'id': '014', 'title': '파이낸셜뉴스'},
 {'id': '469', 'title': '한국일보'},
 {'id': '022', 'title': '세계일보'},
 {'id': '009', 'title': '매일경제'},
 {'id': '044', 'title': '코리아헤럴드'},
 {'id': '005', 'title': '국민일보'},
 {'id': '029', 'title': '디지털타임스'},
 {'id': '081', 'title': '서울신문'},
 {'id': '032', 'title': '경향신문'},
 {'id': '028', 'title': '한겨레'},
 {'id': '018', 'title': '이데일리'},
 {'id': '011', 'title': '서울경제'},
 {'id': '025', 'title': '중앙일보'},
 {'id': '030', 'title': '전자신문'},
 {'id': '023', 'title': '조선일보'},
 {'id': '016', 'title': '헤럴드경제'},
 {'id': '021', 'title': '문화일보'},
 {'id': '277', 'title': '아시아경제'}]

In [62]:
# Read the element's `id` attribute.
# NOTE(review): `li` is not assigned in any visible cell — presumably one <li> of the press list; confirm before re-running.
li.attrs['id']

'015'

In [66]:
# Read the `alt` text of the first <img> inside `li` (first=True yields a single element, not a list).
# NOTE(review): `li` is not assigned in any visible cell — it must already exist in the kernel.
li.find('img', first=True).attrs['alt']

'한국경제'

In [68]:
def get_media_list():
    """List the presses shown on the mobile newspaper index page.

    Uses the module-level ``session`` to fetch the page.

    Returns:
        A list of ``{'id': ..., 'title': ...}`` dicts, one per ``<li>`` directly
        under ``ul.offc_lst._headline_list``. ``id`` is the ``<li>``'s ``id``
        attribute and ``title`` is the ``alt`` text of its first ``<img>``.
    """
    r = session.get('https://m.news.naver.com/newspaper/home.nhn')

    presses = []
    for li in r.html.find('ul.offc_lst._headline_list > li'):
        press_id = li.attrs['id']
        press_title = li.find('img', first=True).attrs['alt']
        presses.append({'id': press_id, 'title': press_title})
    return presses

In [69]:
def get_article_list(media, ymd):
    """Return the articles listed on one press's newspaper page for one date.

    Uses the module-level ``session`` to fetch the page.

    Args:
        media: Press id substituted into the URL path (e.g. ``'029'``).
        ymd: Date string substituted into the ``date`` query parameter
            (e.g. ``'20190502'``).

    Returns:
        A list of ``{'title': str, 'link': str}`` dicts, one per ``<li>``
        directly under ``ul.newspaper_article_lst``. ``title`` is the text of
        the item's first ``<strong>`` and ``link`` is the ``href`` of its first
        ``<a>``. The list is empty when the page has no such items.
    """
    url = 'https://media.naver.com/press/{}/newspaper?date={}'
    r = session.get(url.format(media, ymd))
    # Element.find() returns a *list* unless first=True, so the single
    # <strong>/<a> element must be requested explicitly before reading
    # .text / .attrs; the link is stored as the href string, not the element.
    article_list = [
        {
            'title': article.find('strong', first=True).text,
            'link': article.find('a', first=True).attrs['href'],
        }
        for article in r.html.find('ul.newspaper_article_lst > li')
    ]
    return article_list

[{'id': '029', 'title': '디지털타임스'},
 {'id': '469', 'title': '한국일보'},
 {'id': '025', 'title': '중앙일보'},
 {'id': '032', 'title': '경향신문'},
 {'id': '015', 'title': '한국경제'},
 {'id': '014', 'title': '파이낸셜뉴스'},
 {'id': '044', 'title': '코리아헤럴드'},
 {'id': '009', 'title': '매일경제'},
 {'id': '023', 'title': '조선일보'},
 {'id': '081', 'title': '서울신문'},
 {'id': '011', 'title': '서울경제'},
 {'id': '022', 'title': '세계일보'},
 {'id': '028', 'title': '한겨레'},
 {'id': '018', 'title': '이데일리'},
 {'id': '020', 'title': '동아일보'},
 {'id': '008', 'title': '머니투데이'},
 {'id': '005', 'title': '국민일보'},
 {'id': '030', 'title': '전자신문'},
 {'id': '016', 'title': '헤럴드경제'},
 {'id': '021', 'title': '문화일보'},
 {'id': '277', 'title': '아시아경제'}]

In [70]:
# Fetch one concrete newspaper page (press id '029', date '20190502') through the shared session.
url = 'https://media.naver.com/press/{}/newspaper?date={}'
r = session.get(url.format('029', '20190502'))

In [84]:
# Show the raw HTML of the first <li> directly under ul.newspaper_article_lst in the fetched page.
r.html.find('ul.newspaper_article_lst > li', first=True).html

'<li>\n<a href="https://n.news.naver.com/article/newspaper/029/0002524423?date=20190502" onclick="nclk(event,\'pap.alist\',\'\',\'\');">\n<div class="newspaper_img_frame">\n<img alt="&#xC12C;&#xB124;&#xC77C; &#xC774;&#xBBF8;&#xC9C0;" onerror="setNotFoundImage(this, \'430\');" src="https://mimgnews.pstatic.net/image/origin/029/2019/05/01/2524423.jpg?type=nf600_340"/>\n</div>\n<div class="newspaper_txt_box">\n<strong>저성장 구도 고착화…위축경제 직면한 韓</strong>\n<p>실질 GDP, 잠재 GDP 밑돌아 7년 연속 \'마이너스 아웃풋 갭\' 성장률 2배 넘는 정부지출 문제 투자·소비여력 감소 가능성 커 [디지털타임스 박정일 기자] 한국 경제가 저성장 구도의 고착화 단계인 \'위축경제\'에 </p>\n</div>\n</a>\n</li>'

In [2]:
# Example of the TTS summary URL format: https://tts.news.naver.com/article/032/0002938140/summary
from pampy import match
import re

In [9]:
# Extract the two numeric path segments (press id, article id) from an article
# link and rebuild them into the TTS summary URL.
s = 'https://n.news.naver.com/article/newspaper/029/0002524423?date=20190502'
m = match(
    s,
    re.compile('https://.+/.+/.+/([0-9]+)/([0-9]+)'),
    lambda press_id, article_id: (press_id, article_id),
)
'https://tts.news.naver.com/article/{}/{}/summary'.format(m[0], m[1])

'https://tts.news.naver.com/article/029/0002524423/summary'

In [10]:
# Requires the `session = HTMLSession()` cell to have been run in the current kernel;
# without it this line raises NameError: name 'session' is not defined.
r = session.get('https://tts.news.naver.com/article/029/0002524423/summary')

NameError: name 'session' is not defined