In [7]:
from bs4 import BeautifulSoup
import urllib3
import pandas as pd

In [5]:
import json


def extract_inshorts(categories, min_items):
    """Collect at least ``min_items`` labeled stories for each category.

    Pages are requested through ``download_for`` until the category has
    gathered ``min_items`` stories, or until a page comes back with no
    stories at all.

    Args:
        categories: iterable of category names handed to ``download_for``.
        min_items: minimum number of stories to gather per category. Pages
            are kept whole, so a category may contribute more than this.

    Returns:
        A flat list of the story records produced by ``download_for``,
        grouped by category in the order the categories were given. A
        category contributes fewer than ``min_items`` records only when a
        page returned no stories.
    """
    result = []
    for category in categories:
        labeled = []
        offset = ''  # the first request for a category is made with an empty offset
        while len(labeled) < min_items:
            downloaded = download_for(category, offset)
            stories = downloaded['stories']
            if not stories:
                # An empty page adds nothing to `labeled`, so without this
                # guard the loop condition could never become false.
                break
            labeled.extend(stories)
            offset = downloaded['offset']
        result.extend(labeled)
    return result


def download_for(category, offset):
    """Fetch one page of stories for ``category`` and label them.

    Sends a POST to the inshorts ``more_news`` endpoint with the category
    and offset serialized as a JSON body, then parses the ``html`` field of
    the JSON reply into stories.

    Args:
        category: category name; sent to the endpoint and used as the label.
        offset: value sent as ``news_offset``.

    Returns:
        A dict with ``'stories'`` (the labeled story records) and
        ``'offset'`` (the reply's ``min_news_id``).
    """
    # NOTE(review): the body is JSON but no Content-Type header is set --
    # confirm the endpoint actually honours `category` in this form.
    pool = urllib3.PoolManager()
    payload = json.dumps({'category': category, 'news_offset': offset})
    response = pool.request(
        'POST',
        'https://inshorts.com/en/ajax/more_news',
        body=payload,
    )
    page = json.loads(response.data)
    stories = label(parse_stories_from(page['html']), category)
    return {'stories': stories, 'offset': page['min_news_id']}


def label(stories, category):
    """Wrap every story in a record tagged with ``category``.

    Returns a list of ``{"label": category, "content": story}`` dicts, one
    per story, in input order.
    """
    records = []
    for story in stories:
        records.append({"label": category, "content": story})
    return records


def parse_stories_from(html):
    """Extract article-body nodes from an HTML fragment.

    Finds every ``<div itemprop="articleBody">`` in ``html`` and returns the
    direct children of all of them as one flat list, in document order.
    """
    document = BeautifulSoup(html, 'html.parser')
    stories_list = []
    for body in document.findAll("div", {"itemprop": 'articleBody'}):
        # Each div contributes all of its direct child nodes, not just one
        stories_list.extend(body.contents)
    return stories_list

In [8]:
# Scrape at least 100 stories for each of the two categories and load the
# labeled records (dicts with "label" and "content" keys) into a DataFrame.
df = pd.DataFrame(extract_inshorts(['business', 'sports'], 100))

















In [98]:
%matplotlib inline
import re
import nltk
import unicodedata
from functools import reduce
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

# Shared NLP helpers, created once and reused by the text-cleaning functions.
tokenizer = ToktokTokenizer()
lemmatizer = WordNetLemmatizer()
# English stop words from NLTK's stopwords corpus; the corpus has to be
# available locally already (e.g. via nltk.download('stopwords')).
stopword_list = nltk.corpus.stopwords.words('english')

In [101]:
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII, with accents stripped.

    The text is decomposed with NFKD normalization so accented letters
    split into a base letter plus combining marks; anything that is not
    ASCII is then silently dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def apply_on(collection, fun):
    """Apply ``fun`` to every item and count how many items it changed.

    Args:
        collection: iterable of items to transform.
        fun: one-argument callable; it is invoked exactly once per item.

    Returns:
        A ``(transformed, changed_count)`` tuple: the list of results in
        input order, and the number of items whose result compares unequal
        (``!=``) to the original item. An empty collection yields
        ``([], 0)``.
    """
    transformed = []
    changed_count = 0
    for item in collection:
        # Reuse the single result for both the output and the comparison
        result = fun(item)
        transformed.append(result)
        if result != item:
            changed_count += 1
    return transformed, changed_count


def remove_special_characters(text, remove_digits=True):
    """Delete every character that is not an ASCII letter or whitespace.

    Args:
        text: string to clean.
        remove_digits: when True (the default) digits are deleted as well;
            when False the digits 0-9 are kept.

    Returns:
        ``text`` with all disallowed characters removed. Whitespace is left
        untouched, so removing a token between two spaces leaves both spaces.
    """
    # The upper-case range must be A-Z: writing A-z also covers the ASCII
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


def lemmatize(text):
    """Lemmatize ``text`` word by word with the shared ``lemmatizer``.

    The text is split on single spaces (so runs of spaces are preserved),
    each piece is passed to ``lemmatizer.lemmatize`` with its default
    arguments, and the pieces are joined back with single spaces.
    """
    lemmas = []
    for word in text.split(' '):
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)


def remove_stopwords(text, is_lower_case=False):
    """Drop stop words from ``text`` and re-join the remaining tokens.

    Args:
        text: string to filter; it is split with the shared ``tokenizer``.
        is_lower_case: when True, tokens are compared against
            ``stopword_list`` as they are; when False (the default) each
            token is lower-cased for the comparison only.

    Returns:
        The surviving tokens, stripped of surrounding whitespace and joined
        with single spaces. Kept tokens retain their original casing.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        probe = token if is_lower_case else token.lower()
        if probe not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

'man dog'

In [110]:
# Text-cleaning steps, run on every story in the order listed
applies = [
    lambda text: text.lower(),
    remove_accented_chars,
    remove_special_characters,
    lemmatize,
    remove_stopwords,
]

# Fold the steps over the content column: each step is mapped across all
# stories before the next step starts, and the final list replaces df.content.
df.content = reduce(
    lambda texts, step: [step(text) for text in texts],
    applies,
    df.content,
)

In [None]:
# Shuffle all rows (frac=1) and display the first 20 of the shuffled frame.
# No random_state is passed, so the rows shown can differ between runs
# unless the global NumPy seed is fixed elsewhere.
df.sample(frac=1).head(20)

Unnamed: 0,content,label
0,official statement kauvery hospital dmk presid...,business
1,yearold homeless web developer california davi...,business
2,neeraj chopra became first indian win gold jav...,business
3,yearold bengaluru woman lost mistakenly paid p...,business
4,yearold mentallychallenged girl wa raped repea...,business
5,bjps delhi spokesperson tajinder pal singh bag...,business
6,yearold tribal man wa beaten death group peopl...,business
7,trai chairman r sharma challenged twitterati p...,business
8,bullet train mumbai ahmedabad get dedicated ro...,business
9,wishing sanjay dutt th birthday sunday pooja b...,business
