### Web data retrieval

In [None]:
import requests
import textwrap

In [None]:
# Download the data from the Inshorts website. In this case, the news articles come from the 'technology' category.
url = 'https://inshorts.com/en/read/technology'

In [None]:
# The category label is whatever follows the final '/' in the URL
news_category = url.rsplit('/', 1)[-1]
news_category

Download HTML data

In [None]:
# Download the raw HTML for the category page.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
data = requests.get(url, timeout=30)
data.raise_for_status()
# Preview only the first bytes; the full payload is large and clutters the notebook
data.content[:500]

### Data Cleaning

We can use the Beautiful Soup package to parse and clean the web data.

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the downloaded response body using the 'html.parser' backend
soup = BeautifulSoup(data.content, 'html.parser')

In [None]:
# Show a bounded, indented preview rather than dumping the whole parsed document
print(soup.prettify()[:1000])

Read all the articles. For each article, we will read:

1. Headline
2. Article body
3. Category

This is done by reading the text between specific HTML tags. Which tags to read depends on the structure of the actual web page.

In [None]:
# Accumulator for the scraped article records
news_data = []

In [None]:
# Pair each headline card with its content card and build one record per pair.
# find() returns None when the expected tag is missing, and .string is None for
# tags with more than one child, so missing tags are skipped and get_text() is
# used to always obtain a plain string.
news_articles = []
for headline, article in zip(soup.find_all('div',
                                           class_=['news-card-title news-right-box']),
                             soup.find_all('div',
                                           class_=['news-card-content news-right-box'])):
    headline_tag = headline.find('span', attrs={'itemprop': 'headline'})
    article_tag = article.find('div', attrs={'itemprop': 'articleBody'})
    if headline_tag is None or article_tag is None:
        # Card does not have the expected structure; skip it instead of crashing
        continue
    news_articles.append({'news_headline': headline_tag.get_text(),
                          'news_article': article_tag.get_text(),
                          'news_category': news_category})

In [None]:
# Check news data.
# Rebuild the list (rather than extending it) so that re-running this cell does
# not append the same articles a second time.
news_data = list(news_articles)
# Show a small preview instead of dumping every scraped record
news_data[:3]

Read the news data into a DataFrame

In [None]:
import pandas as pd

In [None]:
# Build the DataFrame from the scraped records, fixing the column order explicitly
df = pd.DataFrame(news_data, columns=['news_headline', 'news_article', 'news_category'])

In [None]:
# Preview the first rows of the DataFrame
df.head()

### Extract multiple categories

Function to extract data from inshorts.com. The function will:

1. Take a list of URLs as input
2. Get content for each URL
3. Extract news article headline, body and category

In [None]:
# Category pages to scrape, one URL per news category
urls_list = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

In [None]:
def datasetPrepare(urls_list, timeout=30):
    """Scrape news cards from a list of category pages into one DataFrame.

    For every URL the page is downloaded and parsed, and each headline card is
    paired positionally with its content card. The category label is the last
    path segment of the URL.

    Parameters
    ----------
    urls_list : iterable of str
        Category page URLs to scrape.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article`` and ``news_category``, in
        that order. The columns are present even when nothing was scraped.

    Raises
    ------
    requests.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in urls_list:
        news_category = url.split('/')[-1]
        data = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx rather than parsing an error page into zero rows
        data.raise_for_status()
        soup = BeautifulSoup(data.content, 'html.parser')
        title_cards = soup.find_all('div', class_=["news-card-title news-right-box"])
        content_cards = soup.find_all('div', class_=["news-card-content news-right-box"])
        for title_card, content_card in zip(title_cards, content_cards):
            headline_tag = title_card.find('span', attrs={"itemprop": "headline"})
            article_tag = content_card.find('div', attrs={"itemprop": "articleBody"})
            if headline_tag is None or article_tag is None:
                # find() returns None for a card without the expected tag; skip it
                continue
            # get_text() always yields a plain string, whereas .string is None
            # for tags that contain more than one child node
            news_data.append({'news_headline': headline_tag.get_text(),
                              'news_article': article_tag.get_text(),
                              'news_category': news_category})
    # Passing columns= keeps the schema stable even when news_data is empty
    return pd.DataFrame(news_data, columns=columns)

In [None]:
# Build the DataFrame by scraping every URL in urls_list
news_df = datasetPrepare(urls_list)

In [None]:
# Fixed seed so the displayed sample is reproducible across runs; cap n so the
# cell does not raise when fewer than five articles were scraped
news_df.sample(n=min(5, len(news_df)), random_state=42)

In [None]:
# Number of articles per category
news_df['news_category'].value_counts()

# Text Wrangling and Pre-processing

In [None]:
import nltk
# Fetch the NLTK resources used by this notebook.
# NOTE(review): 'punkt' is not referenced by any code visible in this section —
# confirm it is needed further down before keeping the download.
nltk.download('punkt')
# WordNet data, presumably required by the WordNetLemmatizer used in lemmatize_text
nltk.download('wordnet')

In [None]:
import re
import unicodedata
from nltk.stem import WordNetLemmatizer

## Remove HTML tags

In [None]:
def strip_html_tags(text):
    """Return the text content of ``text`` with all HTML markup removed."""
    return BeautifulSoup(text, "html.parser").get_text()

strip_html_tags('<html><h2>Some important text</h2></html>')

## Remove accented characters

In [None]:
def remove_accented_chars(text):
    """Replace accented characters with their plain ASCII base letters.

    The text is decomposed (NFKD) so accents become separate combining marks,
    then everything that is not ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

remove_accented_chars('Sómě Áccěntěd těxt')

## Remove special characters

In [None]:
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Input text.
    remove_digits : bool, optional
        When True, digits are removed as well (default False).

    Returns
    -------
    str
        The filtered text; whitespace is left untouched.
    """
    # The range must be A-Z: 'A-z' spans ASCII 65-122 and therefore also keeps
    # the punctuation between 'Z' and 'a', i.e. [ \ ] ^ _ and `
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

In [None]:
# Quick check: punctuation and, with remove_digits=True, digits are stripped
remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)

## Text lemmatization

In [None]:
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens are produced by ``str.split()``, passed one by one through
    ``WordNetLemmatizer.lemmatize`` and re-joined with single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(wnl.lemmatize(token))
    return ' '.join(lemmas)

In [None]:
# Quick check of lemmatize_text on a sample sentence
lemmatize_text("My system keeps crashing, his crashed yesterday, ours crashes daily")

## Text stemming

In [None]:
def simple_stemmer(text):
    """Apply the Porter stemmer to each whitespace-separated token of ``text``.

    Returns the stemmed tokens joined by single spaces.
    """
    # NOTE(review): the stemmer is reached through `nltk.porter`; confirm this
    # attribute path is a supported one for the installed NLTK version.
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

## Building a text normalizer

In [None]:
def normalize_corpus(corpus, html_stripping=True, accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Normalize every document of ``corpus`` and return the results as a list.

    The enabled steps are applied to each document in this order: HTML
    stripping, accent removal, lower-casing, newline replacement,
    lemmatization, special-character removal, whitespace clean-up.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    html_stripping : bool
        Remove HTML markup via ``strip_html_tags``.
    accented_char_removal : bool
        Fold accented characters to ASCII via ``remove_accented_chars``.
    text_lower_case : bool
        Lower-case the text.
    text_lemmatization : bool
        Lemmatize tokens via ``lemmatize_text``.
    special_char_removal : bool
        Remove special characters via ``remove_special_characters``.
    stopword_removal : bool
        Accepted for backward compatibility only; this function performs no
        stop-word filtering, so the flag currently has no effect.
    remove_digits : bool
        Forwarded to ``remove_special_characters``.

    Returns
    -------
    list of str
        One normalized string per input document, in input order.
    """
    # Only CR/LF belong in this class; the previous '[\r|\n|\r\n]' also matched '|'
    newline_pattern = re.compile(r'[\r\n]+')
    # '-' is escaped so it is a literal hyphen. Written as '(-)' inside a class
    # it formed a range covering only '(' and ')', so hyphens were never
    # isolated and hyphenated words were fused together once '-' was removed.
    special_char_pattern = re.compile(r'([{.()\-!}])')
    multi_space_pattern = re.compile(r' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # pad selected punctuation with spaces so the neighbouring words
            # stay separate once the punctuation itself is removed
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # collapse repeated spaces and drop leading/trailing whitespace
        doc = multi_space_pattern.sub(' ', doc).strip()

        normalized_corpus.append(doc)

    return normalized_corpus

## Pre-process and normalize news articles

In [None]:
# Combine headline and body into one text field.
# Missing values are replaced with '' first: otherwise a missing headline would
# become the literal string 'None' and a missing article would turn the whole
# concatenation into NaN, which the normalizer cannot process.
news_df['full_text'] = (
    news_df["news_headline"].fillna('').astype(str)
    + '. '
    + news_df["news_article"].fillna('').astype(str)
)

In [None]:
# Normalize the combined text once, keep the result as a plain list and also
# store it alongside the raw text in the DataFrame
norm_corpus = normalize_corpus(news_df['full_text'])
news_df['clean_text'] = norm_corpus

In [None]:
# Compare raw and normalized text for the second article
news_df[['full_text', 'clean_text']].iloc[1].to_dict()

# Save the news articles

In [None]:
# Write the DataFrame to news.csv as UTF-8, without the index column
news_df.to_csv('news.csv', index=False, encoding='utf-8')