In [18]:
!pip install vaderSentiment



In [19]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vs = SentimentIntensityAnalyzer()


In [20]:
text = 'The product is really Awesome'
vs.polarity_scores(text)

{'compound': 0.659, 'neg': 0.0, 'neu': 0.477, 'pos': 0.523}

In [21]:
text2 = 'Super cheap material. I would not buy even the curtains made from this synthetic. I did not even try it.'
vs.polarity_scores(text2)

{'compound': 0.5994, 'neg': 0.0, 'neu': 0.83, 'pos': 0.17}

In [22]:
text3 = 'Does not look like the dress pictured. Too short to wear as a dress, I’m 5’7” and got a large to be sure that it fit and it barely covers my bum.'
vs.polarity_scores(text3)

{'compound': 0.3999, 'neg': 0.059, 'neu': 0.808, 'pos': 0.134}

In [23]:
# Web scraping

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np 
import os 

In [25]:
url = 'https://inshorts.com/en/read/world'
news_data = []
news_category = url.split('/')[-1]
data = requests.get(url)
soup = BeautifulSoup(data.content)
print(soup)

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<style>
    /* The Modal (background) */
    .modal_contact {
        display: none; /* Hidden by default */
        position: fixed; /* Stay in place */
        z-index: 8; /* Sit on top */
        left: 0;
        top: 0;
        width: 100%; /* Full width */
        height: 100%;
        overflow: auto; /* Enable scroll if needed */
        background-color: rgb(0,0,0); /* Fallback color */
        background-color: rgba(0,0,0,0.4); /* Black w/ opacity */
    }

    /* Modal Content/Box */
    .modal-content {
        background-color: #fefefe;
        margin: 15% auto;
        padding: 20px !important;
        padding-top: 0 !important;
        /* border: 1px solid #888; */
        text-align: center;
        position: relative;
        border-radius: 6px;
    }

    /* The Close Button */
    .close {
      left: 90%;
      color: #aaa;
      float: right;
      font-size: 28px;
      font-weight: bold;
    /* positio

In [27]:
urls = ['https://inshorts.com/en/read/world',
        'https://inshorts.com/en/read/sports',
        'https://inshorts.com/en/read/politics'
        ]

def build_dataset(urls):
  news_data = []
  for url in urls:
    news_category = url.split('/')[-1]
    data = requests.get(url)
    soup = BeautifulSoup(data.content)

    news_articles = [{'news_headline':headline.find('span', attrs={"itemprop":"headline"}).string,
                      'news_article':article.find('div', attrs={"itemprop":"articleBody"}).string,
                      'news_category':news_category}
                     
                     for headline,article in zip(soup.find_all('div',class_=["news-card-title news-right-box"]),
                                                 soup.find_all('div',class_=["news-card-content news-right-box"]))]
                     
    news_articles = news_articles[0:20]
    news_data.extend(news_articles)

  df = pd.DataFrame(news_data)
  df = df[['news_headline','news_article','news_category']]
  return df

In [29]:
df = build_dataset(urls)
df.tail()

Unnamed: 0,news_headline,news_article,news_category
55,UP govt giving fake data to show fewer COVID-1...,Samajwadi Party chief Akhilesh Yadav has alleg...,politics
56,There is no oxygen shortage in Telangana: Stat...,Telangana Health Minister Eatala Rajender on T...,politics
57,Rajasthan BJP MLAs to donate 1 month salary to...,Rajasthan BJP MLAs will donate one month's sal...,politics
58,Delhi managed to install only 1 oxygen plant f...,BJP MP Gautam Gambhir on Monday criticised Del...,politics
59,Future generations won't forgive you: Priyanka...,Congress leader Priyanka Gandhi Vadra has writ...,politics


In [30]:
df.to_csv('news.csv', index=False)

In [38]:
import nltk
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
len(stopword_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


177

In [39]:
# Remove html tag
def html_tag(text):
  soup = BeautifulSoup(text, "html.parser")
  new_text = soup.get_text()
  return new_text

html_tag('<html><h2> Some important info </h2></html>')

' Some important info '

In [41]:
# Expand Contractions
!pip install contractions
import contractions
def con(text):
  expand = contractions.fix(text)
  return expand

con("Y'all can't expand I'd think")



'you all can not expand I would think'

In [51]:
import re 
def remove_sp(text):
  pattern = r'[^A-Za-z0-9\s]'
  text = re.sub(pattern,'',text)
  return text

remove_sp("Wow this was fun! +- skm") 

'Wow this was fun  skm'

In [62]:
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()

In [65]:
def remove_stopwords(text):
  tokens = tokenizer.tokenize(text)
  tokens = [token.strip() for token in tokens]
  filtered_tokens = [token for token in tokens if token not in stopword_list]
  filtered_text = ' '.join(filtered_tokens)
  return filtered_text

In [66]:
remove_stopwords("The, and, if are all stopwords and even not")

'The , , stopwords even not'

In [67]:
# 1. Lower case
# 2. HTML Tags
# 3. Contractions
# 4. Special Characters
# 5. Stop words

df.news_headline = df.news_headline.apply(lambda x:x.lower())
df.news_article = df.news_article.apply(lambda x:x.lower())

df.news_headline = df.news_headline.apply(html_tag)
df.news_article = df.news_article.apply(html_tag)

df.news_headline = df.news_headline.apply(con)
df.news_article = df.news_article.apply(con)

df.news_headline = df.news_headline.apply(remove_sp)
df.news_article = df.news_article.apply(remove_sp)

df.news_headline = df.news_headline.apply(remove_stopwords)
df.news_article = df.news_article.apply(remove_stopwords)

df.head()

Unnamed: 0,news_headline,news_article,news_category
0,australia send 500 ventilators 1m surgical mas...,australian government tuesday said provide eme...,world
1,french president posts message india hindi ami...,facebook post hindi french president emmanuel ...,world
2,india get first batch russias sputnik v covid1...,india receive first batch russias sputnik v va...,world
3,pink supermoon seen night sky across world pic...,world witnessed first 2021s two supermoons mon...,world
4,denmark aims start local production covid19 va...,denmark aims start local production covid19 va...,world


In [68]:
df['compound'] = df['news_article'].apply(lambda x: vs.polarity_scores(x)['compound'])

In [69]:
df

Unnamed: 0,news_headline,news_article,news_category,compound
0,australia send 500 ventilators 1m surgical mas...,australian government tuesday said provide eme...,world,-0.2023
1,french president posts message india hindi ami...,facebook post hindi french president emmanuel ...,world,0.1027
2,india get first batch russias sputnik v covid1...,india receive first batch russias sputnik v va...,world,0.6486
3,pink supermoon seen night sky across world pic...,world witnessed first 2021s two supermoons mon...,world,0.0
4,denmark aims start local production covid19 va...,denmark aims start local production covid19 va...,world,0.5994
5,pakistan deploys troops 16 cities enforce covi...,pakistan deployed troops 16 major cities assis...,world,-0.6124
6,hong kong reopen bars nightclubs vaccinated re...,hong kong government said would ease socialdis...,world,0.3612
7,ireland approves astrazeneca jj vaccines citiz...,irish government agreed allow use johnson john...,world,0.4588
8,fiji reports cases covid19 variant first detec...,fiji tuesday said recorded cases coronavirus v...,world,0.0808
9,steps aid india stem covid19 surge,world health organization said stepping effort...,world,0.7351
