# Import Library

In [1]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article  
import csv 
import pandas as pd
import numpy as np
import nltk
from textblob import TextBlob

# Crawling News 

In [2]:
domain = "https://timesofindia.indiatimes.com"
r = requests.get(domain)

In [3]:
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.findAll('a', attrs = {'class':'w_img'}) 

In [5]:
indianews=[]
for row in table: 
    if not row['href'].startswith('http'):
        indianews.append('https://timesofindia.indiatimes.com'+row['href'])

In [7]:
dataframe=[]
for i in indianews:
    article = Article(i, language="en")
    article.download() 
    article.parse() 
    article.nlp() 
    data={}
    data['Title']=article.title
    data['Text']=article.text
    data['Summary']=article.summary
    data['Keywords']=article.keywords
    dataframe.append(data)

In [8]:
data=pd.DataFrame(dataframe)
data.head()

Unnamed: 0,Keywords,Summary,Text,Title
0,"[patients, single, toll, number, reports, day,...",Paramedics wearing protective suits amid fears...,Paramedics wearing protective suits amid fears...,"Maharashtra reports 27 Covid-19 deaths, highes..."
1,"[marson, trust, results, tests, team, positive...","At the least, the finding could greatly compli...","The researchers worked around the clock, in sh...",Coronavirus antibody tests: Can you trust the ...
2,"[trials, global, gets, trial, patients, clinic...",“It won’t be a success until the drug gets app...,A Chinese drugmaker has received approval from...,Chinese alzheimer’s drug gets U.S. approval fo...
3,"[fever, university, cabinet, pharmacy, doctor,...",Here is the medicine and equipment they recomm...,"Right now, there is no cure for the coronaviru...",How should you stock your medicine cabinet?
4,"[crisis, patients, heart, attack, nissen, stro...","“Everywhere, the coronavirus .”After a few day...",Bishnu Virachan was a bicycle deliveryman for ...,"Amid the coronavirus crisis, heart and stroke ..."


# Model for predicting virality of news

In [15]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

In [16]:
Dataset="OnlineNewsPopularity.csv"

In [17]:
def clean_cols(data):
    """Clean the column names by stripping and lowercase."""
    clean_col_map = {x: x.lower().strip() for x in list(data)}
    return data.rename(index=str, columns=clean_col_map)

def TrainTestSplit(X, Y, R=0, test_size=0.2):
    """Easy Train Test Split call."""
    return train_test_split(X, Y, test_size=test_size, random_state=R)

In [21]:
# Data Cleaning
dataset = clean_cols(pd.read_csv(Dataset))
train_set, test_set = train_test_split(dataset, test_size=0.20, random_state=42)

x_train = train_set.drop(['url','shares', 'timedelta', 'lda_00','lda_01','lda_02','lda_03','lda_04','num_self_hrefs', 'kw_min_min', 'kw_max_min', 'kw_avg_min','kw_min_max','kw_max_max','kw_avg_max','kw_min_avg','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares','self_reference_avg_sharess','rate_positive_words','rate_negative_words','abs_title_subjectivity','abs_title_sentiment_polarity'], axis=1)
y_train = train_set['shares']

x_test = test_set.drop(['url','shares', 'timedelta', 'num_self_hrefs', 'kw_min_min', 'kw_max_min', 'kw_avg_min','kw_min_max','kw_max_max','kw_avg_max','kw_min_avg','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares','self_reference_avg_sharess','rate_positive_words','rate_negative_words','abs_title_subjectivity','abs_title_sentiment_polarity'], axis=1)
y_test = test_set['shares']

In [22]:
clf = SVR(kernel='rbf')
clf.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
rf_res = pd.DataFrame(clf.predict(x_train),list(y_train))

In [24]:
rf_res.reset_index(level=0, inplace=True)
rf_res_df = rf_res.rename(index=str, columns={"index": "Actual shares", 0: "Predicted shares"})
rf_res_df.head()

Unnamed: 0,Actual shares,Predicted shares
0,16100,1407.673163
1,508,1402.412021
2,1300,1398.077211
3,3100,1399.904678
4,6900,1412.927887


# Converting Crawled News according to Training Set 

In [25]:
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ragnarock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def rate_unique(words):
    words=tokenize(words)
    no_order = list(set(words))
    rate_unique=len(no_order)/len(words)
    return rate_unique

In [27]:
def rate_nonstop(words):
    words=tokenize(words)
    filtered_sentence = [w for w in words if not w in stopwords]
    rate_nonstop=len(filtered_sentence)/len(words)
    no_order = list(set(filtered_sentence))
    rate_unique_nonstop=len(no_order)/len(words)
    return rate_nonstop,rate_unique_nonstop

In [31]:
def avg_token(words):
    words=tokenize(words)
    length=[]
    for i in words:
        length.append(len(i))
    return np.average(length)

In [32]:
import datefinder
import datetime  
from datetime import date 
def day(article_text):
    article=article_text
    if len(list(datefinder.find_dates(article)))>0:
        date=str(list(datefinder.find_dates(article))[0])
        date=date.split()
        date=date[0]
        year, month, day = date.split('-')     
        day_name = datetime.date(int(year), int(month), int(day)) 
        return day_name.strftime("%A")
    return "Monday"

In [33]:
def tokenize(text):
    text=text
    return word_tokenize(text)

In [34]:
pos_words=[]
neg_words=[]
def polar(words):
    all_tokens=tokenize(words)
    for i in all_tokens:
        analysis=TextBlob(i)
        polarity=analysis.sentiment.polarity
        if polarity>0:
            pos_words.append(i)
        if polarity<0:
            neg_words.append(i)
    return pos_words,neg_words

In [35]:
def rates(words):
    words=polar(words)
    pos=words[0]
    neg=words[1]
    all_words=words
    global_rate_positive_words=(len(pos)/len(all_words))/100
    global_rate_negative_words=(len(neg)/len(all_words))/100
    pol_pos=[]
    pol_neg=[]
    for i in pos:
        analysis=TextBlob(i)
        pol_pos.append(analysis.sentiment.polarity)
        avg_positive_polarity=analysis.sentiment.polarity
    for j in neg:
        analysis2=TextBlob(j)
        pol_neg.append(analysis2.sentiment.polarity)
        avg_negative_polarity=analysis2.sentiment.polarity
    min_positive_polarity=min(pol_pos)
    max_positive_polarity=max(pol_pos)
    min_negative_polarity=min(pol_neg)
    max_negative_polarity=max(pol_neg)
    avg_positive_polarity=np.average(pol_pos)
    avg_negative_polarity=np.average(pol_neg)
    return global_rate_positive_words,global_rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity


In [37]:
dataframes=[]
for i in indianews:
    pred_info={}
    article = Article(i, language="en") 
    article.download() 
    article.parse()
    analysis=TextBlob(article.text)
    polarity=analysis.sentiment.polarity
    title_analysis=TextBlob(article.title)
    pred_info['text']=article.text
    pred_info['n_tokens_title']=len(tokenize(article.title))
    pred_info['n_tokens_content']=len(tokenize(article.text))
    pred_info['n_unique_tokens']=rate_unique(article.text)
    pred_info['n_non_stop_words']=rate_nonstop(article.text)[0]
    pred_info['n_non_stop_unique_tokens']=rate_nonstop(article.text)[1]
    pred_info['num_hrefs']=article.html.count("https://timesofindia.indiatimes.com")
    pred_info['num_imgs']=len(article.images)
    pred_info['num_videos']=len(article.movies)
    pred_info['average_token_length']=avg_token(article.text)
    pred_info['num_keywords']=len(article.keywords)
    
    if "life-style" in article.url:
        pred_info['data_channel_is_lifestyle']=1
    else:
        pred_info['data_channel_is_lifestyle']=0
    if "etimes" in article.url:
        pred_info['data_channel_is_entertainment']=1
    else:
        pred_info['data_channel_is_entertainment']=0
    if "business" in article.url:
        pred_info['data_channel_is_bus']=1
    else:
        pred_info['data_channel_is_bus']=0
    if "social media" or "facebook" or "whatsapp" in article.text.lower():
        data_channel_is_socmed=1
        data_channel_is_tech=0
        data_channel_is_world=0
    else:
        data_channel_is_socmed=0
    if ("technology" or "tech" in article.text.lower()) or ("technology" or "tech" in article.url):
        data_channel_is_tech=1
        data_channel_is_socmed=0
        data_channel_is_world=0
    else:
        data_channel_is_tech=0
    if "world" in article.url:
        data_channel_is_world=1
        data_channel_is_tech=0
        data_channel_is_socmed=0
    else:
        data_channel_is_world=0
        
    pred_info['data_channel_is_socmed']=data_channel_is_socmed
    pred_info['data_channel_is_tech']=data_channel_is_tech
    pred_info['data_channel_is_world']=data_channel_is_world
    
    if day(i)=="Monday":
        pred_info['weekday_is_monday']=1
    else:
        pred_info['weekday_is_monday']=0
    if day(i)=="Tuesday":
        pred_info['weekday_is_tuesday']=1
    else:
        pred_info['weekday_is_tuesday']=0
    if day(i)=="Wednesday":
        pred_info['weekday_is_wednesday']=1
    else:
        pred_info['weekday_is_wednesday']=0
    if day(i)=="Thursday":
        pred_info['weekday_is_thursday']=1
    else:
        pred_info['weekday_is_thursday']=0
    if day(i)=="Friday":
        pred_info['weekday_is_friday']=1
    else:
        pred_info['weekday_is_friday']=0
    if day(i)=="Saturday":
        pred_info['weekday_is_saturday']=1
        pred_info['is_weekend']=1
    else:
        pred_info['weekday_is_saturday']=0
    if day(i)=="Sunday":
        pred_info['weekday_is_sunday']=1
        pred_info['is_weekend']=1
    else:
        pred_info['weekday_is_sunday']=0
        pred_info['is_weekend']=0
        
    pred_info['global_subjectivity']=analysis.sentiment.subjectivity
    pred_info['global_sentiment_polarity']=analysis.sentiment.polarity
    pred_info['global_rate_positive_words']=rates(article.text)[0]
    pred_info['global_rate_negative_words']=rates(article.text)[1]
    pred_info['avg_positive_polarity']=rates(article.text)[2]
    pred_info['min_positive_polarity']=rates(article.text)[3]
    pred_info['max_positive_polarity']=rates(article.text)[4]
    pred_info['avg_negative_polarity']=rates(article.text)[5]
    pred_info['min_negative_polarity']=rates(article.text)[6]
    pred_info['max_negative_polarity']=rates(article.text)[7]    
    pred_info['title_subjectivity']=title_analysis.sentiment.subjectivity
    pred_info['title_sentiment_polarity']=title_analysis.sentiment.polarity
    dataframes.append(pred_info)

In [39]:
pred_df=pd.DataFrame(dataframes)
pred_test=pred_df.drop(['text'],axis=1)
pred_df.head(10)

Unnamed: 0,average_token_length,avg_negative_polarity,avg_positive_polarity,data_channel_is_bus,data_channel_is_entertainment,data_channel_is_lifestyle,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,global_rate_negative_words,...,text,title_sentiment_polarity,title_subjectivity,weekday_is_friday,weekday_is_monday,weekday_is_saturday,weekday_is_sunday,weekday_is_thursday,weekday_is_tuesday,weekday_is_wednesday
0,4.664463,-0.202315,0.257771,0,0,0,0,1,0,0.06,...,Paramedics wearing protective suits amid fears...,-0.071429,0.214286,0,1,0,0,0,0,0
1,4.535697,-0.272855,0.285822,0,0,1,0,1,0,0.65,...,"The researchers worked around the clock, in sh...",0.0,0.0,0,1,0,0,0,0,0
2,4.613221,-0.274257,0.297307,0,0,1,0,1,0,1.97,...,A Chinese drugmaker has received approval from...,0.0,0.0,0,1,0,0,0,0,0
3,4.399364,-0.302664,0.320087,0,0,1,0,1,0,2.44,...,"Right now, there is no cure for the coronaviru...",0.0,0.0,0,1,0,0,0,0,0
4,4.217837,-0.304615,0.344639,0,0,1,0,1,0,3.34,...,Bishnu Virachan was a bicycle deliveryman for ...,-0.2,0.05,0,1,0,0,0,0,0
5,4.072742,-0.302246,0.340834,0,0,1,0,1,0,4.47,...,That frightening idea was raised by Dr. Robert...,0.0,0.0,0,1,0,0,0,0,0
6,4.392086,-0.300911,0.345423,0,0,1,0,1,0,5.17,...,Salman Rushdie is the flavor of the month once...,0.0,0.0,0,1,0,0,0,0,0
7,4.304945,-0.296999,0.351329,0,0,1,0,1,0,5.27,...,If there could be one magic ingredient that co...,0.5,0.5,0,1,0,0,0,0,0
8,4.181287,-0.29136,0.35764,0,0,1,0,1,0,5.62,...,"Let’s face it, as a country, nothing unites us...",0.6,0.9,0,0,1,0,0,0,0
9,4.149606,-0.291247,0.361824,0,0,0,0,1,0,6.08,...,From supporting each other in their respective...,0.268182,0.6,0,1,0,0,0,0,0


# Likelihood of Virality of News

In [40]:
test2=pd.DataFrame(clf.predict(pred_test),pred_df['text'])
test2.reset_index(level=0, inplace=True)
test2 = test2.rename(index=str, columns={"index": "News", 0: "Virality"})
test2.head(5)

Unnamed: 0,text,Virality
0,Paramedics wearing protective suits amid fears...,1414.749769
1,"The researchers worked around the clock, in sh...",1414.749769
2,A Chinese drugmaker has received approval from...,1414.749769
3,"Right now, there is no cure for the coronaviru...",1414.749769
4,Bishnu Virachan was a bicycle deliveryman for ...,1414.749769
