In [1]:
import re
import requests
from scrapy.http import TextResponse
from textblob import TextBlob,Word
import pandas as pd
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

In [2]:
url = "https://www.tert.am/en"

In [3]:
page = requests.get(url)

In [4]:
response = TextResponse(url= page.url,body=page.text,encoding ="utf-8")

In [5]:
news = response.css("ul[class='list list--with-scroll scroller-block show-link-visited']>li[class='list__item']>div>a>span[class='list__title db']::text").extract()

In [6]:
news[99]

"Yerevan-bound plane evacuated at Moscow airport due to 'strange smell'"

In [7]:
len(news)

100

In [8]:
pub_time = response.xpath('//span[@class="list__date db fb fs14"]/text()').re("\d*:\d*")  

In [9]:
pub_time[0]

'15:43'

In [10]:
len(pub_time)

100

# 1-1 Top most frequent words are armenian, us, armenia and minister

In [11]:
sw = stopwords.words("english")

In [12]:
news_str = "  ".join(news)

In [13]:
news_blob = TextBlob(news_str)

In [14]:
def replaceMultiple(old_str, element, newElement):
    for i in element :
        if i in old_str :
            old_str = old_str.replace(i, newElement)
    return  old_str

In [15]:
news_blob = replaceMultiple(news_blob, ["’", "‘", "'",",",":","-","–","—"] ,"")

In [16]:
news_blob

TextBlob("Oil on track for weekly gain as Iran tensions support  6yearold South Korean YouTuber buys $8 million property  Disagreements and problems must be the topic of our discussions but never the individuals  Armen Sarkissian  Palestinians to scrap agreements with Israel  Armenian authorities transitional justice plan extra threat to media  analyst
   Jennifer Lopez given Porsche from her adoring fiance Alex Rodriguez as she celebrates milestone 50th birthday in Miami  North Korea fires new shortrange missile into sea South Korea says  Ardshinbank offers new terms for loans with real estate loans pledge  Armenian Georgian foreign ministers meet in Tbilisi  Armenian Iranian ministers discuss cooperation in IT sector
   Rihanna shares photo of younger lookalike  Europe braces for second heatwave this summer  Iran vows to shoot down more US drones and mockingly portrays British Prime Minister Johnson as Trumps butler in escalating crisis  Russia starts largescale military exercise in 

In [17]:
len(news_blob.words)

1002

In [18]:
words = news_blob.words.lower()

In [19]:
words



In [20]:
i = 0
while i <995:
    if words[i] != "us":
        words[i] = words[i].lemmatize("v")
        words[i] = words[i].lemmatize("n")
    i += 1

In [21]:
words

WordList(['oil', 'on', 'track', 'for', 'weekly', 'gain', 'a', 'iran', 'tension', 'support', '6yearold', 'south', 'korean', 'youtuber', 'buy', '8', 'million', 'property', 'disagreement', 'and', 'problem', 'must', 'be', 'the', 'topic', 'of', 'our', 'discussion', 'but', 'never', 'the', 'individual', 'armen', 'sarkissian', 'palestinian', 'to', 'scrap', 'agreement', 'with', 'israel', 'armenian', 'authority', 'transitional', 'justice', 'plan', 'extra', 'threat', 'to', 'medium', 'analyst', 'foreign', 'ministry', 'negotiate', 'with', 'armenian', 'embassy', 'to', 'egypt', 'to', 'arrange', 'strand', 'tourist', 'return', 'iran', 'fire', 'ballistic', 'missile', '1000km', 'in', 'provocative', 'test', 'amid', 'tension', 'with', 'us', 'and', 'uk', 'iranian', 'smuggler', 'arrest', 'on', 'armenia', 'custom', 'border', 'aravot', 'armenia', 'us', 'far', 'from', 'strategic', 'dialogue', 'say', 'parliament', 'member', 'argentine', 'boxer', 'hugo', 'santillan', 'die', 'after', 'box', 'injury', 'armenian', '

In [22]:
without_sw = [i for i in words if i not in sw]

In [23]:
len(without_sw)

808

In [24]:
Counter(without_sw).most_common(10)

[('armenian', 15),
 ('us', 11),
 ('armenia', 9),
 ('minister', 8),
 ('south', 5),
 ('plan', 5),
 ('new', 5),
 ('fire', 4),
 ('amid', 4),
 ('iranian', 4)]

In [25]:
df = pd.DataFrame(without_sw)

In [26]:
df[0].value_counts().sort_values(ascending=False)[:10]

armenian     15
us           11
armenia       9
minister      8
new           5
plan          5
south         5
president     4
country       4
military      4
Name: 0, dtype: int64

#  1-2 There are 3 Armenian surnames in titles

In [27]:
surnames = [i for i in [re.findall("[A-Z]+[a-z]*yan",n) for n in news] if i]   

In [28]:
surnames

[['Pashinyan'], ['Tovmasyan'], ['Nazaryan']]

In [29]:
len(surnames)

3

# 1-3 Most of the articles are equally published from 10:00 to 10:59 and 12:00 to 12:59

In [30]:
pub_time

['15:43',
 '15:09',
 '14:37',
 '14:14',
 '13:38',
 '12:11',
 '11:44',
 '11:34',
 '10:59',
 '10:08',
 '09:46',
 '09:15',
 '18:11',
 '17:38',
 '17:26',
 '16:49',
 '16:12',
 '15:51',
 '15:10',
 '14:58',
 '14:21',
 '13:49',
 '13:00',
 '11:28',
 '11:20',
 '11:13',
 '16:11',
 '15:59',
 '15:42',
 '15:18',
 '15:07',
 '14:48',
 '13:46',
 '13:30',
 '13:17',
 '13:06',
 '12:29',
 '12:25',
 '12:13',
 '12:00',
 '11:06',
 '10:43',
 '10:28',
 '10:24',
 '10:11',
 '09:48',
 '09:35',
 '09:11',
 '18:17',
 '17:16',
 '16:29',
 '15:28',
 '15:09',
 '14:27',
 '13:42',
 '13:36',
 '13:15',
 '12:56',
 '12:31',
 '12:12',
 '11:39',
 '11:22',
 '10:47',
 '10:22',
 '09:33',
 '09:17',
 '19:37',
 '18:08',
 '16:42',
 '16:10',
 '15:30',
 '14:47',
 '14:13',
 '13:30',
 '12:53',
 '12:44',
 '12:14',
 '11:30',
 '11:07',
 '10:44',
 '10:12',
 '09:47',
 '09:17',
 '16:47',
 '10:58',
 '15:22',
 '14:56',
 '14:31',
 '14:18',
 '14:04',
 '13:53',
 '12:40',
 '12:21',
 '12:12',
 '11:16',
 '10:53',
 '10:36',
 '10:24',
 '09:32',
 '09:16']

In [31]:
h = [re.findall("^\d\d",i) for i in pub_time]

In [32]:
hours = [''.join(i) for i in h]

In [33]:
Counter(hours).most_common()

[('12', 14),
 ('10', 14),
 ('15', 12),
 ('14', 12),
 ('13', 12),
 ('11', 11),
 ('09', 11),
 ('16', 7),
 ('18', 3),
 ('17', 3),
 ('19', 1)]

In [34]:
max(set(hours),key = hours.count)

'12'