# Step 1 : Installing and Importing Libraries

In [322]:
!pip3 install -r requirements.txt

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m


In [362]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import random
import json
import string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from pylab import rcParams
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.express as px
from tqdm import tqdm
from nltk.corpus import wordnet

# Step 2: Defining Constants

In [150]:
BASE_URL = "https://www.aljazeera.com"
MOZ_URL= BASE_URL+"/where/mozambique/"
ARTICLE_FILE = 'articles.json'

#n = number of articles to scrape
n=10

# STEP 3: Web Scrapping to get n Articles

In [276]:
#function to get links of n articles from MOZ_URL
def get_n_article_links(n):
    req = requests.get(MOZ_URL)
    soup = BeautifulSoup(req.text, "html.parser")
    links = []
    
    #after inspecting the web page we get the info that links are under tag 'a' and class 'u-clickable-card__link'
    card_links=soup.findAll('a',{'class':'u-clickable-card__link'})
    for link in tqdm(card_links[:10]):
        links.append(BASE_URL+link.get('href'))
    return links

In [277]:
#function to get articles from the links
def get_articles(links):
    articles=[]
    for link in tqdm(links):
        req = requests.get(link)
        soup = BeautifulSoup(req.text, "html.parser")
        
        #title
        title = soup.find('title').text
        #removing extra data from end of title
        title = title[:title.index('|')]
        
        #date
        date_div = soup.find('div',{'class':'article-dates'})
        date_text= date_div.findAll('span')[1].text
        
        #images and image captions
        images=[]
        figures= soup.findAll('figure')
        for fig in figures:
            image = fig.find('img')
            if image:
                images.append({'url':BASE_URL+image['src'],'caption':fig.text})
        
        
        #main article
        paras=soup.find('main').findAll('p')
        text=''
        for para in paras:
            text+=para.text+'\n'
            
        article = {'title':title,
                   'date':date_text,
                   'images':images,
                   'text':text
                  }
        articles.append(article)
        
    return articles

In [278]:
#function to save dictionary to file as JSON
def save_dict_to_json(data):
    with open(ARTICLE_FILE, "a") as outfile:
        for d in tqdm(data):
            json.dump(d, outfile,indent=4)
            outfile.write('\n')

In [332]:
#1. Get links for n articles from given URL
links=get_n_article_links(n)

#2. Fetch articles using the links
articles=get_articles(links)

#3. save articles to file 
save_dict_to_json(articles)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 231729.50it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.01s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 13697.92it/s]


# 4. Preprocessing Data

In [363]:
# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

In [376]:
lemmatizer = WordNetLemmatizer()
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [377]:
def preprocessing(articles):
    new_data=[]
    
    
    for i in tqdm(range(len(articles))):
        article=articles[i]
        
        #1. Converting to lower case
        article['title']=article['title'].lower()
        article['text']=article['text'].lower()
        
        #2. Removing stopwords
        text_tokens = word_tokenize(article['text'])
        tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
        article['text']=(" ").join(tokens_without_sw)
        
        #lemmatizing paragraph
        
        article['text']=lemmatize_sentence(article['text'])
        
        
    return articles

In [378]:
articles=preprocessing(articles)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.31it/s]


# 5. Analysis using SentimentIntensityAnalyzer

In [379]:
title_with_sent=[]
analyser = SentimentIntensityAnalyzer()
for i,article in enumerate(articles):
    analysis = analyser.polarity_scores(article['title']+' '+article['text'])
    title_with_sent.append({'title':article['title'], 'compound':analysis['compound']})

In [380]:
fig = px.bar(title_with_sent, x='title', y='compound')
fig.show()

# 6. Analysis using TextBlob

In [381]:
df = pd.DataFrame(articles)

In [382]:
df=df.drop(columns=['date','images'])
df.head

<bound method NDFrame.head of                                                title  \
0  floods hit south africa’s kwazulu-natal provin...   
1  mozambique: cyclone gombe death toll rises to 53    
2  mozambique announces new prime minister after ...   
3  analysis: can african gas replace russian supp...   
4  dozens dead from tropical storm ana in souther...   
5  southern africa bloc sadc extends mozambique m...   
6                         climate change and famine    
7  in mozambique, kagame says rwandan troops’ wor...   
8  rwanda, mozambique forces recapture port city ...   
9  rwanda deploys 1,000 soldiers to mozambique’s ...   

                                                text  
0  flood happen month torrential rain kill 435 pe...  
1  recent year , southern africa suffer repeat de...  
2  new minister major reshuffle president filipe ...  
3  lack infrastructure , capacity , could hurt co...  
4  least 70 people kill storm struck madagascar ,...  
5  regional body say pr

In [383]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [384]:
#adding columns for subjectivity and polarity
df['subjectivity']=(df['title']+' '+df['text']).apply(getSubjectivity)
df['polarity']=(df['title']+' '+df['text']).apply(getPolarity)

df

Unnamed: 0,title,text,subjectivity,polarity
0,floods hit south africa’s kwazulu-natal provin...,flood happen month torrential rain kill 435 pe...,0.4295,0.039402
1,mozambique: cyclone gombe death toll rises to 53,"recent year , southern africa suffer repeat de...",0.238889,-0.0375
2,mozambique announces new prime minister after ...,new minister major reshuffle president filipe ...,0.414205,0.091761
3,analysis: can african gas replace russian supp...,"lack infrastructure , capacity , could hurt co...",0.353585,0.068373
4,dozens dead from tropical storm ana in souther...,"least 70 people kill storm struck madagascar ,...",0.25625,-0.054167
5,southern africa bloc sadc extends mozambique m...,regional body say progress make rebel cabo del...,0.280729,0.071354
6,climate change and famine,madagascar brink famine worst drought 40 year ...,0.55,-0.5
7,"in mozambique, kagame says rwandan troops’ wor...",rwandan president say country ’ force help sec...,0.31498,0.040039
8,"rwanda, mozambique forces recapture port city ...","mocimboa praia , home africa ’ big natural gas...",0.249123,0.055921
9,"rwanda deploys 1,000 soldiers to mozambique’s ...",government kigali say deployment request mozam...,0.370747,-0.015651


In [385]:
#function to compute +ve, -ve or neutral analysis from polarity
def get_analysis(score):
    if score >0:
        return 'positive'
    elif score == 0:
        return 'neutral'
    else:
        return 'negative'

In [386]:
df['analysis'] = df['polarity'].apply(get_analysis)

In [387]:
#scatter plot between polarity and subjectivity

fig = px.scatter(df,x='polarity',y='subjectivity',color='analysis',hover_data=["title"])
fig.update_traces(marker_size=10)
fig.show()