## Imports

In [177]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import json
from pymongo import MongoClient
import re
import pyarabic.trans
from nltk.tokenize import wordpunct_tokenize
import nltk
from nltk.stem.isri import ISRIStemmer

In [178]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/unamed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# WebScraping

### import link

In [179]:

url = 'https://www.bbc.com/arabic/'

data = requests.get(url)

In [180]:
my_data=[]
html = BeautifulSoup(data.text, 'html.parser')
articles = html.select('.promo-text')
for article in articles:
    title_element = article.select_one('.focusIndicatorDisplayBlock')
    timestamp_element = article.select_one('.promo-timestamp')
    
    title = title_element.text.strip() if title_element else None
    link = title_element.get('href') if title_element else None
    timestamp = timestamp_element.get('datetime') if timestamp_element else None
    
    my_data.append({"title": title, "link": link, "timestamp": timestamp})


In [181]:
pprint(my_data)

[{'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'title': 'إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" '
           'استعداداً "للحرب" على حدود لبنان'},
 {'link': 'https://www.bbc.com/arabic/articles/c03r5p5qzj0o',
  'timestamp': '2024-04-08',
  'title': 'كعكة الطبقات الماليزية التي تفوز بلقب "أجمل كعكة" لعيد الفطر'},
 {'link': 'https://www.bbc.com/arabic/articles/cg3q3q3zjd0o',
  'timestamp': '2024-04-07',
  'title': 'يجب على قطر أن تستخدم كل مواردها لمعاقبة حماس ونزع سلاحها – مقال '
           'في جيروزالم بوست'},
 {'link': 'https://www.bbc.com/arabic/articles/czqz6j3x2njo',
  'timestamp': '2024-04-07',
  'title': 'قصة الكاميرونية الخبيرة في تحليل صور الأقمار الاصطناعية'},
 {'link': 'https://www.bbc.com/arabic/articles/c97wd1v6gx4o',
  'timestamp': '2024-04-07',
  'title': 'لماذا تهيج بعض الحيوانات أثناء كسوف الشمس؟'},
 {'link': 'https://www.bbc.com/arabic/articles/crg4gkx5ep5o',
  'timestamp': '2024-04-07',
  'title': 'كيف 

In [182]:


def extract_info(soup):
    script_tag = soup.find('script', {'type': 'application/ld+json'})
    # Extract the JSON-LD text
    if script_tag:
        json_ld_text = script_tag.string
        # Load the JSON-LD data
        try:
            json_ld = json.loads(json_ld_text)
            json_ld_data = json_ld.get('@graph', [None])[0]  # Extract data or set to None if not found
            if json_ld_data:
                # Access the relevant information from the JSON-LD data
                headline = json_ld_data.get('headline', None)
                date_published = json_ld_data.get('datePublished', None)
                date_modified = json_ld_data.get('dateModified', None)
                author = json_ld_data['author'].get('name', None)
                url = json_ld_data.get('url', None)
                article_info = {
                    "headline": headline,
                    "date_published": date_published,
                    "date_modified": date_modified,
                    "author": author,
                    "url": url
                }
                return article_info
            else:
                return None  # No @graph data found
        except Exception as e:
            print("Error while extracting info:", e)
            return None  # Return None in case of any exception
    else:
        return None  # No script tag found


In [183]:
def extract_article(soup):
    element = soup.find(id="end-of-recommendations")
    paragraphs = soup.find_all('span', recursive=True)

    # Remove each paragraph found
    for paragraph in paragraphs:
        paragraph.decompose()
    # Remove the element if found
    if element:
        element.extract()
    article = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6','p','b'],recursive=True)
    # Extract text from each tag
    #article_text = ""
    #for tag in article:
     #   article_text += tag.get_text(separator='\n') + '\n'
    #return article_text
    # Extract text from each tag
    article_text=[]
    for tag in article:
        article_text.append(tag.get_text(separator='\n')) 
    return article_text

In [184]:
for e in my_data:
    new_data = requests.get(e['link']) 
    soup = BeautifulSoup(new_data.text, 'html.parser')
    e["article_info"]= extract_info(soup)
    e["article_text"]=  extract_article(soup)

In [185]:
pprint(my_data)

[{'article_info': {'author': 'BBC News عربي',
                   'date_modified': '2024-04-07T17:29:47.481Z',
                   'date_published': '2024-04-07T14:22:07.221Z',
                   'headline': 'خان يونس: إسرائيل تسحب كامل قواتها من جنوبي '
                               'غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" '
                               'على حدود لبنان',
                   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': ['إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة '
                   'أخرى" استعداداً "للحرب" على حدود لبنان',
                   '',
                   'أعلن الجيش الإسرائيلي الأحد أنه استكمل "مرحلة أخرى" في '
                   'إطار استعداداته "للحرب" عند الحدود الشمالية مع لبنان حيث '
                   'يستمر القصف المتبادل مع حزب الله، بينما قال إنه سحب كامل '
                   'قواته من جنوبي قطاع غزة.',
                   'وفي بيان بعنوان "الاستعداد للانتقال من الدفاع إلى الهجوم" '
                   

## Connect to dataBase

In [186]:
# Connect to MongoDB (assuming it's running on localhost)#
client = MongoClient('mongodb://localhost:27017/')

# Select database
db = client['NLP']

# Select collection
collection = db['TP1']
collection.delete_many({})



DeleteResult({'n': 65, 'ok': 1.0}, acknowledged=True)

## Insert to DataBase

In [187]:

# Insert a single document
insert_result = collection.insert_many(my_data)

In [188]:
print("Inserted document ID:", insert_result)

Inserted document ID: InsertManyResult([ObjectId('661362c302569160cffc59c7'), ObjectId('661362c302569160cffc59c8'), ObjectId('661362c302569160cffc59c9'), ObjectId('661362c302569160cffc59ca'), ObjectId('661362c302569160cffc59cb'), ObjectId('661362c302569160cffc59cc'), ObjectId('661362c302569160cffc59cd'), ObjectId('661362c302569160cffc59ce'), ObjectId('661362c302569160cffc59cf'), ObjectId('661362c302569160cffc59d0'), ObjectId('661362c302569160cffc59d1'), ObjectId('661362c302569160cffc59d2'), ObjectId('661362c302569160cffc59d3'), ObjectId('661362c302569160cffc59d4'), ObjectId('661362c302569160cffc59d5'), ObjectId('661362c302569160cffc59d6'), ObjectId('661362c302569160cffc59d7'), ObjectId('661362c302569160cffc59d8'), ObjectId('661362c302569160cffc59d9'), ObjectId('661362c302569160cffc59da'), ObjectId('661362c302569160cffc59db'), ObjectId('661362c302569160cffc59dc'), ObjectId('661362c302569160cffc59dd'), ObjectId('661362c302569160cffc59de'), ObjectId('661362c302569160cffc59df'), ObjectId('

# NLP

## Establishment of NLP Pipeline

#### Get Data from database

In [189]:
# Retrieve all documents from the collection
all_documents = collection.find()
i=0
array=[]
for doc in all_documents:
    array.append(doc)

### Data Normalisation

In [190]:
array

[{'_id': ObjectId('661362c302569160cffc59c7'),
  'title': 'إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" على حدود لبنان',
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': 'خان يونس: إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" على حدود لبنان',
   'date_published': '2024-04-07T14:22:07.221Z',
   'date_modified': '2024-04-07T17:29:47.481Z',
   'author': 'BBC News عربي',
   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': ['إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" على حدود لبنان',
   '',
   'أعلن الجيش الإسرائيلي الأحد أنه استكمل "مرحلة أخرى" في إطار استعداداته "للحرب" عند الحدود الشمالية مع لبنان حيث يستمر القصف المتبادل مع حزب الله، بينما قال إنه سحب كامل قواته من جنوبي قطاع غزة.',
   'وفي بيان بعنوان "الاستعداد للانتقال من الدفاع إلى الهجوم" نشره  الجيش الإسرائيلي، قال إنه "خلال الأيام

In [191]:
def encode(tokens):
    new_data=[]
    for token in tokens:
        new_data.append(token.encode('utf-8', errors='strict'))
    return new_data

### Text Cleaning : Removing all non arabic and emails phone numbers ... etc.

In [192]:
def clean_arabic_text(text):
    #Remove \n
    text=re.sub(r'\n', '', text)
    # Remove links
    text = re.sub(r'http\S+', '', text)
    # Remove emails
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
    
    # Remove phone numbers
    text = re.sub(r'\b(?:0|\+?44)[\d\s-]{9,13}\b', '', text)
    
    # Remove any remaining non-word characters
    text = re.sub(r'[^\w\s]', '', text)
    # Segment the text into Arabic and non-Arabic parts
    segmented_text = pyarabic.trans.segment_language(text)
    
    # Concatenate only the Arabic parts
    arabic_text = ''.join([segment[1] for segment in segmented_text if segment[0] == 'arabic'])
    
    # Remove extra spaces
    arabic_text = re.sub(r'\s+', ' ', arabic_text)
    
    return arabic_text.strip()

### Tokenization

In [193]:
def tokenize(text):
    return wordpunct_tokenize(text)

### Delete Stop Words

In [194]:
def del_stop_words(tokens):
    arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))
    filtered_tokens = [token for token in tokens if token not in arb_stopwords]
    return filtered_tokens

### Apply the NLP Pipeline

In [195]:
def do_all(array):
    for doc in array:
        if 'title' in doc:
            doc['title'] = clean_arabic_text(doc['title'])
            doc['title'] = tokenize(doc['title'])
            doc['title'] = del_stop_words(doc['title'])
            #doc['title'] =encode(doc['title'])

        # Check if 'article_info' key exists and process its fields
        if 'article_info' in doc and doc['article_info'] is not None:
            article_info = doc['article_info']
            for field in article_info:
                if field != 'date_published' and field != 'date_modified' and field != 'url':
                    article_info[field] = clean_arabic_text(article_info[field])
                    article_info[field] = tokenize(article_info[field])
                    article_info[field] = del_stop_words(article_info[field])
                    #article_info[field] =encode(article_info[field])

        # Check if 'article_text' key exists and process its content
        if 'article_text' in doc:
            doc['article_text'] = [del_stop_words(tokenize(clean_arabic_text(text))) for text in doc['article_text'] if text.strip()]


In [196]:
do_all(array)

## Stemming

In [198]:
def stem(tokens):
    st = ISRIStemmer()
    stemmed_tokens = []
    for token in tokens:
        stemmed_token = st.stem(token)
        stemmed_tokens.append(stemmed_token)
    return stemmed_tokens

stemmed = stem(array[1]['article_text'][1])
print("normal", [token for token in array[1]['article_text'][1]])
print("stemmed", stemmed)


normal ['إعداد', 'الكعك', 'التقليدي', 'شكلا', 'أشكال', 'الفن', 'الرائع', 'يشمله', 'ألوان', 'عديدة', 'ونكهات', 'متنوعة', 'تمتزج', 'إعداده', 'لإكسابه', 'القوام', 'الناعم', 'والمذاق', 'اللذيذ', 'ويخفي', 'داخله', 'تنوعا', 'فريدا', 'التصاميم', 'الهندسية', 'المعقدة']
stemmed ['عدد', 'كعك', 'قلد', 'شكل', 'شكل', 'الف', 'رئع', 'شمل', 'الو', 'عدد', 'ونك', 'تنع', 'مزج', 'عدد', 'إكساب', 'قوم', 'نعم', 'ذاق', 'لذذ', 'يخف', 'دخل', 'تنع', 'فرد', 'صمم', 'هندس', 'عقد']


In [199]:
def do_stem(array):
    for doc in array:
        if 'title' in doc:
            doc['title'] = stem(doc['title'])

        # Check if 'article_info' key exists and process its fields
        if 'article_info' in doc and doc['article_info'] is not None:
            article_info = doc['article_info']
            for field in article_info:
                if field != 'date_published' and field != 'date_modified' and field != 'url':
                    article_info[field] = stem(article_info[field])
                    

        # Check if 'article_text' key exists and process its content
        if 'article_text' in doc:
            doc['article_text'] = [stem(text) for text in doc['article_text']]


In [200]:
array

[{'_id': ObjectId('661362c302569160cffc59c7'),
  'title': ['إسرائيل',
   'تسحب',
   'كامل',
   'قواتها',
   'جنوبي',
   'غزة',
   'وتستكمل',
   'مرحلة',
   'أخرى',
   'استعدادا',
   'للحرب',
   'حدود',
   'لبنان'],
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': ['خان',
    'يونس',
    'إسرائيل',
    'تسحب',
    'كامل',
    'قواتها',
    'جنوبي',
    'غزة',
    'وتستكمل',
    'مرحلة',
    'أخرى',
    'استعدادا',
    'للحرب',
    'حدود',
    'لبنان'],
   'date_published': '2024-04-07T14:22:07.221Z',
   'date_modified': '2024-04-07T17:29:47.481Z',
   'author': ['عربي'],
   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': [['إسرائيل',
    'تسحب',
    'كامل',
    'قواتها',
    'جنوبي',
    'غزة',
    'وتستكمل',
    'مرحلة',
    'أخرى',
    'استعدادا',
    'للحرب',
    'حدود',
    'لبنان'],
   ['أعلن',
    'الجيش',
    'الإسرائيلي',
    'الأحد',
    'أنه',
    'استكمل',
    'مرحلة',
    'أ

In [201]:
do_stem(array)

In [202]:
array

[{'_id': ObjectId('661362c302569160cffc59c7'),
  'title': ['رائيل',
   'سحب',
   'كمل',
   'قوت',
   'جنب',
   'غزة',
   'وتس',
   'رحل',
   'خرى',
   'استعدادا',
   'حرب',
   'حدد',
   'لبن'],
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': ['خان',
    'ونس',
    'رائيل',
    'سحب',
    'كمل',
    'قوت',
    'جنب',
    'غزة',
    'وتس',
    'رحل',
    'خرى',
    'استعدادا',
    'حرب',
    'حدد',
    'لبن'],
   'date_published': '2024-04-07T14:22:07.221Z',
   'date_modified': '2024-04-07T17:29:47.481Z',
   'author': ['عرب'],
   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': [['رائيل',
    'سحب',
    'كمل',
    'قوت',
    'جنب',
    'غزة',
    'وتس',
    'رحل',
    'خرى',
    'استعدادا',
    'حرب',
    'حدد',
    'لبن'],
   ['اعل',
    'جيش',
    'اسرائيلي',
    'احد',
    'انه',
    'است',
    'رحل',
    'خرى',
    'اطر',
    'استعداداته',
    'حرب',
    'حدد',
    'شمل',
    'لبن',

### Rule Based Approach in NLP

In [203]:
import re

def pos_tag_arabic(word):
    # Define regular expression patterns for each POS category
    noun_pattern = r'.*ة$|.*ون$'  # Ends with "-ة" or "-ون"
    verb_pattern = r'.*[آإ]ت.*|.*[اأى]ت.*'  # Contains a form of past tense verb
    adj_pattern = r'.*ي[ةه]$'  # Ends with "-ي" followed by "-ة" or "-ه"
    adv_pattern = r'.*اً?$'  # Ends with "اً"
    prep_pattern = r'^(ب|في|على|من|إلى)'  # Starts with a preposition
    conj_pattern = r'^(و|أو|لكن|إذا)'  # Starts with a conjunction
    pron_pattern = r'^(أنا|أنت|هو|هي|نحن|أنتم|هم|هن|هذا|هذه|ذلك|تلك)$'  # Pronouns
    # Match word against each POS pattern
    if re.match(noun_pattern, word):
        return 'NOUN'
    elif re.match(verb_pattern, word):
        return 'VERB'
    elif re.match(adj_pattern, word):
        return 'ADJ'
    elif re.match(adv_pattern, word):
        return 'ADV'
    elif re.match(prep_pattern, word):
        return 'PREP'
    elif re.match(conj_pattern, word):
        return 'CONJ'
    elif re.match(pron_pattern, word):
        return 'PRON'
    else:
        return 'OTHER'  # If word doesn't match any POS pattern
# Test the function with sample Arabic words
arabic_words = ['مدرسة', 'قرأت', 'جميلة', 'سريعاً', 'في', 'و', 'أنا']
for word in arabic_words:
    print(f'{word}: {pos_tag_arabic(word)}')


مدرسة: NOUN
قرأت: VERB
جميلة: NOUN
سريعاً: ADV
في: PREP
و: CONJ
أنا: ADV


### Another POS Approach in NLP

In [204]:
new_arr=array

In [205]:
def do_pos(array):
    for doc in array:
        if 'title' in doc:
            if doc['title'] is not None:
                doc['title'] =nltk.pos_tag(doc['title'])

        # Check if 'article_info' key exists and process its fields
        if 'article_info' in doc and doc['article_info'] is not None:
            article_info = doc['article_info']
            for field in article_info:
                if field != 'date_published' and field != 'date_modified' and field != 'url':
                    article_info[field] = nltk.pos_tag(article_info[field])

        # Check if 'article_text' key exists and process its content
        if 'article_text' in doc:
            doc['article_text'] = [nltk.pos_tag(text) for text in doc['article_text']]


In [206]:
do_pos(new_arr)

In [207]:
new_arr

[{'_id': ObjectId('661362c302569160cffc59c7'),
  'title': [('رائيل', 'JJ'),
   ('سحب', 'NNP'),
   ('كمل', 'NNP'),
   ('قوت', 'NNP'),
   ('جنب', 'NNP'),
   ('غزة', 'NNP'),
   ('وتس', 'NNP'),
   ('رحل', 'NNP'),
   ('خرى', 'NNP'),
   ('استعدادا', 'NNP'),
   ('حرب', 'NNP'),
   ('حدد', 'NNP'),
   ('لبن', 'NN')],
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': [('خان', 'JJ'),
    ('ونس', 'NNP'),
    ('رائيل', 'NNP'),
    ('سحب', 'NNP'),
    ('كمل', 'NNP'),
    ('قوت', 'NNP'),
    ('جنب', 'NNP'),
    ('غزة', 'NNP'),
    ('وتس', 'NNP'),
    ('رحل', 'NNP'),
    ('خرى', 'NNP'),
    ('استعدادا', 'NNP'),
    ('حرب', 'NNP'),
    ('حدد', 'NNP'),
    ('لبن', 'NN')],
   'date_published': '2024-04-07T14:22:07.221Z',
   'date_modified': '2024-04-07T17:29:47.481Z',
   'author': [('عرب', 'NN')],
   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': [[('رائيل', 'JJ'),
    ('سحب', 'NNP'),
    ('كمل', 'NNP'),

### NER Methods

In [208]:
def do_ner(array):
    for doc in array:
        if 'title' in doc:
            if doc['title'] is not None:
                doc['title'] = nltk.ne_chunk(doc['title'])

        # Check if 'article_info' key exists and process its fields
        if 'article_info' in doc and doc['article_info'] is not None:
            article_info = doc['article_info']
            for field in article_info:
                if field != 'date_published' and field != 'date_modified' and field != 'url':
                    article_info[field] = nltk.ne_chunk(article_info[field])

        # Check if 'article_text' key exists and process its content
        if 'article_text' in doc:
            doc['article_text'] = [nltk.ne_chunk(text) for text in doc['article_text']]


In [209]:
do_ner(new_arr)

In [210]:
new_arr

[{'_id': ObjectId('661362c302569160cffc59c7'),
  'title': Tree('S', [('رائيل', 'JJ'), Tree('ORGANIZATION', [('سحب', 'NNP')]), ('كمل', 'NNP'), ('قوت', 'NNP'), ('جنب', 'NNP'), ('غزة', 'NNP'), ('وتس', 'NNP'), ('رحل', 'NNP'), ('خرى', 'NNP'), ('استعدادا', 'NNP'), ('حرب', 'NNP'), ('حدد', 'NNP'), ('لبن', 'NN')]),
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': Tree('S', [('خان', 'JJ'), Tree('ORGANIZATION', [('ونس', 'NNP')]), ('رائيل', 'NNP'), ('سحب', 'NNP'), ('كمل', 'NNP'), ('قوت', 'NNP'), ('جنب', 'NNP'), ('غزة', 'NNP'), ('وتس', 'NNP'), ('رحل', 'NNP'), ('خرى', 'NNP'), ('استعدادا', 'NNP'), ('حرب', 'NNP'), ('حدد', 'NNP'), ('لبن', 'NN')]),
   'date_published': '2024-04-07T14:22:07.221Z',
   'date_modified': '2024-04-07T17:29:47.481Z',
   'author': Tree('S', [Tree('ORGANIZATION', [('عرب', 'NN')])]),
   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': [Tree('S', [('رائيل', 'JJ'), Tree('ORGANIZATI

# Conclusion

During this lab i had the chance to learn about web scraping, the difference between stemming , and lematization i also managed to comprehend pos and the various challenges that come with implementing a rule based pos tagger , as the lexical and the depth of a language can be vary difficult to put into a simple number of rules .
Overall this lab has been an enlightening introduction into the world of web scraping and NLP Pipelines.