## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import json
from pymongo import MongoClient
import re
import pyarabic.trans
from nltk.tokenize import wordpunct_tokenize
import nltk
from nltk.stem.isri import ISRIStemmer

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/unamed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# WebScraping

### import link

In [3]:

url = 'https://www.bbc.com/arabic/'

data = requests.get(url)

In [4]:
my_data=[]
html = BeautifulSoup(data.text, 'html.parser')
articles = html.select('.promo-text')
for article in articles:
    title_element = article.select_one('.focusIndicatorDisplayBlock')
    timestamp_element = article.select_one('.promo-timestamp')
    
    title = title_element.text.strip() if title_element else None
    link = title_element.get('href') if title_element else None
    timestamp = timestamp_element.get('datetime') if timestamp_element else None
    
    my_data.append({"title": title, "link": link, "timestamp": timestamp})


In [5]:
pprint(my_data)

[{'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'title': 'إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" '
           'استعداداً "للحرب" على حدود لبنان'},
 {'link': 'https://www.bbc.com/arabic/articles/cg3q3q3zjd0o',
  'timestamp': '2024-04-07',
  'title': 'يجب على قطر أن تستخدم كل مواردها لمعاقبة حماس ونزع سلاحها – مقال '
           'في جيروزالم بوست'},
 {'link': 'https://www.bbc.com/arabic/articles/czqz6j3x2njo',
  'timestamp': '2024-04-07',
  'title': 'قصة الكاميرونية الخبيرة في تحليل صور الأقمار الاصطناعية'},
 {'link': 'https://www.bbc.com/arabic/articles/c97wd1v6gx4o',
  'timestamp': '2024-04-07',
  'title': 'لماذا تهيج بعض الحيوانات أثناء كسوف الشمس؟'},
 {'link': 'https://www.bbc.com/arabic/articles/crg4gkx5ep5o',
  'timestamp': '2024-04-07',
  'title': 'كيف يمكنك تصوير كسوف الشمس الكلي بأمان؟'},
 {'link': 'https://www.bbc.com/arabic/articles/c876zz8rpp8o',
  'timestamp': '2024-04-07',
  'title': 'حرب غزة بالأرقام بعد ستة 

In [6]:


def extract_info(soup):
    script_tag = soup.find('script', {'type': 'application/ld+json'})
    # Extract the JSON-LD text
    if script_tag:
        json_ld_text = script_tag.string
        # Load the JSON-LD data
        try:
            json_ld = json.loads(json_ld_text)
            json_ld_data = json_ld.get('@graph', [None])[0]  # Extract data or set to None if not found
            if json_ld_data:
                # Access the relevant information from the JSON-LD data
                headline = json_ld_data.get('headline', None)
                date_published = json_ld_data.get('datePublished', None)
                date_modified = json_ld_data.get('dateModified', None)
                author = json_ld_data['author'].get('name', None)
                url = json_ld_data.get('url', None)
                article_info = {
                    "headline": headline,
                    "date_published": date_published,
                    "date_modified": date_modified,
                    "author": author,
                    "url": url
                }
                return article_info
            else:
                return None  # No @graph data found
        except Exception as e:
            print("Error while extracting info:", e)
            return None  # Return None in case of any exception
    else:
        return None  # No script tag found


In [7]:
def extract_article(soup):
    element = soup.find(id="end-of-recommendations")
    paragraphs = soup.find_all('span', recursive=True)

    # Remove each paragraph found
    for paragraph in paragraphs:
        paragraph.decompose()
    # Remove the element if found
    if element:
        element.extract()
    article = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6','p','b'],recursive=True)
    # Extract text from each tag
    #article_text = ""
    #for tag in article:
     #   article_text += tag.get_text(separator='\n') + '\n'
    #return article_text
    # Extract text from each tag
    article_text=[]
    for tag in article:
        article_text.append(tag.get_text(separator='\n')) 
    return article_text

In [8]:
for e in my_data:
    new_data = requests.get(e['link']) 
    soup = BeautifulSoup(new_data.text, 'html.parser')
    e["article_info"]= extract_info(soup)
    e["article_text"]=  extract_article(soup)

In [9]:
pprint(my_data)

[{'article_info': {'author': 'BBC News عربي',
                   'date_modified': '2024-04-07T17:29:47.481Z',
                   'date_published': '2024-04-07T14:22:07.221Z',
                   'headline': 'خان يونس: إسرائيل تسحب كامل قواتها من جنوبي '
                               'غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" '
                               'على حدود لبنان',
                   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': ['إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة '
                   'أخرى" استعداداً "للحرب" على حدود لبنان',
                   '',
                   'أعلن الجيش الإسرائيلي الأحد أنه استكمل "مرحلة أخرى" في '
                   'إطار استعداداته "للحرب" عند الحدود الشمالية مع لبنان حيث '
                   'يستمر القصف المتبادل مع حزب الله، بينما قال إنه سحب كامل '
                   'قواته من جنوبي قطاع غزة.',
                   'وفي بيان بعنوان "الاستعداد للانتقال من الدفاع إلى الهجوم" '
                   

## Connect to dataBase

In [10]:
# Connect to MongoDB (assuming it's running on localhost)#
client = MongoClient('mongodb://localhost:27017/')

# Select database
db = client['NLP']

# Select collection
collection = db['TP1']
collection.delete_many({})



DeleteResult({'n': 64, 'ok': 1.0}, acknowledged=True)

## Insert to DataBase

In [11]:

# Insert a single document
insert_result = collection.insert_many(my_data)

In [12]:
print("Inserted document ID:", insert_result)

Inserted document ID: InsertManyResult([ObjectId('6613488502569160cffc5985'), ObjectId('6613488502569160cffc5986'), ObjectId('6613488502569160cffc5987'), ObjectId('6613488502569160cffc5988'), ObjectId('6613488502569160cffc5989'), ObjectId('6613488502569160cffc598a'), ObjectId('6613488502569160cffc598b'), ObjectId('6613488502569160cffc598c'), ObjectId('6613488502569160cffc598d'), ObjectId('6613488502569160cffc598e'), ObjectId('6613488502569160cffc598f'), ObjectId('6613488502569160cffc5990'), ObjectId('6613488502569160cffc5991'), ObjectId('6613488502569160cffc5992'), ObjectId('6613488502569160cffc5993'), ObjectId('6613488502569160cffc5994'), ObjectId('6613488502569160cffc5995'), ObjectId('6613488502569160cffc5996'), ObjectId('6613488502569160cffc5997'), ObjectId('6613488502569160cffc5998'), ObjectId('6613488502569160cffc5999'), ObjectId('6613488502569160cffc599a'), ObjectId('6613488502569160cffc599b'), ObjectId('6613488502569160cffc599c'), ObjectId('6613488502569160cffc599d'), ObjectId('

# NLP

## Establishment of NLP Pipeline

#### Get Data from database

In [93]:
# Retrieve all documents from the collection
all_documents = collection.find()
i=0
array=[]
for doc in all_documents:
    array.append(doc)

### Data Normalisation

In [79]:
array

[{'_id': ObjectId('6613488502569160cffc5985'),
  'title': 'إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" على حدود لبنان',
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': 'خان يونس: إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" على حدود لبنان',
   'date_published': '2024-04-07T14:22:07.221Z',
   'date_modified': '2024-04-07T17:29:47.481Z',
   'author': 'BBC News عربي',
   'url': 'https://www.bbc.com/arabic/articles/c1vldp9le52o'},
  'article_text': ['إسرائيل تسحب كامل قواتها من جنوبي غزة، وتستكمل "مرحلة أخرى" استعداداً "للحرب" على حدود لبنان',
   '',
   'أعلن الجيش الإسرائيلي الأحد أنه استكمل "مرحلة أخرى" في إطار استعداداته "للحرب" عند الحدود الشمالية مع لبنان حيث يستمر القصف المتبادل مع حزب الله، بينما قال إنه سحب كامل قواته من جنوبي قطاع غزة.',
   'وفي بيان بعنوان "الاستعداد للانتقال من الدفاع إلى الهجوم" نشره  الجيش الإسرائيلي، قال إنه "خلال الأيام

In [80]:
def encode(tokens):
    new_data=[]
    for token in tokens:
        new_data.append(token.encode('utf-8', errors='strict'))
    return new_data

### Text Cleaning : Removing all non arabic and emails phone numbers ... etc.

In [81]:
def clean_arabic_text(text):
    #Remove \n
    text=re.sub(r'\n', '', text)
    # Remove links
    text = re.sub(r'http\S+', '', text)
    # Remove emails
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
    
    # Remove phone numbers
    text = re.sub(r'\b(?:0|\+?44)[\d\s-]{9,13}\b', '', text)
    
    # Remove any remaining non-word characters
    text = re.sub(r'[^\w\s]', '', text)
    # Segment the text into Arabic and non-Arabic parts
    segmented_text = pyarabic.trans.segment_language(text)
    
    # Concatenate only the Arabic parts
    arabic_text = ''.join([segment[1] for segment in segmented_text if segment[0] == 'arabic'])
    
    # Remove extra spaces
    arabic_text = re.sub(r'\s+', ' ', arabic_text)
    
    return arabic_text.strip()

### Tokenization

In [82]:
def tokenize(text):
    return wordpunct_tokenize(text)

### Delete Stop Words

In [83]:
def del_stop_words(tokens):
    arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))
    filtered_tokens = [token for token in tokens if token not in arb_stopwords]
    return filtered_tokens

### Apply the NLP Pipeline

In [94]:
def do_all(array):
    for doc in array:
        if 'title' in doc:
            doc['title'] = clean_arabic_text(doc['title'])
            doc['title'] = tokenize(doc['title'])
            doc['title'] = del_stop_words(doc['title'])
            doc['title'] =encode(doc['title'])

        # Check if 'article_info' key exists and process its fields
        if 'article_info' in doc and doc['article_info'] is not None:
            article_info = doc['article_info']
            for field in article_info:
                if field != 'date_published' and field != 'date_modified' and field != 'url':
                    article_info[field] = clean_arabic_text(article_info[field])
                    article_info[field] = tokenize(article_info[field])
                    article_info[field] = del_stop_words(article_info[field])
                    article_info[field] =encode(article_info[field])

        # Check if 'article_text' key exists and process its content
        if 'article_text' in doc:
            doc['article_text'] = [encode(del_stop_words(tokenize(clean_arabic_text(text)))) for text in doc['article_text'] if text.strip()]


In [95]:
do_all(array)

## Stemming

In [86]:
def stem(tokens):
    st = ISRIStemmer()
    stemmed_tokens = []
    for token in tokens:
        stemmed_token = st.stem(token.decode('utf-8'))
        stemmed_tokens.append(stemmed_token.encode('utf-8'))
    return stemmed_tokens

stemmed = stem(array[1]['article_text'][1])
print("normal", [token.decode('utf-8') for token in array[1]['article_text'][1]])
print("stemmed", stemmed)


normal ['نبدأ', 'جولة', 'الصحافة', 'التصريحات', 'الإسرائيلية', 'المستمرة', 'حول', 'نظرتها', 'قطر', 'ظل', 'رعايتها', 'لمفاوضات', 'وقف', 'إطلاق', 'النار', 'والإفراج', 'المحتجزين', 'وتبادل', 'الرهائن']
stemmed [b'\xd8\xa8\xd8\xaf\xd8\xa3', b'\xd8\xac\xd9\x88\xd9\x84', b'\xd8\xb5\xd8\xad\xd9\x81', b'\xd8\xb5\xd8\xb1\xd8\xad', b'\xd8\xa7\xd8\xb3\xd8\xb1\xd8\xa7\xd8\xa6\xd9\x8a\xd9\x84\xd9\x8a\xd8\xa9', b'\xd9\x85\xd8\xb1\xd8\xa9', b'\xd8\xad\xd9\x88\xd9\x84', b'\xd9\x86\xd8\xb8\xd8\xb1', b'\xd9\x82\xd8\xb7\xd8\xb1', b'\xd8\xb8\xd9\x84', b'\xd8\xb1\xd8\xb9\xd9\x8a', b'\xd9\x81\xd8\xa7\xd8\xb6', b'\xd9\x88\xd9\x82\xd9\x81', b'\xd8\xb7\xd9\x84\xd9\x82', b'\xd9\x86\xd8\xa7\xd8\xb1', b'\xd9\x81\xd8\xb1\xd8\xac', b'\xd8\xad\xd8\xac\xd8\xb2', b'\xd8\xa8\xd8\xaf\xd9\x84', b'\xd8\xb1\xd9\x87\xd9\x86']


In [96]:
def do_stem(array):
    for doc in array:
        if 'title' in doc:
            doc['title'] = stem(doc['title'])

        # Check if 'article_info' key exists and process its fields
        if 'article_info' in doc and doc['article_info'] is not None:
            article_info = doc['article_info']
            for field in article_info:
                if field != 'date_published' and field != 'date_modified' and field != 'url':
                    article_info[field] = stem(article_info[field])
                    

        # Check if 'article_text' key exists and process its content
        if 'article_text' in doc:
            doc['article_text'] = [stem(text) for text in doc['article_text']]


In [88]:
array

[{'_id': ObjectId('6613488502569160cffc5985'),
  'title': [b'\xd8\xa5\xd8\xb3\xd8\xb1\xd8\xa7\xd8\xa6\xd9\x8a\xd9\x84',
   b'\xd8\xaa\xd8\xb3\xd8\xad\xd8\xa8',
   b'\xd9\x83\xd8\xa7\xd9\x85\xd9\x84',
   b'\xd9\x82\xd9\x88\xd8\xa7\xd8\xaa\xd9\x87\xd8\xa7',
   b'\xd8\xac\xd9\x86\xd9\x88\xd8\xa8\xd9\x8a',
   b'\xd8\xba\xd8\xb2\xd8\xa9',
   b'\xd9\x88\xd8\xaa\xd8\xb3\xd8\xaa\xd9\x83\xd9\x85\xd9\x84',
   b'\xd9\x85\xd8\xb1\xd8\xad\xd9\x84\xd8\xa9',
   b'\xd8\xa3\xd8\xae\xd8\xb1\xd9\x89',
   b'\xd8\xa7\xd8\xb3\xd8\xaa\xd8\xb9\xd8\xaf\xd8\xa7\xd8\xaf\xd8\xa7',
   b'\xd9\x84\xd9\x84\xd8\xad\xd8\xb1\xd8\xa8',
   b'\xd8\xad\xd8\xaf\xd9\x88\xd8\xaf',
   b'\xd9\x84\xd8\xa8\xd9\x86\xd8\xa7\xd9\x86'],
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': [b'\xd8\xae\xd8\xa7\xd9\x86',
    b'\xd9\x8a\xd9\x88\xd9\x86\xd8\xb3',
    b'\xd8\xa5\xd8\xb3\xd8\xb1\xd8\xa7\xd8\xa6\xd9\x8a\xd9\x84',
    b'\xd8\xaa\xd8\xb3\xd8\xad\xd8\xa8',
    

In [97]:
do_stem(array)

In [90]:
array

[{'_id': ObjectId('6613488502569160cffc5985'),
  'title': [b'\xd8\xb1\xd8\xa7\xd8\xa6\xd9\x8a\xd9\x84',
   b'\xd8\xb3\xd8\xad\xd8\xa8',
   b'\xd9\x83\xd9\x85\xd9\x84',
   b'\xd9\x82\xd9\x88\xd8\xaa',
   b'\xd8\xac\xd9\x86\xd8\xa8',
   b'\xd8\xba\xd8\xb2\xd8\xa9',
   b'\xd9\x88\xd8\xaa\xd8\xb3',
   b'\xd8\xb1\xd8\xad\xd9\x84',
   b'\xd8\xae\xd8\xb1\xd9\x89',
   b'\xd8\xa7\xd8\xb3\xd8\xaa\xd8\xb9\xd8\xaf\xd8\xa7\xd8\xaf\xd8\xa7',
   b'\xd8\xad\xd8\xb1\xd8\xa8',
   b'\xd8\xad\xd8\xaf\xd8\xaf',
   b'\xd9\x84\xd8\xa8\xd9\x86'],
  'link': 'https://www.bbc.com/arabic/articles/c1vldp9le52o',
  'timestamp': '2024-04-07',
  'article_info': {'headline': [b'\xd8\xae\xd8\xa7\xd9\x86',
    b'\xd9\x88\xd9\x86\xd8\xb3',
    b'\xd8\xb1\xd8\xa7\xd8\xa6\xd9\x8a\xd9\x84',
    b'\xd8\xb3\xd8\xad\xd8\xa8',
    b'\xd9\x83\xd9\x85\xd9\x84',
    b'\xd9\x82\xd9\x88\xd8\xaa',
    b'\xd8\xac\xd9\x86\xd8\xa8',
    b'\xd8\xba\xd8\xb2\xd8\xa9',
    b'\xd9\x88\xd8\xaa\xd8\xb3',
    b'\xd8\xb1\xd8\xad\xd9\x84',
    b'

### Rule Based Approach in NLP

In [99]:
import re

def pos_tag_arabic(word):
    # Define regular expression patterns for each POS category
    noun_pattern = r'.*ة$|.*ون$'  # Ends with "-ة" or "-ون"
    verb_pattern = r'.*[آإ]ت.*|.*[اأى]ت.*'  # Contains a form of past tense verb
    adj_pattern = r'.*ي[ةه]$'  # Ends with "-ي" followed by "-ة" or "-ه"
    adv_pattern = r'.*اً?$'  # Ends with "اً"
    prep_pattern = r'^(ب|في|على|من|إلى)'  # Starts with a preposition
    conj_pattern = r'^(و|أو|لكن|إذا)'  # Starts with a conjunction
    pron_pattern = r'^(أنا|أنت|هو|هي|نحن|أنتم|هم|هن|هذا|هذه|ذلك|تلك)$'  # Pronouns
    word = word.decode('utf-8')
    # Match word against each POS pattern
    if re.match(noun_pattern, word):
        return 'NOUN'
    elif re.match(verb_pattern, word):
        return 'VERB'
    elif re.match(adj_pattern, word):
        return 'ADJ'
    elif re.match(adv_pattern, word):
        return 'ADV'
    elif re.match(prep_pattern, word):
        return 'PREP'
    elif re.match(conj_pattern, word):
        return 'CONJ'
    elif re.match(pron_pattern, word):
        return 'PRON'
    else:
        return 'OTHER'  # If word doesn't match any POS pattern

def pos_tokens_arabic(tokens):
    for i,word in enumerate(tokens):
        tokens[i]=(word,pos_tag_arabic(word))

In [108]:
new_arr=array.copy()
for i,doc in enumerate(new_arr):
        if 'title' in doc:
            new_arr[i]['title'] = pos_tokens_arabic(doc['title'])

        # Check if 'article_info' key exists and process its fields
        if 'article_info' in doc and doc['article_info'] is not None:
            article_info = doc['article_info']
            for field in article_info:
                if field != 'date_published' and field != 'date_modified' and field != 'url':
                    new_arr[i][field] = pos_tokens_arabic(article_info[field])
        # Check if 'article_text' key exists and process its content
        for text in doc['article_text']:
            pprint(text)
        if 'article_text' in doc:
            new_arr[i]['article_text'] = [pos_tokens_arabic(text) for text in doc['article_text']]

TypeError: 'NoneType' object is not iterable

In [109]:
pprint(new_arr)

[{'_id': ObjectId('6613488502569160cffc5985'),
  'article_info': {'author': [(b'\xd8\xb9\xd8\xb1\xd8\xa8', 'OTHER')],
                   'date_modified': '2024-04-07T17:29:47.481Z',
                   'date_published': '2024-04-07T14:22:07.221Z',
                   'headline': [(b'\xd8\xae\xd8\xa7\xd9\x86', 'OTHER'),
                                (b'\xd9\x88\xd9\x86\xd8\xb3', 'CONJ'),
                                (b'\xd8\xb1\xd8\xa7\xd8\xa6\xd9\x8a\xd9\x84',
                                 'OTHER'),
                                (b'\xd8\xb3\xd8\xad\xd8\xa8', 'OTHER'),
                                (b'\xd9\x83\xd9\x85\xd9\x84', 'OTHER'),
                                (b'\xd9\x82\xd9\x88\xd8\xaa', 'OTHER'),
                                (b'\xd8\xac\xd9\x86\xd8\xa8', 'OTHER'),
                                (b'\xd8\xba\xd8\xb2\xd8\xa9', 'NOUN'),
                                (b'\xd9\x88\xd8\xaa\xd8\xb3', 'CONJ'),
                                (b'\xd8\xb1\xd8\xad\xd9\x84

### ML Based Approach in NLP

In [26]:
tgs = nltk.pos_tag(arabic_words)

### NER Methods

In [27]:
named_entities = ne_chunk(tgs)
print(named_entities)

NameError: name 'ne_chunk' is not defined