### Extract Dataset Zip Folder

In [None]:
import zipfile
import os
# Unpack the dataset archive into the ./da7ee7_raw folder; both paths are
# relative to the current working directory.
with zipfile.ZipFile('./da7ee7_raw.zip', 'r') as zip_ref:
    zip_ref.extractall("./da7ee7_raw")

### Tokenizing Methods

In [None]:
from nltk.tokenize import RegexpTokenizer
import nltk
import re
# Download NLTK's 'punkt' resource.
nltk.download('punkt')

def tokenize_by_split(text):
    """Split ``text`` into tokens on punctuation and whitespace.

    The delimiter class covers Latin and Arabic commas, '!', '.', ':',
    square brackets, parentheses, whitespace, both quote characters and the
    Arabic question mark. Empty pieces produced by adjacent delimiters are
    discarded.

    Args:
        text: The raw document text.

    Returns:
        A list of non-empty token strings in their original order.
    """
    delimiters = r'[,\!\.\،\:\[\]\(\)\s\'\"؟]'
    pieces = re.split(delimiters, text)
    # filter(None, ...) drops the empty strings left between delimiters.
    return list(filter(None, pieces))


def tokenizer(text):
    """Tokenize ``text`` with an NLTK ``RegexpTokenizer``.

    The pattern matches either a run of word characters delimited by word
    boundaries, or any single character that is neither a word character nor
    whitespace, so punctuation marks come back as separate tokens.

    Args:
        text: The raw text to tokenize.

    Returns:
        The list of tokens produced by the tokenizer.
    """
    return RegexpTokenizer(r'\b\w+\b|[^\w\s]').tokenize(text)

In [None]:
def tokenize_all_documents():
    """Tokenize every readable file under ``<cwd>/da7ee7_raw/Raw Data``.

    Each file is opened as UTF-8 text and tokenized with
    ``tokenize_by_split``; the file name and its tokens are printed as they
    are processed. Entries that raise ``PermissionError``,
    ``IsADirectoryError`` or ``UnicodeDecodeError`` are reported and skipped.

    Returns:
        A dict mapping each successfully read file name to its token list.
    """
    raw_dir = os.path.join(os.getcwd(), "da7ee7_raw", "Raw Data")
    tokens_by_file = {}
    for filename in os.listdir(raw_dir):
        full_path = os.path.join(raw_dir, filename)
        try:
            with open(full_path, 'r', encoding='utf-8') as handle:
                print(f"Content of '{filename}'")
                tokens_by_file[filename] = tokenize_by_split(handle.read())
                print(tokens_by_file[filename])
        except (PermissionError, IsADirectoryError, UnicodeDecodeError) as err:
            print(f"Skipping '{filename}' due to error: {err}")
        # Blank line between entries, printed whether or not the read succeeded.
        print()
    return tokens_by_file

In [None]:
# Dictionary with the document name as key and the list of that document's
# tokens as value.

all_documents_tokenized= tokenize_all_documents() 

In [None]:
import string
import re

## Use this function to drop tokens that contain punctuation or digits and to strip diacritics (tashkeel)
def remove_punctuations_and_numbers(all_documents):
    """Return cleaned token lists without modifying ``all_documents``.

    For every token the Arabic diacritics (tashkeel) are stripped first. The
    stripped token is kept only if it is non-empty, contains no ASCII or
    Arabic punctuation character, and contains no digit.

    Args:
        all_documents: Dict mapping a document name to its list of tokens.

    Returns:
        A new dict with the same keys, each mapped to a new list of the
        surviving, diacritic-free tokens. The input dict and its lists are
        left untouched so the raw tokens remain available to the caller.
    """
    # string.punctuation already contains '[', ']' and '\\', so only the
    # Arabic marks need to be added. A set gives O(1) membership tests.
    excluded_chars = set(string.punctuation + '،؛؟«»ـ')
    arabic_diacritics = re.compile(r'[\u064B-\u065F\u0610-\u061A\u06D6-\u06ED]')

    cleaned_documents = {}
    for document, entries in all_documents.items():
        kept = []
        for entry in entries:
            stripped = arabic_diacritics.sub('', entry)
            if stripped and not any(
                ch in excluded_chars or ch.isdigit() for ch in stripped
            ):
                kept.append(stripped)
        cleaned_documents[document] = kept
    return cleaned_documents


In [None]:
# Clean the tokenized documents: strip diacritics and drop tokens that
# contain punctuation or digits.
filtered_all_docs= remove_punctuations_and_numbers(all_documents_tokenized)

In [None]:
# Display the cleaned documents.
filtered_all_docs

### Calculating TDF (term counts per document)

In [None]:
def word_count_per_document(all_docs):
    """Count how often each token occurs in each document.

    Args:
        all_docs: Dict mapping a document name to its list of tokens.

    Returns:
        A new dict mapping each document name to a ``{token: count}`` dict
        whose keys are ordered by first occurrence. ``all_docs`` is not
        modified, so the token lists remain usable after this call.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    return {
        document: dict(Counter(tokens))
        for document, tokens in all_docs.items()
    }

In [None]:
# Per-document token counts: {document name: {token: count}}.
word_count_in_doc= word_count_per_document(filtered_all_docs)

In [None]:
# Display the per-document token counts.
word_count_in_doc

### Choosing words appearing across all documents with highest counts

In [None]:
def count_word_across_docs(unique_word_count):
    """Sum each token's count over all documents.

    Args:
        unique_word_count: Dict mapping a document name to a
            ``{token: count}`` dict.

    Returns:
        A dict mapping every token to its total count across all documents,
        with keys ordered by first appearance.
    """
    all_words = {}
    for tokens in unique_word_count.values():
        for token, count in tokens.items():
            all_words[token] = all_words.get(token, 0) + count
    return all_words

def _words_with_min_count(count_all_words, threshold):
    """Return the tokens whose count is at least ``threshold``, in dict order."""
    return [
        token
        for token, count in count_all_words.items()
        if count >= threshold
    ]


def choose_frequent_words_100(count_all_words):
    """Return the tokens that occur at least 100 times.

    Args:
        count_all_words: Dict mapping a token to its count.

    Returns:
        A list of qualifying tokens in the dict's iteration order.
    """
    return _words_with_min_count(count_all_words, 100)


def choose_frequent_words_150(count_all_words):
    """Return the tokens that occur at least 150 times.

    Args:
        count_all_words: Dict mapping a token to its count.

    Returns:
        A list of qualifying tokens in the dict's iteration order.
    """
    return _words_with_min_count(count_all_words, 150)

### Checking Word Frequency Across All Documents (not cleaned)

In [None]:
## Not cleaned: every word is kept exactly as it appears in the dataset.
# Total count of each word across all documents, then the words whose total
# count is at least 100.
word_count_in_all_doc=count_word_across_docs(word_count_in_doc)
stopwords_not_cleaned= choose_frequent_words_100(word_count_in_all_doc)

In [None]:
# Display the words with a total count of at least 100.
stopwords_not_cleaned

In [None]:
# Words whose total count across all documents is at least 150.
greater_than_150=choose_frequent_words_150(word_count_in_all_doc)

In [None]:
# Display the words with a total count of at least 150.
greater_than_150

In [None]:
# compare words not >=150 but >=100 to see if relevant to remove

def find_extra_words(most_frequent_100, most_frequent_150):
    """Return the words present in the first collection but not the second.

    Args:
        most_frequent_100: Iterable of words (the larger candidate list).
        most_frequent_150: Iterable of words to exclude.

    Returns:
        A set of the words found only in ``most_frequent_100``.
    """
    only_in_first = set(most_frequent_100)
    only_in_first.difference_update(most_frequent_150)
    return only_in_first

In [None]:
# Words with a total count of at least 100 that did not reach 150.
extra_possible_stopwords= find_extra_words(stopwords_not_cleaned, greater_than_150)

In [None]:
# Display the candidate stopwords found only at the lower threshold.
extra_possible_stopwords

### Checking word frequency after cleaning a bit (combining words)

In [None]:
## Trying to see count difference (same word in multiple forms)
def simplified_form(token):
    """Strip a single-letter prefix from ``token`` if one is present.

    A leading "و" is always removed. A leading "ب", "ك", "ف" or "ل" is removed
    only when the token is longer than three characters.

    Args:
        token: The word to simplify.

    Returns:
        A ``(base_word, found)`` tuple. ``found`` is True when a prefix was
        removed and ``base_word`` is the remainder; otherwise the tuple is
        ``("", False)``.
    """
    guarded_prefixes = ("ب", "ك", "ف", "ل")
    strips_waw = token.startswith("و")
    strips_guarded = len(token) > 3 and token.startswith(guarded_prefixes)
    if strips_waw or strips_guarded:
        return token[1:], True
    return "", False


In [None]:
def token_base_word(token):
    """Strip the first matching prefix from ``token``.

    The rules are tried in order and the first one that applies wins, so the
    longer prefixes are listed ahead of the single letters they start with.

    Args:
        token: The word to reduce.

    Returns:
        A ``(base_word, found)`` tuple. ``found`` is True when a rule matched
        and ``base_word`` is the token without that prefix; otherwise the
        tuple is ``("", False)``.
    """
    # Each rule: (prefix, token must be longer than this, tokens exempt from it).
    rules = (
        ('وال', 0, ()),
        ('و', 1, ()),           # must not be just 'و'
        ("ال", 0, ("اللى",)),
        ("بال", 3, ()),
        ('ب', 3, ()),           # leaves short words such as بشر untouched
        ("كال", 3, ()),
        ("ك", 2, ()),           # leaves short words such as كل untouched
        ("لل", 2, ()),
        ("ل", 1, ()),
    )
    for prefix, longer_than, exempt in rules:
        if (
            token.startswith(prefix)
            and len(token) > longer_than
            and token not in exempt
        ):
            return token[len(prefix):], True
    return "", False

In [None]:
#check across all documents الكلمة + with و\ك\ب\ال\ل

def remove_duplicates_with_prefixes(word_count_per_doc):
    """Merge prefixed word forms into their base word, per document.

    For each token, ``token_base_word`` is tried first and ``simplified_form``
    second. When the resulting base word occurs as a token in any document,
    the token's count is added to that base word within the same document and
    the prefixed token is removed.

    Args:
        word_count_per_doc: Dict mapping a document name to a
            ``{token: count}`` dict.

    Returns:
        A ``(merged_counts, removed_words)`` tuple. ``merged_counts`` is a new
        dict of new per-document count dicts; the input is left unmodified so
        the unmerged counts can still be compared against the merged ones.
        ``removed_words`` lists every prefixed token that was merged, once per
        document in which it was merged.
    """
    # Vocabulary of every token seen in any document before merging.
    all_words = set()
    for counts in word_count_per_doc.values():
        all_words.update(counts)

    merged_per_doc = {}
    removed_words_with_prefixes = []
    for document, counts in word_count_per_doc.items():
        # Work on a copy so the caller's counts are not altered.
        tokens = dict(counts)
        # Snapshot the keys: entries are added and deleted while looping.
        for token in list(tokens):
            base_word, found = token_base_word(token)
            # Fallback form, used when the first base word is not a known token.
            second_base, second_match = simplified_form(token)
            if found and base_word in all_words:
                target = base_word
            elif second_match and second_base in all_words:
                target = second_base
            else:
                continue
            tokens[target] = tokens.get(target, 0) + tokens[token]
            removed_words_with_prefixes.append(token)
            del tokens[token]
        merged_per_doc[document] = tokens
    return merged_per_doc, removed_words_with_prefixes

    

In [None]:
# Applied to the words of each document separately.
# word_count_in_doc is a dictionary keyed by document name; each value is a
# dictionary with a word of that document as key and its count as value.

unique_word_count,removed_words_with_prefixes= remove_duplicates_with_prefixes(word_count_in_doc)

In [None]:
# Display the per-document counts after merging prefixed forms.
unique_word_count

In [None]:
# Recount each word across all documents now that prefixed forms have been
# merged. Despite its name, the result holds totals over all documents.
cleaned_count_per_doc= count_word_across_docs(unique_word_count)

In [None]:
# Stopword candidates (total count >= 100) after prefixes were merged and the
# counts across documents were recalculated.
stopwords_after_cleaning= choose_frequent_words_100(cleaned_count_per_doc)

In [None]:
# Display the stopword candidates found after cleaning.
stopwords_after_cleaning

In [None]:
# How many more words reach the >= 100 threshold after merging prefixes
# (15 extra words in the author's run).
len(stopwords_after_cleaning)- len(stopwords_not_cleaned)

In [None]:
# Words whose total count is at least 150 after merging prefixes.
greater_than_150_cleaned= choose_frequent_words_150(cleaned_count_per_doc)

In [None]:
# Cleaned words with a total count of at least 100 that did not reach 150.
extra_stopwards_cleaned=find_extra_words(stopwords_after_cleaning,greater_than_150_cleaned)

In [None]:
# Words in the cleaned >= 100 list that are missing from the cleaned >= 150
# list (189 words in the author's run).
extra_stopwards_cleaned

### In how many documents was the same word considered frequent?

Finding the lowest IDF values (i.e. the words that appear in the most documents)

In [None]:
import math

def calculate_idf(term_document_count):
    """Compute the inverse document frequency of every token.

    The IDF of a token is ``log10(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents containing the token.

    Args:
        term_document_count: Dict mapping a document name to a dict keyed by
            the tokens of that document.

    Returns:
        A dict mapping each token to its IDF, ordered from the lowest IDF
        (present in the most documents) to the highest. Tokens with equal IDF
        keep their first-seen order.
    """
    total_documents = len(term_document_count)

    # Number of documents in which each token appears.
    document_frequency = {}
    for tokens in term_document_count.values():
        for token in tokens:
            document_frequency[token] = document_frequency.get(token, 0) + 1

    scored = (
        (token, math.log10(total_documents / seen_in))
        for token, seen_in in document_frequency.items()
    )
    return dict(sorted(scored, key=lambda pair: pair[1]))


In [None]:
def _first_n_keys(idf_for_word, n):
    """Return at most the first ``n`` keys of ``idf_for_word`` in iteration order."""
    # Imported locally so the helper does not depend on a file-level import.
    from itertools import islice

    return list(islice(idf_for_word, n))


def find_top_100(idf_for_word):
    """Return the first 100 words of ``idf_for_word``.

    Args:
        idf_for_word: Dict keyed by word. The selection follows the dict's
            iteration order, so the caller is expected to pass it already
            sorted (``calculate_idf`` returns it in ascending IDF order).

    Returns:
        A list of up to 100 words; fewer if the dict is smaller.
    """
    return _first_n_keys(idf_for_word, 100)


def find_top_150(idf_for_word):
    """Return the first 150 words of ``idf_for_word``.

    Args:
        idf_for_word: Dict keyed by word. The selection follows the dict's
            iteration order, so the caller is expected to pass it already
            sorted (``calculate_idf`` returns it in ascending IDF order).

    Returns:
        A list of up to 150 words; fewer if the dict is smaller.
    """
    return _first_n_keys(idf_for_word, 150)
 

In [None]:
# IDF computed from the per-document counts before (idf) and after
# (idf_cleaned) merging prefixed forms.
# NOTE(review): the two results can only differ if unique_word_count is a
# separate object from word_count_in_doc; if the prefix merge modified its
# input in place, both calls see the same data - confirm.
idf=calculate_idf(word_count_in_doc)
idf_cleaned=calculate_idf(unique_word_count)

### Top 100 and 150 words that are frequent across documents (not cleaned)

In [None]:
# The 100 words with the lowest IDF (not cleaned).
top_100_frequent_idf=find_top_100(idf)

In [None]:
# The 150 words with the lowest IDF (not cleaned).
top_150_frequent_idf=find_top_150(idf)

### Top 100 and 150 words that are frequent across documents (cleaned)

In [None]:
# The 100 words with the lowest IDF after merging prefixed forms.
top_100_freq_idf_cleaned= find_top_100(idf_cleaned)

In [None]:
# The 150 words with the lowest IDF after merging prefixed forms.
top_150_freq_idf_cleaned= find_top_150(idf_cleaned)