In [10]:
import os
import re
import string
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mariam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Loading Data

In [11]:
def load_data(dir):
    raw_data = {}
    if os.path.exists(dir):
        for file_name in os.listdir(dir):
            if file_name.endswith('.txt'):
                file_path = os.path.join(dir, file_name)
                with open(file_path, 'r', encoding='utf-8') as f:
                    raw_data[file_name] = f.read()
    
    return raw_data

### Basic Cleaning & Normalization

In [12]:
def remove_timestamps(text):
    pattern = r'\d+\.\d+:\s*'
    text_lines = text.split('\n')
    cleaned_lines = [re.sub(pattern, '', line) for line in text_lines]
    return '\n'.join(cleaned_lines)

In [13]:
def remove_punctuations_and_numbers(text):
    arabic_punctuation = '،؛؟«»ـ'
    additional_punctuation = '[]\\'
    all_punctuation = string.punctuation + arabic_punctuation + additional_punctuation
    arabic_diacritics = re.compile(r'[\u064B-\u065F\u0610-\u061A\u06D6-\u06ED]')
    
    # Split text into lines to preserve line breaks
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        # Remove diacritics
        line = re.sub(arabic_diacritics, '', line)
        
        # Split into words and remove punctuation/numbers from each word
        cleaned_words = []
        words = line.split()
        
        for word in words:
            # Remove any character that is punctuation or number
            cleaned_word = ''.join(char for char in word 
                                 if char not in all_punctuation 
                                 and not char.isdigit())
            # Only add non-empty words
            if cleaned_word:
                cleaned_words.append(cleaned_word)
                
        cleaned_lines.append(' '.join(cleaned_words))
    
    # Join lines back together with newlines
    return '\n'.join(cleaned_lines)

In [14]:
def remove_single_letters(text):
    return re.sub(r'\b[ء-ي]\b', '', text)

In [15]:
def remove_repeated_words(text):
    return re.sub(r'(\b\w+\b)(\s+\1\b\s*)+', r'\1 ', text).strip()

### Tokenization

In [16]:
from nltk.tokenize import RegexpTokenizer

def tokenize_text(text):
    pattern = r'\b\w+\b|[^\w\s]'
    word_tokenizer = RegexpTokenizer(pattern)
    tokens = word_tokenizer.tokenize(text)
    return tokens

### Stopwords Removal

In [17]:
stopwords = []
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stopwords]


### Stemming

In [18]:
from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()

def stem_tokens(tokens):
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens


### Putting it all together

In [19]:
import os
def preprocess_transcripts(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    raw_data = load_data(input_dir)

    for filename, text in raw_data.items():
        text = remove_timestamps(text)
        text = remove_punctuations_and_numbers(text)
        text = remove_single_letters(text)
        text = remove_repeated_words(text)

        tokens = tokenize_text(text)
        tokens = remove_stopwords(tokens)
        tokens = stem_tokens(tokens)
        
        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(' '.join(tokens))

raw_dir = 'Raw Data'
preprocessed_dir = 'Preprocessed Data'

preprocess_transcripts(raw_dir, preprocessed_dir)

### Converting into TF-IDF representation

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

In [21]:
import pandas as pd

def process_combined_corpus(preprocessed_files):
    corpus = list(preprocessed_files.values())
    tfidf_matrix_all, vectorizer_all = compute_tfidf(corpus)
    
    df_all = pd.DataFrame(
        tfidf_matrix_all.toarray(),
        columns=vectorizer_all.get_feature_names_out(),
        index=list(preprocessed_files.keys())
    )
    
    df_all.to_csv('all_documents_tfidf.csv')

preprocessed_files = load_data(preprocessed_dir)
process_combined_corpus(preprocessed_files)

# Named Entity Recognition

In [22]:
from transformers import pipeline

# Use a better Arabic NER model
model_name = "AUBMindLab/bert-base-arabert" # AUBMindLab/bert-base-arabert"  # Or try "CAMeL-Lab/bert-base-arabic-camelbert-msa"
ner_pipeline = pipeline("ner", model=model_name, tokenizer=model_name, aggregation_strategy="simple")

# Read text from file
file_path = "./Preprocessed Data/مافيا  الدحيح.txt"
with open(file_path, "r", encoding="utf-8") as file:
    arabic_text = file.read()

# Process text in chunks to avoid truncation
max_length = 510  # Safe limit for transformer models
words = arabic_text.split()
chunks = [" ".join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

# Process each chunk
all_ner_results = []
for chunk in chunks:
    ner_results = ner_pipeline(chunk)
    all_ner_results.extend(ner_results)

    # Sort NER results by the 'start' position
    sorted_ner_results = sorted(all_ner_results, key=lambda x: x['start'])

    # Print sorted results
    for entity in sorted_ner_results:
        print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")
# # Print results
# for entity in all_ner_results:
#     print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")



  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForTokenClassification were not initialized from the model checkpoint at AUBMindLab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Entity: قلتل تيج وحد, Label: LABEL_1, Score: 0.6203
Entity: جيبش, Label: LABEL_0, Score: 0.5239
Entity: عا, Label: LABEL_1, Score: 0.5556
Entity: ##ك, Label: LABEL_0, Score: 0.5178
Entity: وحد اه, Label: LABEL_1, Score: 0.7013
Entity: ##و, Label: LABEL_0, Score: 0.5937
Entity: ابن جن, Label: LABEL_1, Score: 0.5724
Entity: ##ن, Label: LABEL_0, Score: 0.5544
Entity: عاك, Label: LABEL_1, Score: 0.5694
Entity: سلح, Label: LABEL_0, Score: 0.5052
Entity: خفش معاي, Label: LABEL_1, Score: 0.5940
Entity: ##يش سلح, Label: LABEL_0, Score: 0.5370
Entity: جيب, Label: LABEL_1, Score: 0.5113
Entity: ##تش, Label: LABEL_0, Score: 0.5258
Entity: عاك سلح, Label: LABEL_1, Score: 0.5361
Entity: قبل, Label: LABEL_0, Score: 0.5151
Entity: خطر أقل, Label: LABEL_1, Score: 0.5262
Entity: عنديش غبو, Label: LABEL_0, Score: 0.5305
Entity: جاي قسم نطق اشي, Label: LABEL_1, Score: 0.5966
Entity: هآخد نيوجيرسي, Label: LABEL_0, Score: 0.5264
Entity: ونيويور, Label: LABEL_1, Score: 0.5660
Entity: ##ك, Label: LABEL_0, Sc