In [1]:
import pandas as pd
import nltk
from pyarabic.araby import strip_tashkeel, strip_harakat, strip_lastharaka, strip_tatweel, normalize_hamza
import re

def delete_links(input_text):
    """Replace every URL-like substring of ``input_text`` with a single space.

    A link may start with an ``http://`` / ``https://`` scheme, with ``www``
    (optionally followed by up to three digits) and a dot, or with a bare
    ``domain.tld/`` whose TLD is 2-4 letters. Matching is case-insensitive,
    allows parenthesised segments inside the link, and never ends the match on
    trailing punctuation such as ``.``, ``,`` or a closing quote.

    Args:
        input_text: Text that may contain links.

    Returns:
        The text with each matched link replaced by ``' '``.
    """
    url_regex = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(url_regex, ' ', input_text)

def delete_repeated_characters(input_text):
    """Shorten any run of three or more identical characters to exactly two.

    Runs of one or two characters are left alone. Newlines are never collapsed
    because ``.`` does not match them.

    Args:
        input_text: Text that may contain elongated words.

    Returns:
        The text with long character runs squeezed to two characters.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r"\1\1", input_text)

def replace_letters(input_text):
    """Normalise Arabic letter variants to a single canonical form.

    Hamza-carrying and madda alef forms are mapped to bare alef, and
    teh marbuta is mapped to heh.

    Args:
        input_text: Text to normalise.

    Returns:
        The text with every mapped letter replaced; all other characters are
        left unchanged.
    """
    # Keys must be non-empty: an empty key would become an empty regex
    # alternative that matches at every position of the text.
    mapping = {"أ": "ا", "ة": "ه", "إ": "ا", "آ": "ا"}
    # Escape only while building the pattern; lookups use the raw matched text.
    pattern = re.compile("|".join(re.escape(letter) for letter in mapping))
    return pattern.sub(lambda m: mapping[m.group(0)], input_text)

def clean_text(input_text):
    """Blank out punctuation, then re-tokenise and re-join with single spaces.

    Args:
        input_text: Text to clean.

    Returns:
        The tokens produced by ``nltk.word_tokenize`` joined by one space.
    """
    # NOTE(review): inside this character class the escaped plus, the hyphen
    # and the colon form a RANGE (U+002B..U+003A), so besides the listed
    # punctuation this also blanks out '.', '/' and the ASCII digits 0-9.
    # Confirm that digit/period removal is intended before touching it.
    punctuation_class = r'[/(){}\[\]|@âÂ,;\?\'\"\*…؟–’،!&\+-:؛-]'
    spaced_text = re.sub(punctuation_class, " ", input_text)
    return ' '.join(nltk.word_tokenize(spaced_text))

def remove_vowelization(input_text):
    """Remove Arabic diacritics (tashkeel) and the tatweel elongation mark.

    Args:
        input_text: Text that may contain diacritics.

    Returns:
        The text with shadda, fatha, fathatan, damma, dammatan, kasra,
        kasratan, sukun and tatweel characters deleted. Whitespace is kept.
    """
    # The marks are spelled as explicit code points: typed literally they are
    # invisible combining characters that editors and diffs easily mangle.
    marks = (
        "\u0651"  # shadda
        "\u064E"  # fatha
        "\u064B"  # fathatan
        "\u064F"  # damma
        "\u064C"  # dammatan
        "\u0650"  # kasra
        "\u064D"  # kasratan
        "\u0652"  # sukun
        "\u0640"  # tatweel
    )
    return re.sub("[" + marks + "]", '', input_text)

def _arabic_stop_words():
    """Return the Arabic stop-word set, building it only on the first call."""
    cached = getattr(_arabic_stop_words, "_cache", None)
    if cached is None:
        # NLTK's Arabic list plus a few extra spellings added by hand.
        extra_words = ['خلال' , 'الى' , 'ان' , 'او' , 'انه']
        cached = frozenset(nltk.corpus.stopwords.words("arabic") + extra_words)
        _arabic_stop_words._cache = cached
    return cached


def delete_stopwords(input_text):
    """Drop Arabic stop words from whitespace-separated text.

    Args:
        input_text: Text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The stop-word set is loop-invariant; reuse it instead of reloading the
    # corpus for every story this function is applied to.
    stop_words = _arabic_stop_words()
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    return ' '.join(w for w in tokens if w not in stop_words)


# NOTE: this preprocessing improved the ROUGE-L score.
def preprocess_text(text):
    """Run the full cleaning pipeline over one piece of text.

    The steps are applied strictly in this order: link removal, squeezing of
    repeated characters, tashkeel and tatweel stripping, punctuation cleanup
    and re-tokenisation, diacritic removal, letter normalisation and finally
    stop-word removal.

    Args:
        text: Raw input text.

    Returns:
        The fully preprocessed text.
    """
    pipeline = (
        delete_links,
        delete_repeated_characters,
        strip_tashkeel,
        strip_tatweel,
        clean_text,
        remove_vowelization,
        replace_letters,
        delete_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text


In [2]:
import glob
import pandas as pd


# Load every stories*.csv file from the working directory.
file_pattern = "stories*.csv"
# glob.glob returns matches in arbitrary order; sort them so the order of
# `dataframes` (and everything derived from it) is the same on every run.
csv_files = sorted(glob.glob(file_pattern))

dataframes = [pd.read_csv(path) for path in csv_files]

# Preprocess the `story` column of each frame in place.
for df in dataframes:
    df["story"] = df["story"].apply(preprocess_text)

In [3]:
def calculate_word_count(df):
    """Count the whitespace-separated tokens of every entry in ``df['story']``.

    Args:
        df: DataFrame with a ``story`` column of strings.

    Returns:
        A Series, indexed like ``df``, holding the token count per story.
    """
    def _token_count(story):
        return len(story.split())

    return df['story'].apply(_token_count)

# Word count per story, one column per DataFrame, named after the topic of
# that frame's first row.
# The table is built from a dict of Series so the row index is the union of
# all frames' indexes: assigning columns one by one into an empty DataFrame
# would pin the index to the first frame and silently drop the extra rows of
# any longer frame.
word_counts_by_topic = {
    str(df["topic"].iloc[0]): calculate_word_count(df) for df in dataframes
}
word_count_df = pd.DataFrame(word_counts_by_topic)

# Output the final DataFrame with word count for each DataFrame
word_count_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Histograms of the per-story word counts: one panel per column of
# word_count_df, with the column mean marked by a dashed red line.

# Calculate the average of each column
column_averages = word_count_df.mean()

# Grid layout: at most three panels per row
num_columns = len(word_count_df.columns)
num_rows = (num_columns + 2) // 3
num_cols = min(3, num_columns)

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it a 1x1 grid yields a bare Axes object, which has no flatten().
fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows), squeeze=False)
axs = axs.flatten()

# Plot histograms and add average annotations for each column
for i, column in enumerate(word_count_df.columns):
    sns.histplot(word_count_df[column], ax=axs[i], kde=True)
    axs[i].set_title(f"Histogram of {column}")
    axs[i].set_xlabel(column)

    # Mark the column mean and label it near the top of the panel
    avg_value = column_averages[column]
    axs[i].axvline(avg_value, color='red', linestyle='dashed', linewidth=2)
    axs[i].text(avg_value, axs[i].get_ylim()[1]*0.9, f'Average: {avg_value:.2f}', color='red', ha='center')

# Hide the unused panels of the last grid row
for i in range(num_columns, num_rows * num_cols):
    axs[i].axis('off')

# Adjust layout to avoid overlapping titles
plt.tight_layout()

# Show the plot
plt.show()


In [52]:
from collections import Counter

# Stack every per-file DataFrame into one corpus-wide frame.
all_dataframes = pd.concat(dataframes)

# Join all stories with spaces and split on whitespace to get one flat token
# list, then report the ten most frequent tokens.
all_words = ' '.join(all_dataframes['story']).split()
word_count = Counter(all_words)
word_count.most_common(10)

[('المغرب', 11447),
 ('المغربيه', 10404),
 ('كورونا', 7743),
 ('الامازيغيه', 7561),
 ('المغربي', 6546),
 ('حاله', 6390),
 ('اجل', 6147),
 ('سنه', 5964),
 ('وفي', 5775),
 ('الحكومه', 5667)]

In [53]:
# Ten most frequent 2-grams (bigrams) in the corpus.
# NOTE(review): all_words is one flat token list over every story, so a few
# n-grams straddle the boundary between two adjacent stories.
from nltk import ngrams

n = 2
ngrams_freq = Counter(ngrams(all_words, n))
ngrams_freq.most_common(10)

[(('فيروس', 'كورونا'), 2840),
 (('كورونا', 'المستجد'), 1949),
 (('الحجر', 'الصحي'), 1624),
 (('محمد', 'السادس'), 1602),
 (('الدار', 'البيضاء'), 1451),
 (('النيابه', 'العامه'), 1286),
 (('جائحه', 'كورونا'), 1259),
 (('بفيروس', 'كورونا'), 1233),
 (('رئيس', 'الحكومه'), 1232),
 (('الملك', 'محمد'), 1209)]

In [56]:
# Ten most frequent 3-grams (trigrams) in the corpus.
# NOTE(review): all_words is one flat token list over every story, so a few
# n-grams straddle the boundary between two adjacent stories.
from nltk import ngrams

n = 3
ngrams_freq = Counter(ngrams(all_words, n))
ngrams_freq.most_common(10)

[(('الملك', 'محمد', 'السادس'), 1179),
 (('فيروس', 'كورونا', 'المستجد'), 1112),
 (('لجريده', 'هسبريس', 'الالكترونيه'), 827),
 (('تصريح', 'لجريده', 'هسبريس'), 680),
 (('بفيروس', 'كورونا', 'المستجد'), 652),
 (('حاله', 'الطوارئ', 'الصحيه'), 630),
 (('النيابه', 'العامه', 'المختصه'), 616),
 (('سعد', 'الدين', 'العثماني'), 596),
 (('الملكي', 'للثقافه', 'الامازيغيه'), 527),
 (('حزب', 'العداله', 'والتنميه'), 477)]