<a href="https://colab.research.google.com/github/maryamgaber/Detection-of-AI-Generated-Arabic-Text/blob/main/Phase_1%262.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Task 1.2: Use the Hugging Face `datasets` library to download the arabic-generated-abstracts dataset directly into a Python environment (Google Colab).

In [None]:
import os
import json

# Folder that is scanned recursively for notebooks.
# If your notebooks are in the current working directory, keep "."
notebooks_folder = "."  # change to "/content/your_folder" if needed


def strip_widgets_metadata(notebook_path):
    """Remove the top-level ``metadata.widgets`` entry from one notebook file.

    Parameters
    ----------
    notebook_path : str
        Path of the ``.ipynb`` file to clean in place.

    Returns
    -------
    bool
        True when the entry existed and the file was rewritten,
        False when there was nothing to remove (the file is left untouched).

    Raises
    ------
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    with open(notebook_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    metadata = nb.get("metadata") if isinstance(nb, dict) else None
    if not isinstance(metadata, dict) or "widgets" not in metadata:
        return False

    del metadata["widgets"]
    with open(notebook_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text (e.g. Arabic) as written
        # instead of expanding every character into a \uXXXX escape.
        json.dump(nb, f, indent=1, ensure_ascii=False)
        f.write("\n")
    return True


# Walk through all files in the folder and clean every notebook found
for root, dirs, files in os.walk(notebooks_folder):
    # Do not descend into checkpoint folders: they hold autosave copies only.
    dirs[:] = [d for d in dirs if d != ".ipynb_checkpoints"]
    for file in files:
        if not file.endswith(".ipynb"):
            continue
        notebook_path = os.path.join(root, file)
        try:
            cleaned = strip_widgets_metadata(notebook_path)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # One unreadable notebook must not stop the remaining ones.
            print(f"Skipped unreadable notebook: {notebook_path} ({exc})")
            continue
        if cleaned:
            print(f"Cleaned widgets metadata in: {notebook_path}")
        else:
            print(f"No widgets metadata found in: {notebook_path}")

print("✅ All notebooks are cleaned and ready for GitHub upload!")


In [None]:
# !pip install datasets
# !pip install python-dotenv


In [None]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Call python-dotenv first so that HF_TOKEN can be read from the environment,
# then hand that token to the Hugging Face Hub login.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

In [None]:
from datasets import load_dataset

# Hub repository that holds the corpus used in this notebook.
# Login using e.g. `huggingface-cli login` to access this dataset.
dataset_id = "KFUPM-JRCAI/arabic-generated-abstracts"
dataset = load_dataset(dataset_id)
print(dataset)


### Task 1.3: Perform initial data exploration:
#### 1- Load and inspect the dataset structure (columns, data types).


In [None]:
# Look at a single split ('by_polishing'): first its feature schema
# (column names and types), then the split's own summary repr.
polishing_split = dataset['by_polishing']

print("\nFeatures in 'by_polishing':")
print(polishing_split.features)

print("\nDataset info for 'by_polishing':")
print(polishing_split)




#### 2- Check the distribution of the target variable (label: human vs. AI-generated) for dataset["by_polishing"].


In [None]:
# Label distribution for the 'by_polishing' split.
split1 = dataset["by_polishing"]

# Every row holds one human abstract ...
num_human = len(split1["original_abstract"])

# ... and one generated abstract per model column (4 per row).
num_ai = sum(
    len(split1[column_name])
    for column_name in (
        "allam_generated_abstract",
        "jais_generated_abstract",
        "llama_generated_abstract",
        "openai_generated_abstract",
    )
)

print("Number of human abstracts:", num_human)
print("Number of AI-generated abstracts:", num_ai)

# Share of each class, as a percentage of all abstracts
total = num_human + num_ai
human_share = round(num_human / total * 100, 2)
ai_share = round(num_ai / total * 100, 2)
print("Human %:", human_share)
print("AI %:", ai_share)

#### 3- Assess data quality: check for missing values, duplicates, and inconsistencies:


Missing values → any None/NaN in columns

Duplicates → same abstract appearing multiple times

Inconsistencies → like empty strings " " or unusual data

In [None]:
import pandas as pd

# Work on a pandas copy of the split: the checks below are simpler on a DataFrame.
df = pd.DataFrame(split1)

# 1. Missing values (None/NaN) per column
print("Missing values per column:")
print(df.isna().sum())
print("_________________________________________")

# 2. Duplicates: fully identical rows first, then repeated values inside each column
print("\nNumber of duplicate rows:", df.duplicated().sum())
for col in df.columns:
    repeated = df[col].duplicated().sum()
    print(f"Duplicates in column {col}: {repeated}")
print("_________________________________________")


# 3. Inconsistencies: values that are empty once surrounding whitespace is stripped
for col in df.columns:
    blank_mask = df[col].map(lambda value: str(value).strip() == "")
    print(f"Empty/blank values in column {col}: {blank_mask.sum()}")


#### Check the distribution of the target variable (label: human vs. AI-generated) for dataset["from_title"].

In [None]:
# Label distribution for the 'from_title' split.
split2 = dataset["from_title"]

# Every row holds one human abstract ...
num_human = len(split2["original_abstract"])

# ... and one generated abstract per model column (4 per row).
num_ai = sum(
    len(split2[column_name])
    for column_name in (
        "allam_generated_abstract",
        "jais_generated_abstract",
        "llama_generated_abstract",
        "openai_generated_abstract",
    )
)

print("Number of human abstracts:", num_human)
print("Number of AI-generated abstracts:", num_ai)

# Share of each class, as a percentage of all abstracts
total = num_human + num_ai
human_share = round(num_human / total * 100, 2)
ai_share = round(num_ai / total * 100, 2)
print("Human %:", human_share)
print("AI %:", ai_share)

In [None]:
import pandas as pd

# Work on a pandas copy of the split: the checks below are simpler on a DataFrame.
df = pd.DataFrame(split2)

# 1. Missing values (None/NaN) per column
print("Missing values per column:")
print(df.isna().sum())
print("_________________________________________")

# 2. Duplicates: fully identical rows first, then repeated values inside each column
print("\nNumber of duplicate rows:", df.duplicated().sum())
for col in df.columns:
    repeated = df[col].duplicated().sum()
    print(f"Duplicates in column {col}: {repeated}")
print("_________________________________________")


# 3. Inconsistencies: values that are empty once surrounding whitespace is stripped
for col in df.columns:
    blank_mask = df[col].map(lambda value: str(value).strip() == "")
    print(f"Empty/blank values in column {col}: {blank_mask.sum()}")


#### Check the distribution of the target variable (label: human vs. AI-generated) for dataset["from_title_and_content"].

In [None]:
# Label distribution for the 'from_title_and_content' split.
split3 = dataset["from_title_and_content"]

# Every row holds one human abstract ...
num_human = len(split3["original_abstract"])

# ... and one generated abstract per model column (4 per row).
num_ai = sum(
    len(split3[column_name])
    for column_name in (
        "allam_generated_abstract",
        "jais_generated_abstract",
        "llama_generated_abstract",
        "openai_generated_abstract",
    )
)

print("Number of human abstracts:", num_human)
print("Number of AI-generated abstracts:", num_ai)

# Share of each class, as a percentage of all abstracts
total = num_human + num_ai
human_share = round(num_human / total * 100, 2)
ai_share = round(num_ai / total * 100, 2)
print("Human %:", human_share)
print("AI %:", ai_share)

In [None]:
import pandas as pd

# Work on a pandas copy of the split: the checks below are simpler on a DataFrame.
# Note: this `df` is the frame the later preprocessing/EDA cells operate on.
df = pd.DataFrame(split3)

# 1. Missing values (None/NaN) per column
print("Missing values per column:")
print(df.isna().sum())
print("_________________________________________")

# 2. Duplicates: fully identical rows first, then repeated values inside each column
print("\nNumber of duplicate rows:", df.duplicated().sum())
for col in df.columns:
    repeated = df[col].duplicated().sum()
    print(f"Duplicates in column {col}: {repeated}")
print("_________________________________________")


# 3. Inconsistencies: values that are empty once surrounding whitespace is stripped
for col in df.columns:
    blank_mask = df[col].map(lambda value: str(value).strip() == "")
    print(f"Empty/blank values in column {col}: {blank_mask.sum()}")


## Phase 2 - Preprocessing

In [None]:
# task 2.1: Arabic Text Preprocessing

import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from datasets import load_dataset


In [None]:
# Fetch the NLTK "stopwords" corpus; stopwords.words("arabic") reads from it later on.
nltk.download("stopwords")

In [None]:
# Preview the first five rows (and thereby the column names) of the working frame.
# NOTE(review): `df` is whatever the most recent quality-check cell assigned —
# in top-to-bottom order that is the from_title_and_content split.
print(df.head(5))

# Define Arabic text cleaning functions

In [None]:
# Remove tashkeel (diacritics)
def remove_diacritics(text):
    """Return ``text`` with Arabic diacritic marks deleted.

    The marks removed are the code points U+0617–U+061A and U+064B–U+0652;
    every other character is left as it is.
    """
    tashkeel_pattern = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    return tashkeel_pattern.sub('', text)

In [None]:
# Normalize Arabic text
def normalize_arabic(text):
    """Unify common letter variants and blank out characters outside the Arabic block.

    Letter mapping (applied per character):
      * alef with hamza above/below and alef with madda -> bare alef
      * alef maksura -> yeh
      * waw with hamza -> waw
      * yeh with hamza -> yeh
      * teh marbuta -> heh

    Afterwards every run of characters that is neither a space nor inside
    U+0600–U+06FF is replaced by a single space.
    """
    letter_map = str.maketrans({
        "\u0625": "\u0627",  # alef with hamza below -> alef
        "\u0623": "\u0627",  # alef with hamza above -> alef
        "\u0622": "\u0627",  # alef with madda       -> alef
        "\u0649": "\u064A",  # alef maksura          -> yeh
        "\u0624": "\u0648",  # waw with hamza        -> waw
        "\u0626": "\u064A",  # yeh with hamza        -> yeh
        "\u0629": "\u0647",  # teh marbuta           -> heh
    })
    unified = text.translate(letter_map)
    # remove non-Arabic chars
    return re.sub("[^\u0600-\u06FF ]+", " ", unified)

In [None]:
# Initialize stopwords and stemmer.
#
# preprocess_text() compares tokens against this set only AFTER the text has gone
# through remove_diacritics() and normalize_arabic(). The NLTK list is used as shipped,
# so any stop word spelled with a letter that normalize_arabic() rewrites
# (hamza-carrying alef, alef maksura, teh marbuta, ...) could never match a
# normalized token. Adding the normalized spelling of every stop word closes that gap;
# the original spellings are kept so existing lookups still behave the same.
_raw_arabic_stopwords = set(stopwords.words("arabic"))
arabic_stopwords = _raw_arabic_stopwords | {
    normalize_arabic(remove_diacritics(word)).strip()
    for word in _raw_arabic_stopwords
}
arabic_stopwords.discard("")  # a stop word that normalizes to nothing must not match
stemmer = ISRIStemmer()

In [None]:
# Full preprocessing pipeline
def preprocess_text(text):
    """Clean one abstract and return it as a space-joined string of stems.

    Steps, in order: cast to ``str``, strip diacritics, normalize letters,
    split on whitespace, drop tokens found in ``arabic_stopwords``, and stem
    the remaining tokens with the module-level ``stemmer``.
    """
    normalized = normalize_arabic(remove_diacritics(str(text)))
    stems = [
        stemmer.stem(token)
        for token in normalized.split()
        if token not in arabic_stopwords
    ]
    return " ".join(stems)

In [None]:
# Apply preprocessing: each text column gets a sibling "<column>_clean" column
text_columns = [
    'original_abstract',
    'allam_generated_abstract',
    'jais_generated_abstract',
    'llama_generated_abstract',
    'openai_generated_abstract'
]
for col in text_columns:
    df[f"{col}_clean"] = df[col].map(preprocess_text)

print(" Preprocessing complete! Here are the new columns:")
print(df.columns)
df.head(2)



# Task 2.2: Exploratory Data Analysis (EDA)



In [None]:

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import seaborn as sns
import numpy as np

In [None]:
# Stack the cleaned output of the four generator columns into one flat list of texts
ai_clean_columns = [
    'allam_generated_abstract_clean',
    'jais_generated_abstract_clean',
    'llama_generated_abstract_clean',
    'openai_generated_abstract_clean',
]
ai_texts = (
    pd.concat([df[column_name] for column_name in ai_clean_columns], axis=0)
    .dropna()
    .tolist()
)


In [None]:
# Cleaned human-written abstracts as a plain list (missing entries dropped)
human_clean = df['original_abstract_clean'].dropna()
human_texts = human_clean.tolist()


In [None]:
# --- Statistical Analysis ---
def text_stats(texts):
    """Compute simple lexical statistics over a collection of texts.

    Parameters
    ----------
    texts : iterable of str
        Texts to analyse; each one is split on whitespace.

    Returns
    -------
    tuple
        ``(avg_word_len, avg_sent_len, ttr)`` where
        * ``avg_word_len`` is the mean number of characters per word,
        * ``avg_sent_len`` is the mean number of words per text
          (one text is treated as one "sentence"),
        * ``ttr`` is the type-token ratio: distinct words / total words.
        ``(0.0, 0.0, 0.0)`` is returned when there are no words at all,
        instead of dividing by zero.
    """
    # Tokenize once so that one-shot iterables work and nothing is split twice.
    token_lists = [txt.split() for txt in texts]
    words = [w for tokens in token_lists for w in tokens]
    if not words:
        return 0.0, 0.0, 0.0

    avg_word_len = np.mean([len(w) for w in words])  # average word length
    avg_sent_len = np.mean([len(tokens) for tokens in token_lists])  # average words per text
    ttr = len(set(words)) / len(words)  # lexical diversity of the texts
    return avg_word_len, avg_sent_len, ttr

In [None]:
# (avg word length, avg words per text, TTR) for each source
stats_human, stats_ai = text_stats(human_texts), text_stats(ai_texts)

In [None]:
# One summary line per source; the trailing space in "AI-generated " keeps the colons aligned
print("\n Statistical Summary:")
for source_name, (word_len, sent_len, ratio) in (
    ("Human-written", stats_human),
    ("AI-generated ", stats_ai),
):
    print(f"{source_name}: Avg word len={word_len:.2f}, Avg sent len={sent_len:.2f}, TTR={ratio:.3f}")

In [None]:
# --- N-gram Frequency ---
def plot_top_ngrams(texts, n=2, top_k=15, label=None):
    """Draw a horizontal bar chart of the most frequent n-grams in ``texts``.

    Parameters
    ----------
    texts : list of str
        Documents to count n-grams in.
    n : int, default 2
        N-gram size (2 = bigrams).
    top_k : int, default 15
        Number of n-grams to show.
    label : str, optional
        Name of the text source used in the plot title (e.g. "Human" or "AI").
        When omitted, the previous behaviour is kept: the title says "Human" if
        ``texts`` equals a global ``human_texts`` list and "AI" otherwise.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    if label is None:
        # Look the global up defensively so the function also works when
        # `human_texts` has not been defined yet.
        reference = globals().get("human_texts")
        label = 'Human' if reference is not None and texts == reference else 'AI'

    vec = CountVectorizer(ngram_range=(n, n))
    bag = vec.fit_transform(texts)
    sum_words = bag.sum(axis=0)  # total count of each n-gram over all documents
    freqs = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    freqs = sorted(freqs, key=lambda x: x[1], reverse=True)[:top_k]
    words, counts = zip(*freqs)
    plt.figure(figsize=(10,4))
    sns.barplot(x=list(counts), y=list(words))
    # The title used to repeat "{n}-grams" twice; say it once.
    plt.title(f"Top {top_k} {n}-grams for {label} abstracts")
    plt.show()

print("\n Top Bigrams for Human-written abstracts:")
plot_top_ngrams(human_texts, n=2, label="Human")

print("\n Top Bigrams for AI-generated abstracts:")
plot_top_ngrams(ai_texts, n=2, label="AI")


In [None]:
#Sentence Length Distribution

#Purpose: Compare how long the abstracts are (in words or characters).
#AI-generated text might be longer, more repetitive, or more uniform than human-written text.

In [None]:

import matplotlib.pyplot as plt

# Length of every abstract in whitespace-separated words (raw, un-preprocessed text)
df["human_length"] = df["original_abstract"].map(lambda text: len(text.split()))
df["openai_length"] = df["openai_generated_abstract"].map(lambda text: len(text.split()))

# Overlay both length distributions in one figure
plt.figure(figsize=(8,5))
for length_col, legend_label, colour in (
    ("human_length", "Human-written", 'blue'),
    ("openai_length", "Openai-generated", 'orange'),
):
    plt.hist(df[length_col], bins=30, alpha=0.6, label=legend_label, color=colour)
plt.xlabel("Sentence Length (words)")
plt.ylabel("Frequency")
plt.title("Sentence Length Distribution")
plt.legend()
plt.show()

In [None]:
#Vocabulary Richness (Type–Token Ratio)

#Purpose: See how diverse the vocabulary is — humans often use richer language.

In [None]:
def type_token_ratio(text):
    """Return distinct words divided by total words for ``text``.

    Words are whitespace-separated tokens. A text without any word yields 0.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

# Per-abstract type-token ratio for the human text and the OpenAI text (raw columns)
for ttr_col, text_col in (
    ("human_ttr", "original_abstract"),
    ("openai_ttr", "openai_generated_abstract"),
):
    df[ttr_col] = df[text_col].map(type_token_ratio)

# Side-by-side box plots of the two TTR distributions
plt.figure(figsize=(6,5))
ttr_series = [df["human_ttr"], df["openai_ttr"]]
plt.boxplot(ttr_series, labels=["Human", "Open AI"])
plt.title("Vocabulary Richness (Type–Token Ratio)")
plt.ylabel("TTR Score")
plt.show()

In [None]:
#Word Frequency Comparison (Side-by-Side Bar Plot)

#Purpose: See which words are overused by Open AI vs humans.

In [None]:
from collections import Counter
import pandas as pd

# Flatten each column into one list of whitespace-separated words.
# Missing entries are dropped first so that str.join does not fail on None/NaN.
human_words = " ".join(df["original_abstract"].dropna().astype(str)).split()
Openai_words = " ".join(df["openai_generated_abstract"].dropna().astype(str)).split()

human_freq = Counter(human_words)
ai_freq = Counter(Openai_words)

# Words that are among the 100 MOST FREQUENT in both sources.
# (Slicing Counter.keys() would give the first 100 words *encountered*, because a
# Counter keeps insertion order, not frequency order.)
top_human = {word for word, _ in human_freq.most_common(100)}
top_ai = {word for word, _ in ai_freq.most_common(100)}
common_words = top_human & top_ai

# One row per shared word: (word, count in human text, count in OpenAI text)
data = [(w, human_freq[w], ai_freq[w]) for w in common_words]

freq_df = (
    pd.DataFrame(data, columns=["word", "human", "Open ai"])
    .sort_values("human", ascending=False)
    .head(15)
)

freq_df.plot(x="word", kind="bar", figsize=(10,5), title="Top Words: Human vs Open AI", rot=45)
plt.ylabel("Frequency")
plt.show()