In [None]:
import torch
# Report GPU availability. The device name is only queried when CUDA is
# present, because torch.cuda.get_device_name(0) raises on a CPU-only runtime.
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("GPU name:", "none (running on CPU)")

In [None]:
!pip install torch transformers pandas scikit-learn evaluate openpyxl numpy

In [None]:
!pip install farasa
!pip install arabic-reshaper
!pip install wordcloud
!pip install tashaphyne

In [None]:
!pip install farasapy

In [None]:
import pandas as pd
import re
import string
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Load the raw comments, keep only the text column and drop empty rows.
file_path = "comments_data.xlsx"
df = pd.read_excel(file_path)
df = df[['Comment Text']].dropna().reset_index(drop=True)

# Initialize Arabic NLP Tools
# NOTE(review): interactive=True presumably keeps one long-lived Farasa
# process per tool instead of starting one per call — confirm in farasapy docs.
segmenter = FarasaSegmenter(interactive=True)
stemmer = FarasaStemmer(interactive=True)

def clean_arabic_text(text):
    """Normalize, segment and stem a single Arabic comment.

    Steps, in order: strip the marks in U+064B–U+065F, map the alef variants
    to bare alef, map ta marbuta to ha and ya to alef maqsura, delete ASCII
    punctuation, trim, run the Farasa segmenter, then stem every
    whitespace-separated token.

    Args:
        text: The raw comment. Non-string values (e.g. a numeric Excel cell)
            are converted with ``str()`` instead of raising ``TypeError``
            inside ``re.sub``.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[\u064B-\u065F]', '', text)
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'ي', 'ى', text)
    # string.punctuation only covers ASCII punctuation; Arabic marks such as
    # the Arabic comma or question mark are left in place.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = segmenter.segment(text)
    text = " ".join([stemmer.stem(word) for word in text.split()])
    return text

# Add the normalized / segmented / stemmed form of every comment.
df["Cleaned Post"] = df["Comment Text"].apply(clean_arabic_text)

# Keyword alternations for the two cue types, wrapped in \b word boundaries.
# NOTE(review): the keywords are spelled in raw orthography (hamza forms,
# ta marbuta, ya, tanwin), which clean_arabic_text rewrites or strips —
# verify which text these patterns are meant to be matched against.
central_regex = r'\b(دراسة|بحث|بيانات|إحصائيات|لأن|لذلك|بالتالي|حسب|وفقًا ل)\b'
peripheral_regex = r'\b(عاجل|خطير|كارثي|أزمة|الخبراء يقولون|يدعون|تظهر الدراسات)\b'

# Function: Labeling Based on NLP Rules
def label_post(text):
    """Assign a rule-based cue label to one text.

    A central-keyword match wins outright. Otherwise the text is
    "Peripheral Cue" when it matches a peripheral keyword or when its
    TextBlob polarity exceeds 0.5. Everything else is "Neutral".
    """
    if re.search(central_regex, text):
        return "Central Cue"

    # The sentiment check only runs when no peripheral keyword matched.
    # NOTE(review): TextBlob's default analyzer is presumably English-only,
    # so the polarity of Arabic text may always be 0.0 — confirm.
    keyword_hit = re.search(peripheral_regex, text)
    if keyword_hit or TextBlob(text).sentiment.polarity > 0.5:
        return "Peripheral Cue"

    return "Neutral"

# Apply the keyword rules to the RAW comment text. The keyword patterns are
# written in raw orthography (hamza forms, ta marbuta, ya, tanwin), all of
# which clean_arabic_text rewrites or strips, so matching them against
# "Cleaned Post" silently missed most keywords.
df["Label"] = df["Comment Text"].astype(str).apply(label_post)
# Only rows that received one of the two cue labels are used for training.
df = df[df["Label"] != "Neutral"].reset_index(drop=True)

# TF-IDF for Feature Extraction (unigrams + bigrams of the cleaned text)
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X = vectorizer.fit_transform(df["Cleaned Post"])
y = df["Label"].map({"Central Cue": 0, "Peripheral Cue": 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM Model
model = SVC(kernel="linear")
model.fit(X_train, y_train)

# Evaluate Model on the held-out 20%
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classify New Posts
# Note: this predicts on the same rows the model was fitted/evaluated on.
df["Predicted Label"] = model.predict(vectorizer.transform(df["Cleaned Post"]))
df["Predicted Label"] = df["Predicted Label"].map({0: "Central Cue", 1: "Peripheral Cue"})

df.to_excel("classified_posts.xlsx", index=False)
print("✅ Automated Cue Classification Completed!")


In [None]:
import joblib

# Persist the fitted SVM cue classifier.
joblib.dump(model, "cue_classifier.pkl")

# Persist the TF-IDF vectorizer that was fitted together with it.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("✅ Model and Vectorizer Saved Successfully!")


In [None]:
import pandas as pd
import joblib
import re
import string
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer

# Reload the saved cue classifier and its TF-IDF vectorizer.
# NOTE: joblib.load unpickles the file; only load artifacts you trust.
model = joblib.load("cue_classifier.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Farasa tools required by clean_arabic_text.
segmenter = FarasaSegmenter(interactive=True)
stemmer = FarasaStemmer(interactive=True)

def clean_arabic_text(text):
    """Normalize, segment and stem a single Arabic comment.

    Steps, in order: strip the marks in U+064B–U+065F, map the alef variants
    to bare alef, map ta marbuta to ha and ya to alef maqsura, delete ASCII
    punctuation, trim, run the Farasa segmenter, then stem every
    whitespace-separated token.

    Args:
        text: The raw comment. Non-string values (e.g. a numeric Excel cell)
            are converted with ``str()`` instead of raising ``TypeError``
            inside ``re.sub``.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[\u064B-\u065F]', '', text)
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'ي', 'ى', text)
    # string.punctuation only covers ASCII punctuation; Arabic marks such as
    # the Arabic comma or question mark are left in place.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = segmenter.segment(text)
    text = " ".join([stemmer.stem(word) for word in text.split()])
    return text


In [None]:
large_file_path = "comments_data.xlsx"
large_output_path = "cues_labeled_comments_dataset.xlsx"
df_large = pd.read_excel(large_file_path)

# Keep only the comment text and drop empty rows.
df_large = df_large[['Comment Text']].dropna().reset_index(drop=True)

# Same cleaning as used when the vectorizer/classifier were fitted.
df_large["Cleaned Post"] = df_large["Comment Text"].apply(clean_arabic_text)

X_large = vectorizer.transform(df_large["Cleaned Post"])

df_large["Predicted Label"] = model.predict(X_large)

# Turn the numeric class ids back into readable label names.
df_large["Predicted Label"] = df_large["Predicted Label"].map({0: "Central Cue", 1: "Peripheral Cue"})

df_large.to_excel(large_output_path, index=False)

# Report the path that was actually written (the message used to name a
# different, non-existent file).
print(f"✅ Large Dataset Classification Completed! Results saved in '{large_output_path}'.")


In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Reload the cue-labelled comments file.
file_path = "cues_labeled_comments_dataset.xlsx"
df = pd.read_excel(file_path)

# Discard rows whose comment text is missing.
df = df.dropna(subset=['Comment Text'])

def combine_labels(row):
    """Return the row's predicted label as a list, or [] when it is missing."""
    candidates = (row["Predicted Label"],)
    return [label for label in candidates if pd.notna(label)]

# One list of labels per row (empty when the prediction is missing).
df["Labels"] = df.apply(combine_labels, axis=1)

# One-hot encode: one 0/1 column per distinct label.
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(df["Labels"])

labels_df = pd.DataFrame(labels_encoded, columns=mlb.classes_)

# Reset the index so the text column lines up row-for-row with labels_df,
# which carries a fresh default index.
final_df = pd.concat([df[['Comment Text']].reset_index(drop=True), labels_df], axis=1)

formatted_output_path = "formatted_labeled_cues_comment_dataset.xlsx"
final_df.to_excel(formatted_output_path, index=False)

# Report the path that was actually written (the message used to name a
# different file).
print(f"✅ Dataset formatted successfully and saved as {formatted_output_path}")


In [None]:
import pandas as pd

file_cues = "cues_post_data.xlsx"
file_propaganda = "propaganda_post_data.xlsx"
merged_output_path = "full_features_labeled_posts.xlsx"

df_cues = pd.read_excel(file_cues)
df_propaganda = pd.read_excel(file_propaganda)

# Inner join on the post text: only posts present in both files are kept.
df_merged = pd.merge(df_propaganda, df_cues, on="Post Content", how="inner")

df_merged.to_excel(merged_output_path, index=False)

# Report the path that was actually written (the message used to name
# 'merged_dataset.xlsx', which is never created).
print(f"✅ Merged dataset saved as '{merged_output_path}'")


both features

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from evaluate import load

# Load the merged posts file; the code below reads its "Post Content",
# "Central Cue", "Peripheral Cue" and "Propaganda" columns.
file_path = "full_features_labeled_posts.xlsx"
df = pd.read_excel(file_path)

def preprocess_text(text):
    """Apply light Arabic normalization to one string.

    Drops every character that is neither a word character nor whitespace,
    unifies alef variants, maps ta marbuta to ha and ya to alef maqsura,
    removes the marks in U+064B–U+065F, and trims surrounding whitespace.
    """
    import re

    # (pattern, replacement) pairs; they must run in exactly this order.
    rules = (
        (r'[^\w\s]', ''),
        (r'[إأآا]', 'ا'),
        (r'ة', 'ه'),
        (r'ي', 'ى'),
        (r'[\u064B-\u065F]', ''),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Cast to str first so non-string cells do not break the regex-based cleaning.
df["Processed_Text"] = df["Post Content"].astype(str).apply(preprocess_text)

def integrate_features(row):
    """Prefix the processed text with Yes/No sentences for the two cue flags.

    A flag counts as "Yes" only when its value equals 1.
    """
    if row["Central Cue"] == 1:
        central = "Central Cue: Yes."
    else:
        central = "Central Cue: No."

    if row["Peripheral Cue"] == 1:
        peripheral = "Peripheral Cue: Yes."
    else:
        peripheral = "Peripheral Cue: No."

    return "{} {} {}".format(central, peripheral, row['Processed_Text'])

# Model input: the cue flags rendered as text, followed by the cleaned post.
df["Final_Text"] = df.apply(integrate_features, axis=1)

# 70% train / 30% held out, stratified on the Propaganda label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["Final_Text"].tolist(), df["Propaganda"].tolist(),
    test_size=0.3, stratify=df["Propaganda"], random_state=42
)

# Split the held-out 30% evenly into validation and test, again stratified.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5, stratify=temp_labels, random_state=42
)

# AraBERT checkpoint, loaded with a 2-label classification head.
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

def tokenize_function(texts):
    """Encode texts with the module-level tokenizer as padded, truncated PyTorch tensors (max 512 tokens)."""
    options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
    return tokenizer(texts, **options)

# Tokenizing Data: each split is encoded in a single tokenizer call.
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings = tokenize_function(test_texts)

class PropagandaDataset(Dataset):
    """Pairs pre-computed encodings with their labels, one sample per index."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable tensor; labels: sequence.
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every field at idx plus "labels"."""
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx].clone().detach()
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = PropagandaDataset(train_encodings, train_labels)
val_dataset = PropagandaDataset(val_encodings, val_labels)
test_dataset = PropagandaDataset(test_encodings, test_labels)

# Evaluate and checkpoint once per epoch; the checkpoint with the lowest
# eval_loss is reloaded at the end of training.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    eval_steps=50,  # NOTE(review): presumably unused with an "epoch" strategy — confirm
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=0.00001,
    weight_decay=0.01,
    warmup_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    remove_unused_columns=False,
)

# 'balanced' class weights computed from the training labels only, then moved
# to the training device for use in the custom loss.
train_labels_np = np.array(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels_np),
    y=train_labels_np
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level class_weights."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch dict passed to the model; must contain "labels".
            return_outputs: When True, also return the model outputs.
            **kwargs: Extra arguments supplied by the Trainer; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when return_outputs is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        # Flatten with the logits' own class dimension instead of a literal 2,
        # so the loss keeps working if num_labels is ever changed.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Metric objects from the `evaluate` package, used by compute_metrics.
accuracy_metric = load("accuracy")
precision_metric = load("precision")
recall_metric = load("recall")
f1_metric = load("f1")

def compute_metrics(eval_pred):
    """Return accuracy plus unweighted class-averaged precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def class_average(metric, key):
        # average=None yields one score per class; take their plain mean.
        per_class = metric.compute(predictions=predictions, references=labels, average=None)[key]
        return np.mean(per_class).item()

    scores = {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    }
    scores["precision"] = class_average(precision_metric, "precision")
    scores["recall"] = class_average(recall_metric, "recall")
    scores["f1_score"] = class_average(f1_metric, "f1")
    return scores

# Fine-tune with the weighted-loss trainer; validation metrics come from compute_metrics.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# Final evaluation on the held-out test split.
test_results = trainer.evaluate(test_dataset)
print("\n🔍 Test Set Evaluation Results:")
for metric, value in test_results.items():
    print(f"{metric}: {value:.4f}")

# Save the model and tokenizer side by side in one directory.
model.save_pretrained("arabert_propaganda_model_with_cues")
tokenizer.save_pretrained("arabert_propaganda_model_with_cues")

print("\n✅ Model Training and Evaluation Complete! Model Saved.")


# New section

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from evaluate import load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

# Load the merged posts file; the code below reads its "Post Content",
# "Central Cue", "Peripheral Cue" and "Propaganda" columns.
file_path = "full_features_labeled_posts.xlsx"
df = pd.read_excel(file_path)

def preprocess_text(text):
    """Apply light Arabic normalization to one string.

    Drops every character that is neither a word character nor whitespace,
    unifies alef variants, maps ta marbuta to ha and ya to alef maqsura,
    removes the marks in U+064B–U+065F, and trims surrounding whitespace.
    """
    import re

    # (pattern, replacement) pairs; they must run in exactly this order.
    rules = (
        (r'[^\w\s]', ''),
        (r'[إأآا]', 'ا'),
        (r'ة', 'ه'),
        (r'ي', 'ى'),
        (r'[\u064B-\u065F]', ''),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Cast to str first so non-string cells do not break the regex-based cleaning.
df["Processed_Text"] = df["Post Content"].astype(str).apply(preprocess_text)

def integrate_features(row):
    """Prefix the processed text with Yes/No sentences for the two cue flags.

    A flag counts as "Yes" only when its value equals 1.
    """
    if row["Central Cue"] == 1:
        central = "Central Cue: Yes."
    else:
        central = "Central Cue: No."

    if row["Peripheral Cue"] == 1:
        peripheral = "Peripheral Cue: Yes."
    else:
        peripheral = "Peripheral Cue: No."

    return "{} {} {}".format(central, peripheral, row['Processed_Text'])

# Model input: the cue flags rendered as text, followed by the cleaned post.
df["Final_Text"] = df.apply(integrate_features, axis=1)

# 70% train / 30% held out, stratified on the Propaganda label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["Final_Text"].tolist(), df["Propaganda"].tolist(),
    test_size=0.3, stratify=df["Propaganda"], random_state=42
)

# Split the held-out 30% evenly into validation and test, again stratified.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5, stratify=temp_labels, random_state=42
)

# TF-IDF (unigrams + bigrams) fitted on the training split only, then applied
# unchanged to validation and test.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)
test_features = vectorizer.transform(test_texts)

# 'balanced' class weights computed from the training labels.
train_labels_np = np.array(train_labels)
class_labels = np.unique(train_labels_np)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=train_labels_np
)
# Key the weights by the actual class label rather than by position, so the
# mapping stays correct even if the labels are not exactly 0..n-1.
class_weights_dict = dict(zip(class_labels.tolist(), class_weights.tolist()))

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight=class_weights_dict,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(train_features, train_labels)

val_preds = rf_model.predict(val_features)
test_preds = rf_model.predict(test_features)

def compute_rf_metrics(preds, labels):
    """Return accuracy plus the plain mean of per-class precision, recall and F1.

    Args:
        preds: Predicted labels.
        labels: True labels.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
    """
    metrics = {"accuracy": accuracy_score(labels, preds)}
    per_class_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for name, scorer in per_class_scorers:
        # average=None gives one score per class; .mean() averages them.
        metrics[name] = scorer(labels, preds, average=None).mean()
    return metrics

val_metrics = compute_rf_metrics(val_preds, val_labels)
test_metrics = compute_rf_metrics(test_preds, test_labels)

print("\nValidation Set Evaluation:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nTest Set Evaluation:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nClassification Report:")
print(classification_report(test_labels, test_preds))

joblib.dump(rf_model, "random_forest_propaganda.pkl")
# Use a dedicated file name: "tfidf_vectorizer.pkl" already holds the cue
# classifier's vectorizer, and overwriting it would pair that SVM with a
# vectorizer it was not trained on the next time both are reloaded.
joblib.dump(vectorizer, "tfidf_vectorizer_propaganda.pkl")
print("✅ Random Forest Model Training Complete! Model and Vectorizer Saved.")


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from evaluate import load

# Load the labelled comments; the code below reads its "Comment Text",
# "Central Cue", "Peripheral Cue" and "Propaganda" columns.
file_path = "full_labeled_comments_data.xlsx"
df = pd.read_excel(file_path)

def preprocess_text(text):
    """Apply light Arabic normalization to one string.

    Drops every character that is neither a word character nor whitespace,
    unifies alef variants, maps ta marbuta to ha and ya to alef maqsura,
    removes the marks in U+064B–U+065F, and trims surrounding whitespace.
    """
    import re

    # (pattern, replacement) pairs; they must run in exactly this order.
    rules = (
        (r'[^\w\s]', ''),
        (r'[إأآا]', 'ا'),
        (r'ة', 'ه'),
        (r'ي', 'ى'),
        (r'[\u064B-\u065F]', ''),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Cast to str first so non-string cells do not break the regex-based cleaning.
df["Processed_Text"] = df["Comment Text"].astype(str).apply(preprocess_text)

def integrate_features(row):
    """Prefix the processed text with Yes/No sentences for the two cue flags.

    A flag counts as "Yes" only when its value equals 1.
    """
    if row["Central Cue"] == 1:
        central = "Central Cue: Yes."
    else:
        central = "Central Cue: No."

    if row["Peripheral Cue"] == 1:
        peripheral = "Peripheral Cue: Yes."
    else:
        peripheral = "Peripheral Cue: No."

    return "{} {} {}".format(central, peripheral, row['Processed_Text'])

# Model input: the cue flags rendered as text, followed by the cleaned comment.
df["Final_Text"] = df.apply(integrate_features, axis=1)

# 70% train / 30% held out, stratified on the Propaganda label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["Final_Text"].tolist(), df["Propaganda"].tolist(),
    test_size=0.3, stratify=df["Propaganda"], random_state=42
)

# Split the held-out 30% evenly into validation and test, again stratified.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5, stratify=temp_labels, random_state=42
)

# AraBERT checkpoint, loaded with a 2-label classification head.
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

def tokenize_function(texts):
    """Encode texts with the module-level tokenizer as padded, truncated PyTorch tensors (max 512 tokens)."""
    options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
    return tokenizer(texts, **options)

# Encode each split in a single tokenizer call.
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings = tokenize_function(test_texts)

class PropagandaDataset(Dataset):
    """Pairs pre-computed encodings with their labels, one sample per index."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable tensor; labels: sequence.
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every field at idx plus "labels"."""
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx].clone().detach()
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = PropagandaDataset(train_encodings, train_labels)
val_dataset = PropagandaDataset(val_encodings, val_labels)
test_dataset = PropagandaDataset(test_encodings, test_labels)

# Evaluate and checkpoint once per epoch; the checkpoint with the lowest
# eval_loss is reloaded at the end of training.
# NOTE(review): output_dir "./results" is also used by the posts training
# cell, so checkpoints from both runs presumably share one folder — confirm
# this is intended.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    eval_steps=50,  # NOTE(review): presumably unused with an "epoch" strategy — confirm
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=0.00001,
    weight_decay=0.01,
    warmup_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    remove_unused_columns=False,
)

# 'balanced' class weights computed from the training labels only, then moved
# to the training device for use in the custom loss.
train_labels_np = np.array(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels_np),
    y=train_labels_np
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level class_weights."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch dict passed to the model; must contain "labels".
            return_outputs: When True, also return the model outputs.
            **kwargs: Extra arguments supplied by the Trainer; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when return_outputs is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        # Flatten with the logits' own class dimension instead of a literal 2,
        # so the loss keeps working if num_labels is ever changed.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Metric objects from the `evaluate` package, used by compute_metrics.
accuracy_metric = load("accuracy")
precision_metric = load("precision")
recall_metric = load("recall")
f1_metric = load("f1")

def compute_metrics(eval_pred):
    """Return accuracy plus unweighted class-averaged precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def class_average(metric, key):
        # average=None yields one score per class; take their plain mean.
        per_class = metric.compute(predictions=predictions, references=labels, average=None)[key]
        return np.mean(per_class).item()

    scores = {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    }
    scores["precision"] = class_average(precision_metric, "precision")
    scores["recall"] = class_average(recall_metric, "recall")
    scores["f1_score"] = class_average(f1_metric, "f1")
    return scores

# Fine-tune with the weighted-loss trainer; validation metrics come from compute_metrics.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# Final evaluation on the held-out test split.
test_results = trainer.evaluate(test_dataset)
print("\n🔍 Test Set Evaluation Results:")
for metric, value in test_results.items():
    print(f"{metric}: {value:.4f}")

# Save the model and tokenizer side by side in one directory.
model.save_pretrained("arabert_propaganda_model_comments_with_cues")
tokenizer.save_pretrained("arabert_propaganda_model_comments_with_cues")

print("\n✅ Model Training and Evaluation Complete! Model Saved.")


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from evaluate import load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib
import re

# Load the labelled comments; the code below reads its "Comment Text",
# "Central Cue", "Peripheral Cue" and "Propaganda" columns.
file_path = "full_labeled_comments_data.xlsx"
df = pd.read_excel(file_path)

def preprocess_text(text):
    """Apply light Arabic normalization to one string.

    Drops every character that is neither a word character nor whitespace,
    unifies alef variants, maps ta marbuta to ha and ya to alef maqsura,
    removes the marks in U+064B–U+065F, and trims surrounding whitespace.
    """
    # (pattern, replacement) pairs; they must run in exactly this order.
    rules = (
        (r'[^\w\s]', ''),
        (r'[إأآا]', 'ا'),
        (r'ة', 'ه'),
        (r'ي', 'ى'),
        (r'[\u064B-\u065F]', ''),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Cast to str first so non-string cells do not break the regex-based cleaning.
df["Processed_Text"] = df["Comment Text"].astype(str).apply(preprocess_text)

def integrate_features(row):
    """Prefix the processed text with Yes/No sentences for the two cue flags.

    A flag counts as "Yes" only when its value equals 1.
    """
    if row["Central Cue"] == 1:
        central = "Central Cue: Yes."
    else:
        central = "Central Cue: No."

    if row["Peripheral Cue"] == 1:
        peripheral = "Peripheral Cue: Yes."
    else:
        peripheral = "Peripheral Cue: No."

    return "{} {} {}".format(central, peripheral, row['Processed_Text'])

# Model input: the cue flags rendered as text, followed by the cleaned comment.
df["Final_Text"] = df.apply(integrate_features, axis=1)

# 70% train / 30% held out, stratified on the Propaganda label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["Final_Text"].tolist(), df["Propaganda"].tolist(),
    test_size=0.3, stratify=df["Propaganda"], random_state=42
)

# Split the held-out 30% evenly into validation and test, again stratified.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5, stratify=temp_labels, random_state=42
)

# TF-IDF (unigrams + bigrams) fitted on the training split only, then applied
# unchanged to validation and test.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)
test_features = vectorizer.transform(test_texts)

# 'balanced' class weights computed from the training labels.
train_labels_np = np.array(train_labels)
class_labels = np.unique(train_labels_np)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=train_labels_np
)
# Key the weights by the actual class label rather than by position, so the
# mapping stays correct even if the labels are not exactly 0..n-1.
class_weights_dict = dict(zip(class_labels.tolist(), class_weights.tolist()))

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight=class_weights_dict,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(train_features, train_labels)

val_preds = rf_model.predict(val_features)
test_preds = rf_model.predict(test_features)

def compute_rf_metrics(preds, labels):
    """Return accuracy plus the plain mean of per-class precision, recall and F1.

    Args:
        preds: Predicted labels.
        labels: True labels.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
    """
    metrics = {"accuracy": accuracy_score(labels, preds)}
    per_class_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for name, scorer in per_class_scorers:
        # average=None gives one score per class; .mean() averages them.
        metrics[name] = scorer(labels, preds, average=None).mean()
    return metrics

# Score the validation and test predictions.
val_metrics = compute_rf_metrics(val_preds, val_labels)
test_metrics = compute_rf_metrics(test_preds, test_labels)

print("\nValidation Set Evaluation:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nTest Set Evaluation:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")

# Per-class breakdown on the test split.
print("\nClassification Report:")
print(classification_report(test_labels, test_preds))

# Persist the model together with the vectorizer it was trained on.
joblib.dump(rf_model, "random_forest_propaganda_comments.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer_comments.pkl")
print("✅ Random Forest Model Training Complete! Model and Vectorizer Saved.")

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load a fine-tuned classifier and its tokenizer from a local directory.
# NOTE(review): no cell visible in this notebook saves a directory named
# "arabert_propaganda_model" (the training cells save "..._with_cues"
# variants) — confirm this path and that the model expects plain text
# without the cue prefix.
model_path = "arabert_propaganda_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to inference mode before predicting.
model.eval()

# NOTE(review): no cell visible in this notebook writes a file with exactly
# this name — confirm the input path.
unlabeled_file = "formatted_comments_dataset.xlsx"
df_unlabeled = pd.read_excel(unlabeled_file)

def preprocess_text(text):
    """Apply light Arabic normalization to one string.

    Drops every character that is neither a word character nor whitespace,
    unifies alef variants, maps ta marbuta to ha and ya to alef maqsura,
    removes the marks in U+064B–U+065F, and trims surrounding whitespace.
    """
    import re

    # (pattern, replacement) pairs; they must run in exactly this order.
    rules = (
        (r'[^\w\s]', ''),
        (r'[إأآا]', 'ا'),
        (r'ة', 'ه'),
        (r'ي', 'ى'),
        (r'[\u064B-\u065F]', ''),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Cast to str first, as every other cell does: an empty (NaN) or numeric
# comment cell would otherwise raise TypeError inside re.sub.
df_unlabeled["Processed_Text"] = df_unlabeled["Comment Text"].astype(str).apply(preprocess_text)

def predict_propaganda(text):
    """Return the predicted class index (argmax of the logits) for one text.

    Uses the module-level tokenizer, model and device; runs without
    gradient tracking.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    return torch.argmax(logits, dim=-1).item()

# One forward pass per comment.
df_unlabeled["Predicted_Propaganda"] = df_unlabeled["Processed_Text"].apply(predict_propaganda)

predictions_output_path = "predicted_propaganda_comments_data.xlsx"
df_unlabeled.to_excel(predictions_output_path, index=False)

# Report the path that was actually written (the message used to name a
# different file).
print(f"✅ Prediction Complete! Saved results to '{predictions_output_path}'")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

posts_file_path = "full_features_labeled_posts.xlsx"
comments_file_path = "full_labeled_comments_data.xlsx"

df_posts = pd.read_excel(posts_file_path)
df_comments = pd.read_excel(comments_file_path)

# Predictors: the two cue flags. Target: the propaganda label.
X_posts = df_posts[["Central Cue", "Peripheral Cue"]]
y_posts = df_posts["Propaganda"]

X_comments = df_comments[["Central Cue", "Peripheral Cue"]]
y_comments = df_comments["Propaganda"]

# The same scaler object is re-fitted for each dataset; after this cell it
# holds the statistics of the comments data only.
scaler = StandardScaler()
X_posts_scaled = scaler.fit_transform(X_posts)
X_comments_scaled = scaler.fit_transform(X_comments)

log_reg_posts = LogisticRegression()
log_reg_comments = LogisticRegression()

log_reg_posts.fit(X_posts_scaled, y_posts)
log_reg_comments.fit(X_comments_scaled, y_comments)

# coef_[0] is the coefficient row of the fitted binary model, one value per feature.
log_reg_coeff_posts = log_reg_posts.coef_[0]
log_reg_coeff_comments = log_reg_comments.coef_[0]

features = ["Central Cue", "Peripheral Cue"]

log_reg_values_posts = [log_reg_coeff_posts[0], log_reg_coeff_posts[1]]
log_reg_values_comments = [log_reg_coeff_comments[0], log_reg_coeff_comments[1]]

# Shared symmetric y-range for both panels: ±0.5 as before, widened only when
# a coefficient would otherwise be clipped by the fixed limits.
largest_coefficient = float(np.max(np.abs(log_reg_values_posts + log_reg_values_comments)))
y_limit = max(0.5, 1.1 * largest_coefficient)

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.bar(features, log_reg_values_posts, color=['blue', 'orange'])
plt.title("Logistic Regression - Posts")
plt.ylabel("Coefficient Value")
plt.ylim(-y_limit, y_limit)

plt.subplot(1, 2, 2)
plt.bar(features, log_reg_values_comments, color=['blue', 'orange'])
plt.title("Logistic Regression - Comments")
plt.ylim(-y_limit, y_limit)

plt.show()
