In [None]:
!pip install datasets scikit-learn lime


In [None]:
!pip install graphviz

In [None]:
!pip install nltk

In [None]:
!pip install -U datasets
!pip install fsspec==2023.9.2

In [None]:
# Integer relation id -> relation name for the 19 ids (0-18) used below.
# Only ids 0 and 1 (Cause-Effect) keep an argument direction in their name; every
# other relation maps both of its ids to a single undirected name.
# NOTE(review): no other cell visible in this notebook reads this dict, and the
# e1/e2 direction assigned to ids 0 and 1 should be confirmed against the dataset's
# own class names before relying on it.
label_mapping = { 0: 'Cause-Effect-e2-e1', 1: 'Cause-Effect-e1-e2', 2: 'Component-Whole', 3: 'Component-Whole', 4: 'Content-Container', 5: 'Content-Container', 6: 'Entity-Destination', 7: 'Entity-Destination', 8: 'Entity-Origin', 9: 'Entity-Origin', 10: 'Instrument-Agency', 11: 'Instrument-Agency', 12: 'Member-Collection', 13: 'Member-Collection', 14: 'Message-Topic', 15: 'Message-Topic', 16: 'Product-Producer', 17: 'Product-Producer', 18: 'Other' }

**ANOVA Method**

In [None]:
import pandas as pd
from datasets import load_dataset
from scipy.stats import f_oneway
from collections import Counter
import numpy as np
import spacy
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK resources used by TextPreprocessor (stop-word list and tokenizer data).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') # download the punkt_tab resources to fix an error

class TextPreprocessor:
    """Collection of small text-cleaning steps plus a fixed cleaning pipeline.

    Each helper takes a string and returns the transformed string (or, for
    `tokenize_text`, a list of tokens). `preprocess` chains lower-casing,
    punctuation removal, special-character removal, whitespace collapsing and
    stop-word removal; `stem_text` and `lemmatize_text` are available but are
    not part of that pipeline.
    """

    # Translation table that deletes every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    # Anything that is neither an ASCII letter nor whitespace.
    _NON_LETTER = re.compile(r'[^A-Za-z\s]')

    def __init__(self):
        """Load the spaCy model, the English stop-word set and a Porter stemmer."""
        self.nlp = spacy.load('en_core_web_sm')
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def to_lowercase(self, input_text):
        """Return `input_text` in lower case."""
        return input_text.lower()

    def remove_punctuation(self, input_text):
        """Delete (not replace) every punctuation character."""
        return input_text.translate(self._PUNCTUATION_TABLE)

    def remove_whitespace(self, input_text):
        """Collapse whitespace runs to single spaces and trim both ends."""
        pieces = input_text.split()
        return ' '.join(pieces)

    def remove_stopwords(self, input_text):
        """Tokenize with NLTK and drop tokens found in the stop-word set."""
        kept = [token for token in nltk.word_tokenize(input_text) if token not in self.stop_words]
        return ' '.join(kept)

    def stem_text(self, input_text):
        """Tokenize with NLTK and Porter-stem every token."""
        stems = [self.stemmer.stem(token) for token in nltk.word_tokenize(input_text)]
        return ' '.join(stems)

    def lemmatize_text(self, input_text):
        """Run the spaCy pipeline and join the lemma of every token."""
        lemmas = [token.lemma_ for token in self.nlp(input_text)]
        return ' '.join(lemmas)

    def remove_special_characters(self, input_text):
        """Replace every character that is not an ASCII letter or whitespace with a space."""
        return self._NON_LETTER.sub(' ', input_text)

    def tokenize_text(self, input_text):
        """Return the NLTK word tokens of `input_text` as a list."""
        return nltk.word_tokenize(input_text)

    def preprocess(self, input_text, with_tokenize=False):
        """Apply the cleaning pipeline.

        Returns the cleaned string, or its list of tokens when
        `with_tokenize` is true.
        """
        pipeline = (
            self.to_lowercase,
            self.remove_punctuation,
            self.remove_special_characters,
            self.remove_whitespace,
            self.remove_stopwords,
        )
        for step in pipeline:
            input_text = step(input_text)
        return self.tokenize_text(input_text) if with_tokenize else input_text

# One-vs-rest ANOVA ranking of single words per class label.
class AnoVaText:
    """Rank words per label with a one-vs-rest one-way ANOVA on per-text counts.

    Parameters
    ----------
    texts : sequence of str
        Whitespace-tokenizable texts (one per sample).
    labels : sequence
        One label per text, same length as `texts`.
    num_important_words : int
        How many top-ranked words to keep for each label.

    Raises
    ------
    ValueError
        If `texts` and `labels` differ in length.
    """

    def __init__(self, texts, labels, num_important_words):
        if len(texts) != len(labels):
            raise ValueError("The number of texts and labels must be the same.")
        self.texts = texts
        self.labels = labels
        self.num_important_words = num_important_words
        self.unique_words = self.extract_unique_words(texts)
        self.word_counts = self.compute_word_counts(texts)

    def extract_unique_words(self, texts):
        """Return the vocabulary of `texts` as a sorted list.

        Sorting (instead of returning the raw set order) keeps the ranking
        reproducible: words with identical scores are ordered by the stable
        sort in `analyze`, i.e. by their position in this list.
        """
        vocabulary = set()
        for text in texts:
            vocabulary.update(text.split())
        return sorted(vocabulary)

    def compute_word_counts(self, texts):
        """Return one Counter of whitespace-separated tokens per text."""
        return [Counter(text.split()) for text in texts]

    def analyze(self):
        """Score every word for every label and keep the top ones.

        Returns a dict mapping each label (in order of first appearance) to a
        list of `(word, (f_statistic, variance_within_label))` tuples, sorted
        by that score tuple in descending order and truncated to
        `num_important_words`. A word whose count has zero variance inside the
        label group or inside the remaining texts gets an F statistic of 0.
        """
        important_words = {}
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result (and any export built from it) has a stable label order.
        for label in dict.fromkeys(self.labels):
            label_indices = [i for i, lbl in enumerate(self.labels) if lbl == label]
            other_indices = [i for i, lbl in enumerate(self.labels) if lbl != label]
            label_word_counts = [self.word_counts[i] for i in label_indices]
            other_word_counts = [self.word_counts[i] for i in other_indices]

            word_scores = {}
            for word in self.unique_words:
                label_word_frequencies = [count.get(word, 0) for count in label_word_counts]
                other_word_frequencies = [count.get(word, 0) for count in other_word_counts]

                # Computed once and reused both for the guard and for the score.
                label_variance = np.var(label_word_frequencies)
                if label_variance > 0 and np.var(other_word_frequencies) > 0:
                    f_stat, _ = f_oneway(label_word_frequencies, other_word_frequencies)
                    word_scores[word] = (f_stat, label_variance)
                else:
                    word_scores[word] = (0, label_variance)

            sorted_words = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
            important_words[label] = sorted_words[:self.num_important_words]

        return important_words

# Load the dataset
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
print(ds)
train_data, test_data = ds["train"], ds["test"]
print("length ", len(train_data))

# Preprocess the text
txt_prs = TextPreprocessor()
text_list, target_list = [], []
for row in train_data:
    sentence = row["sentence"]

    # Remove an `and <word>` that directly follows `</e1>`
    sentence = re.sub(r'</e1>\s*and\s*\w+', '</e1>', sentence)

    # Capture whatever lies between the closing </e1> tag and the opening <e2> tag
    pattern = r"<e1>.*?</e1>(.*?)<e2>.*?</e2>"

    match = re.search(pattern, sentence)
    if match:
        text = match.group(1).strip()  # text between the tags, surrounding whitespace removed
        label = row["relation"]
        text_list.append(txt_prs.preprocess(text, with_tokenize=False))  # clean the text
        # Binary target: relation ids 0 and 1 become "cause-effect", everything else "others"
        if label == 0 or label == 1:
            target_list.append("cause-effect")
        else:
            target_list.append("others")

# Count the samples in each class
cause_effect_count = target_list.count("cause-effect")
others_count = target_list.count("others")

print(f"تعداد نمونه‌های cause-effect: {cause_effect_count}")
print(f"تعداد نمونه‌های others: {others_count}")

# Build the ANOVA analyzer (AnoVaText) and keep the 50 top-ranked words per class
analyzer = AnoVaText(text_list, target_list, 50)
important_words = analyzer.analyze()

# Export to an Excel workbook, one sheet per class label
with pd.ExcelWriter('important_words.xlsx') as writer:
    for key, value in important_words.items():
        df = pd.DataFrame([(word, score[0], score[1]) for word, score in value], columns=['Word', 'Score', 'Variance'])
        df.to_excel(writer, sheet_name=key, index=False)

print("Important words have been written to the Excel file 'important_words.xlsx'.")

In [None]:
import pandas as pd
from datasets import load_dataset
from scipy.stats import f_oneway
from collections import Counter
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

# One-vs-rest ANOVA ranking of bigram tokens per class label.
class AnoVaTextBigram:
    """Rank bigram tokens per label with a one-vs-rest one-way ANOVA.

    Parameters
    ----------
    bigram_texts : sequence of str
        One string per sample in which every whitespace-separated token is
        treated as one bigram.
    labels : sequence
        One label per text, same length as `bigram_texts`.
    num_important_words : int
        How many top-ranked bigrams to keep for each label.

    Raises
    ------
    ValueError
        If `bigram_texts` and `labels` differ in length.
    """

    def __init__(self, bigram_texts, labels, num_important_words):
        if len(bigram_texts) != len(labels):
            raise ValueError("The number of texts and labels must be the same.")
        self.texts = bigram_texts
        self.labels = labels
        self.num_important_words = num_important_words
        self.unique_bigrams = self.extract_unique_bigrams(bigram_texts)
        self.bigram_counts = self.compute_bigram_counts(bigram_texts)

    def extract_unique_bigrams(self, texts):
        """Return the distinct bigram tokens of `texts` as a sorted list.

        Sorting (instead of returning the raw set order) keeps the ranking
        reproducible: bigrams with identical scores are ordered by the stable
        sort in `analyze`, i.e. by their position in this list.
        """
        vocabulary = set()
        for text in texts:
            vocabulary.update(text.split())
        return sorted(vocabulary)

    def compute_bigram_counts(self, texts):
        """Return one Counter of whitespace-separated bigram tokens per text."""
        return [Counter(text.split()) for text in texts]

    def analyze(self):
        """Score every bigram for every label and keep the top ones.

        Returns a dict mapping each label (in order of first appearance) to a
        list of `(bigram, (f_statistic, variance_within_label))` tuples, sorted
        by that score tuple in descending order and truncated to
        `num_important_words`. A bigram whose count has zero variance inside
        the label group or inside the remaining texts gets an F statistic of 0.
        """
        important_bigrams = {}
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result (and any export built from it) has a stable label order.
        for label in dict.fromkeys(self.labels):
            label_indices = [i for i, lbl in enumerate(self.labels) if lbl == label]
            other_indices = [i for i, lbl in enumerate(self.labels) if lbl != label]
            label_bigram_counts = [self.bigram_counts[i] for i in label_indices]
            other_bigram_counts = [self.bigram_counts[i] for i in other_indices]

            bigram_scores = {}
            for bigram in self.unique_bigrams:
                label_bigram_frequencies = [count.get(bigram, 0) for count in label_bigram_counts]
                other_bigram_frequencies = [count.get(bigram, 0) for count in other_bigram_counts]

                # Computed once and reused both for the guard and for the score.
                label_variance = np.var(label_bigram_frequencies)
                if label_variance > 0 and np.var(other_bigram_frequencies) > 0:
                    f_stat, _ = f_oneway(label_bigram_frequencies, other_bigram_frequencies)
                    bigram_scores[bigram] = (f_stat, label_variance)
                else:
                    bigram_scores[bigram] = (0, label_variance)

            sorted_bigrams = sorted(bigram_scores.items(), key=lambda item: item[1], reverse=True)
            important_bigrams[label] = sorted_bigrams[:self.num_important_words]

        return important_bigrams

# Load the dataset and keep handles to its train and test splits
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
print(ds)
train_data, test_data = ds["train"], ds["test"]

# پیش‌پردازش متن
def preprocess_text(text):
    """Remove digit runs, then every character that is neither a word character nor whitespace."""
    without_digits = re.sub(r'\d+', '', text)
    return re.sub(r'[^\w\s]', '', without_digits)

text_list, target_list = [], []
for row in train_data:
    sentence = row["sentence"]

    # Remove an `and <word>` that directly follows `</e1>`
    sentence = re.sub(r'</e1>\s*and\s*\w+', '</e1>', sentence)

    # Capture whatever lies between the closing </e1> tag and the opening <e2> tag
    pattern = r"<e1>.*?</e1>(.*?)<e2>.*?</e2>"

    match = re.search(pattern, sentence)
    if match:
        text = match.group(1).strip()  # text between the tags, surrounding whitespace removed
        label = row["relation"]
        text_list.append(preprocess_text(text))  # clean the text

        # Binary target: relation ids 0 and 1 become "cause-effect", everything else "others"
        if label == 0 or label == 1:
            target_list.append("cause-effect")
        else:
            target_list.append("others")

# Build bigrams from the strings; each bigram's space is replaced by "_" so that
# one bigram is one whitespace-separated token for AnoVaTextBigram
vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', analyzer='word')
bigram_features = vectorizer.fit_transform(text_list)
bigram_terms = vectorizer.get_feature_names_out()
bigram_terms = [term.replace(' ', '_') for term in bigram_terms]

# Build a DataFrame of bigram strings; row.nonzero() lists each bigram that occurs
# in a text once, so repeated occurrences within one text are not kept
bigram_data = [' '.join([bigram_terms[i] for i in row.nonzero()[1]]) for row in bigram_features]
df = pd.DataFrame({'bigram_text': bigram_data, 'label': target_list})

# Build the ANOVA analyzer (AnoVaTextBigram) and keep the 50 top-ranked bigrams per class
analyzer = AnoVaTextBigram(df['bigram_text'], df['label'], 50)
important_bigrams = analyzer.analyze()

# Export to an Excel workbook
with pd.ExcelWriter('important_bigrams.xlsx') as writer:
    for key, value in important_bigrams.items():
        # Turn the ranking into a DataFrame (this rebinds `df` defined above)
        df = pd.DataFrame([(bigram, score[0], score[1]) for bigram, score in value], columns=['Bigram', 'Score', 'Variance'])
        # Write the DataFrame to its own sheet, named after the class label
        df.to_excel(writer, sheet_name=key, index=False)

print("Important bigrams have been written to the Excel file 'important_bigrams.xlsx'.")

In [None]:
import pandas as pd

# Merge the unigram and bigram ANOVA rankings into one table per class label.
def _ranking_frame(rankings, class_name):
    """Return a Term/Score/Variance frame for `class_name`.

    `rankings` maps a class label to a list of `(term, (score, variance))`
    entries. A label that is missing from `rankings` yields an empty frame
    with the same three columns.
    """
    if class_name not in rankings:
        return pd.DataFrame(columns=['Term', 'Score', 'Variance'])
    frame = pd.DataFrame(rankings[class_name], columns=['Term', 'Values'])
    # Split the (score, variance) tuples into two real columns.
    frame[['Score', 'Variance']] = pd.DataFrame(frame['Values'].tolist(), index=frame.index)
    return frame.drop(columns=['Values'])


# Class labels to export
labels = ['cause-effect', 'others']

combined_results = {}
for key in labels:
    # Unigram rows first, then bigram rows, re-indexed from zero
    stacked = pd.concat(
        [_ranking_frame(important_words, key), _ranking_frame(important_bigrams, key)],
        ignore_index=True,
    )
    # Highest score first
    combined_results[key] = stacked.sort_values(by='Score', ascending=False)

# Save the results to one workbook with a separate sheet per class label
with pd.ExcelWriter('important_words_and_bigrams_combined.xlsx') as writer:
    for sheet_label, frame in combined_results.items():
        frame.to_excel(writer, sheet_name=sheet_label, index=False)

print("Important words and bigrams have been written to the Excel file 'important_words_and_bigrams_combined.xlsx'.")

In [None]:
import pandas as pd
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import re
import numpy as np
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
import seaborn as sns

class TextPreprocessor:
    """Minimal cleaner that strips digits and non-word, non-space characters.

    NOTE(review): this re-uses the name of the larger TextPreprocessor defined in
    the ANOVA cell and replaces it once this cell has run.
    """

    _DIGIT_RUN = re.compile(r'\d+')
    _NOT_WORD_OR_SPACE = re.compile(r'[^\w\s]')

    def __init__(self):
        """Nothing to initialise; the cleaner keeps no state."""

    def preprocess_text(self, text):
        """Return `text` with digit runs removed, then all non-word, non-space characters removed."""
        digits_removed = self._DIGIT_RUN.sub('', text)
        return self._NOT_WORD_OR_SPACE.sub('', digits_removed)

# Step 1: load the dataset
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
train_data, test_data = ds["train"], ds["test"]

# Step 2: read the important terms from the Excel workbook
# NOTE(review): 'ACORD_Data_040613.xlsx' is not written by any cell visible in this
# notebook — presumably it is assembled by hand from the exported workbooks; confirm.
cause_effect_df = pd.read_excel('ACORD_Data_040613.xlsx', sheet_name='anova terms')
important_terms = set(cause_effect_df['Term'].tolist())

# Step 3: prepare the data
txt_prs = TextPreprocessor()

def extract_entities(sentence):
    """Return the texts wrapped by the <e1> and <e2> tags as a `(entity1, entity2)` pair.

    When either tag pair cannot be found, `("", "")` is returned instead.
    """
    first = re.search(r"<e1>(.*?)</e1>", sentence)
    second = re.search(r"<e2>(.*?)</e2>", sentence)
    if first is None or second is None:
        return "", ""
    return first.group(1), second.group(1)

def get_terms_between_entities(sentence):
    """Return the stripped text found between `</e1>` and `<e2>`.

    An empty string is returned when either entity is missing or empty, or when
    the `<e1>…</e1> … <e2>…</e2>` pattern does not occur in that order.
    """
    entity1, entity2 = extract_entities(sentence)
    if not (entity1 and entity2):
        return ""
    between = re.search(r'<e1>.*?</e1>(.*?)<e2>.*?</e2>', sentence)
    return between.group(1).strip() if between else ""

# Build unique (text-between-entities, binary label) pairs for each split, then
# down-sample the larger class so both classes have the same number of rows.
#
# Reproducibility fix: the pairs used to be materialised with list(set(...)) and
# truncated with [:n]. Set order depends on string hashing, which is randomised
# per interpreter run, so a different subset was kept on every run even though
# np.random.seed(42) was set. The pairs are now sorted and the kept rows are drawn
# with the seeded NumPy generator.
def _unique_text_label_pairs(split):
    """Return the sorted, de-duplicated (text, label) pairs of one dataset split.

    Relation ids 0 and 1 are labelled 'cause-effect', every other id 'others'.
    Pairs whose text (or label) is empty are dropped.
    """
    pairs = {
        (get_terms_between_entities(row['sentence']),
         'cause-effect' if label == 0 or label == 1 else 'others')
        for row, label in zip(split, split['relation'])
    }
    return sorted((text, label) for text, label in pairs if text and label)


def _random_subset(pairs, size):
    """Pick `size` pairs without replacement using NumPy's global (seeded) generator."""
    keep = np.random.permutation(len(pairs))[:size]
    return [pairs[i] for i in keep]


# Unique pairs for the training data
train_texts_labels = _unique_text_label_pairs(train_data)
train_texts, train_labels = zip(*train_texts_labels)

# Unique pairs for the test data
test_texts_labels = _unique_text_label_pairs(test_data)
test_texts, test_labels = zip(*test_texts_labels)

# Balance the training data
np.random.seed(42)  # fixed random state for the training split
cause_effect_train = [(text, label) for text, label in train_texts_labels if label == 'cause-effect']
others_train = [(text, label) for text, label in train_texts_labels if label == 'others']

min_train_samples = min(len(cause_effect_train), len(others_train))
balanced_train_data = (
    _random_subset(cause_effect_train, min_train_samples)
    + _random_subset(others_train, min_train_samples)
)
np.random.shuffle(balanced_train_data)  # interleave the two classes after balancing

# Balance the test data
np.random.seed(42)  # fixed random state for the test split
cause_effect_test = [(text, label) for text, label in test_texts_labels if label == 'cause-effect']
others_test = [(text, label) for text, label in test_texts_labels if label == 'others']

min_test_samples = min(len(cause_effect_test), len(others_test))
balanced_test_data = (
    _random_subset(cause_effect_test, min_test_samples)
    + _random_subset(others_test, min_test_samples)
)
np.random.shuffle(balanced_test_data)  # interleave the two classes after balancing

# Separate texts and labels
balanced_train_texts, balanced_train_labels = zip(*balanced_train_data)
balanced_test_texts, balanced_test_labels = zip(*balanced_test_data)

# Convert to DataFrames
train_data_df = pd.DataFrame({
    'Text': balanced_train_texts,
    'Label': balanced_train_labels
})

test_data_df = pd.DataFrame({
    'Text': balanced_test_texts,
    'Label': balanced_test_labels
})

# Drop rows with missing text or label
train_data_df = train_data_df.dropna(subset=['Text', 'Label'])
test_data_df = test_data_df.dropna(subset=['Text', 'Label'])

# Save both splits to one Excel workbook, one sheet each
with pd.ExcelWriter('balanced_data.xlsx') as writer:
    train_data_df.to_excel(writer, sheet_name='Train Data', index=False)
    test_data_df.to_excel(writer, sheet_name='Test Data', index=False)

**Ensemble Model - First Try**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from lime.lime_text import LimeTextExplainer
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier



# Load the data from the Excel workbook. The three sheets are parsed in a single
# read and every column is taken from that one result.
_workbook_sheets = pd.read_excel(
    'ACORD_Data_040613.xlsx',
    sheet_name=['balanced train data', 'balanced test data', 'anova terms'],
)

_balanced_train_sheet = _workbook_sheets['balanced train data']
balanced_train_texts = _balanced_train_sheet['Text'].tolist()
balanced_train_labels = _balanced_train_sheet['Label'].tolist()

# Test split
balanced_test_df = _workbook_sheets['balanced test data']
balanced_test_texts = balanced_test_df['Text'].tolist()
balanced_test_labels = balanced_test_df['Label'].tolist()

important_terms = set(_workbook_sheets['anova terms']['Term'].tolist())

# Count features restricted to the important terms (unigrams and bigrams)
# NOTE(review): bigrams produced by the ANOVA cell are joined with "_" while this
# vectorizer builds space-joined n-grams — confirm how bigrams are written in the
# 'anova terms' sheet.
vectorizer = CountVectorizer(vocabulary=important_terms, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(balanced_train_texts).toarray()
X_test = vectorizer.transform(balanced_test_texts).toarray()

# Labels as numpy arrays
y_train, y_test = np.array(balanced_train_labels), np.array(balanced_test_labels)

# Base models with default hyper-parameters (only the random state is fixed)
logreg = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Grid-search parameter grid for each model, keyed by a short model id
param_grid = {
    'logreg': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'rf': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'gb': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5]
    },
}

# Models to tune: display name -> (estimator, its parameter grid)
models = {
    "Logistic Regression": (logreg, param_grid['logreg']),
    "Random Forest": (rf, param_grid['rf']),
    "Gradient Boosting": (gb, param_grid['gb']),
}


# Tune every model with a 5-fold grid search on the training data, then report
# the tuned model's weighted metrics on the test data
best_models = {}
for name, (model, params) in models.items():
    print(f"\nPerforming Grid Search for {name}...")

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Keep the best estimator for the ensemble built afterwards
    best_models[name] = grid_search.best_estimator_

    # Evaluate the best estimator on the test split
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    print(f"\nBest parameters for {name}:")
    print(grid_search.best_params_)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Build the ensemble from the best estimators found above
# (runs after the grid search has finished for every model)
ensemble = VotingClassifier(
    estimators=[
        ('lr', best_models['Logistic Regression']),
        ('rf', best_models['Random Forest']),
        ('gb', best_models['Gradient Boosting']),
    ],
    voting='soft',
    weights=[1, 1.2, 1.2]  # voting weights, in the same order as `estimators`
)

# Train the ensemble
ensemble.fit(X_train, y_train)

# Predict and evaluate the ensemble on the test split
predictions = ensemble.predict(X_test)
probabilities = ensemble.predict_proba(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

# Print the evaluation results
print("\nEnsemble Model Performance:")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Analyse false positives with LIME: test rows predicted 'cause-effect' whose true label differs
false_positives = [(text, true_label, pred_label) for text, true_label, pred_label in zip(balanced_test_texts, y_test, predictions) if true_label != pred_label and pred_label == 'cause-effect']

# NOTE(review): explain_instance is called without `labels=`, so LIME explains its
# default label — presumably class index 1 ('others'); confirm that is the class
# that should be explained for false positives.
lime_explainer = LimeTextExplainer(class_names=['cause-effect', 'others'])
for i, (text, true_label, pred_label) in enumerate(false_positives[:5]):  # limited to 5 examples for display
    print(f"\nFalse Positive Example {i+1}:")
    print(f"Text: {text}")
    print(f"True Label: {true_label}, Predicted Label: {pred_label}")
    exp = lime_explainer.explain_instance(text, lambda x: ensemble.predict_proba(vectorizer.transform(x)), num_features=6)
    exp.show_in_notebook(text=True)

# Save the predictions to an Excel file; the probability column holds the highest
# class probability of each row
results_df = pd.DataFrame({
    'Sentence': balanced_test_texts,
    'True Label': y_test,
    'Predicted Label': predictions,
    'Prediction Probability': probabilities.max(axis=1)
})

with pd.ExcelWriter('improved_prediction_results.xlsx') as writer:
    results_df.to_excel(writer, index=False)

# Plot the confusion matrix
# NOTE(review): the tick labels are hard-coded; they assume confusion_matrix orders
# the classes as 'cause-effect', 'others' — confirm against the label values.
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['cause-effect', 'others'],
            yticklabels=['cause-effect', 'others'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Ensemble Model')
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

**Ensemble Model - Second Try**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Vectorize with the important-term vocabulary (unigrams and bigrams).
# This cell relies on names defined by the previous cell (CountVectorizer,
# important_terms, balanced_train_texts, balanced_test_texts, y_train, y_test).
vectorizer = CountVectorizer(
    vocabulary=important_terms,
    ngram_range=(1, 2),
    #sublinear_tf=True
)
X_train = vectorizer.fit_transform(balanced_train_texts).toarray()
X_test = vectorizer.transform(balanced_test_texts).toarray()

# Standardized copies of the features for the models that need scaling;
# the scaler is fitted on the training features only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models to compare, all with default hyper-parameters
models_to_test = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Evaluation
results = []
for name, model in models_to_test.items():
    print(f"Training {name}...")
    # Use the scaled features only for models whose name contains "Scaled"
    # NOTE(review): no key of models_to_test contains "Scaled", so this branch is
    # never taken and every model is trained on the unscaled features — confirm
    # whether a scaled variant was meant to be listed.
    if "Scaled" in name:
        X_tr = X_train_scaled
        X_te = X_test_scaled
    else:
        X_tr = X_train
        X_te = X_test

    model.fit(X_tr, y_train)
    preds = model.predict(X_te)

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average='weighted')
    rec = recall_score(y_test, preds, average='weighted')
    f1 = f1_score(y_test, preds, average='weighted')

    results.append((name, acc, prec, rec, f1))

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, preds))

    # Confusion matrix for this model
    conf = confusion_matrix(y_test, preds)
    plt.figure(figsize=(6, 5))
    sns.heatmap(conf, annot=True, fmt='d', cmap='Blues',
                xticklabels=['cause-effect', 'others'],
                yticklabels=['cause-effect', 'others'])
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()

# Results table, one row per model
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"])
print("\n🔬 Model Comparison with important_terms Vocabulary:")
print(results_df)

# Bar chart of the four metrics per model (y axis limited to 0.5-1.0)
results_df.set_index("Model").plot(kind='bar', figsize=(12, 6), ylim=(0.5, 1.0), colormap='tab10', grid=True)
plt.title("📊 Performance Comparison (Preserving important_terms)")
plt.ylabel("Score")
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.ensemble import VotingClassifier

# Base models for the ensemble, all with default hyper-parameters
ensemble_models = [
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
]

# Soft-voting classifier over the base models; it is trained below on the
# standardized features, which all three base models therefore receive
ensemble = VotingClassifier(
    estimators=ensemble_models,
    voting='soft',
    n_jobs=-1
)

# Train on the standardized features
ensemble.fit(X_train_scaled, y_train)  # because some of the models need scaled inputs

# Predict and evaluate
ensemble_preds = ensemble.predict(X_test_scaled)
ensemble_probs = ensemble.predict_proba(X_test_scaled)
acc = accuracy_score(y_test, ensemble_preds)
prec = precision_score(y_test, ensemble_preds, average='weighted')
rec = recall_score(y_test, ensemble_preds, average='weighted')
f1 = f1_score(y_test, ensemble_preds, average='weighted')

# Print the results
print("\n🔗 Voting Ensemble Performance (Preserving important_terms):")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Classification Report:")
print(classification_report(y_test, ensemble_preds))

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

# Base learners
base_estimators = [
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
]

# Final (meta) model: a multi-layer perceptron with hidden layers of 64 and 32 units
meta_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', max_iter=300, random_state=42)

# Stacking classifier with the neural network as final estimator
stacking_nn = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    passthrough=True,
    cv=5
)

# Train on the unscaled features
stacking_nn.fit(X_train, y_train)

# Evaluate
nn_preds = stacking_nn.predict(X_test)
nn_probs = stacking_nn.predict_proba(X_test)

# Metrics
print("\n🧠 Stacking with Neural Network Performance:")
print(f"Accuracy: {accuracy_score(y_test, nn_preds):.4f}")
print(f"Precision: {precision_score(y_test, nn_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, nn_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, nn_preds, average='weighted'):.4f}")
print("Classification Report:")
print(classification_report(y_test, nn_preds))

**Random Model**

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the balanced splits from the Excel workbook (both sheets parsed in one read)
_baseline_sheets = pd.read_excel(
    'ACORD_Data_040613.xlsx',
    sheet_name=['balanced train data', 'balanced test data'],
)
balanced_train_texts = _baseline_sheets['balanced train data']['Text'].tolist()
balanced_train_labels = _baseline_sheets['balanced train data']['Label'].tolist()
balanced_test_texts = _baseline_sheets['balanced test data']['Text'].tolist()
balanced_test_labels = _baseline_sheets['balanced test data']['Label'].tolist()

# Labels as numpy arrays
y_train = np.array(balanced_train_labels)
y_test = np.array(balanced_test_labels)

# Random predictions: one class drawn uniformly per test row.
# The draw uses a seeded generator so the baseline metrics are identical on every
# run (the previous version drew from the unseeded `random` module).
unique_classes = np.unique(y_train)
_baseline_rng = np.random.RandomState(42)
random_predictions = _baseline_rng.choice(unique_classes, size=len(y_test))

# Evaluate the random predictions against the test labels
accuracy = accuracy_score(y_test, random_predictions)
precision = precision_score(y_test, random_predictions, average='weighted')
recall = recall_score(y_test, random_predictions, average='weighted')
f1 = f1_score(y_test, random_predictions, average='weighted')
conf_matrix = confusion_matrix(y_test, random_predictions)
class_report = classification_report(y_test, random_predictions)

# Print the evaluation results
print("=== Random Baseline Results ===")
print(f"Test Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Save the predictions to an Excel file
results_df = pd.DataFrame({
    'Sentence': balanced_test_texts,
    'True Label': y_test,
    'Predicted Label': random_predictions,
    'Prediction Probability': 0.5  # constant 50% probability recorded for the random prediction
})

with pd.ExcelWriter('random_baseline_results.xlsx') as writer:
    results_df.to_excel(writer, index=False)

# Plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['cause-effect', 'others'],
            yticklabels=['cause-effect', 'others'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Random Baseline')
plt.show()

**Rule-Based Model**

In [None]:
import pandas as pd
import numpy as np
import random
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from lime.lime_text import LimeTextExplainer

import re

# Load the balanced splits and the ANOVA term list from the Excel workbook
# (all three sheets parsed in one read)
_rule_sheets = pd.read_excel(
    'ACORD_Data_040613.xlsx',
    sheet_name=['balanced train data', 'balanced test data', 'anova terms'],
)
balanced_train_texts = _rule_sheets['balanced train data']['Text'].tolist()
balanced_train_labels = _rule_sheets['balanced train data']['Label'].tolist()
balanced_test_texts = _rule_sheets['balanced test data']['Text'].tolist()
balanced_test_labels = _rule_sheets['balanced test data']['Label'].tolist()
important_terms = set(_rule_sheets['anova terms']['Term'].tolist())

# Labels as numpy arrays
y_train = np.array(balanced_train_labels)
y_test = np.array(balanced_test_labels)


def _compile_term_matcher(terms):
    """Build one regex that matches any of `terms` as a whole word or phrase.

    Terms are lower-cased and split on whitespace/underscores, so a bigram
    written as "caused_by" also matches the text "caused by". Non-string
    entries (for example empty spreadsheet cells) are skipped. Returns None
    when no usable term is left.
    """
    alternatives = set()
    for term in terms:
        if not isinstance(term, str):
            continue
        tokens = [re.escape(token) for token in re.split(r'[\s_]+', term.strip().lower()) if token]
        if tokens:
            alternatives.add(r'[\s_]+'.join(tokens))
    if not alternatives:
        return None
    # Lookarounds instead of plain substring search: a term must not be glued to
    # other word characters (e.g. "led" must not fire inside "called").
    return re.compile(r'(?<!\w)(?:' + '|'.join(sorted(alternatives)) + r')(?!\w)')


# ==============================================
# 1. Simple keyword (rule-based) model
# ==============================================
print("\n=== Rule-Based Baseline ===")
_term_matcher = _compile_term_matcher(important_terms)
rule_based_predictions = [
    "cause-effect"
    if _term_matcher is not None and _term_matcher.search(str(text).lower())
    else "others"
    for text in balanced_test_texts
]

# Evaluate the rule-based predictions against the test labels
accuracy = accuracy_score(y_test, rule_based_predictions)
precision = precision_score(y_test, rule_based_predictions, average='weighted')
recall = recall_score(y_test, rule_based_predictions, average='weighted')
f1 = f1_score(y_test, rule_based_predictions, average='weighted')
conf_matrix = confusion_matrix(y_test, rule_based_predictions)

print(f"Test Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)



# Plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['cause-effect', 'others'], yticklabels=['cause-effect', 'others'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

**Graph**

In [None]:
import pandas as pd
import inflect

# Load the Excel file
# NOTE(review): 'all_sentences_with_original.xlsx' is not written by any cell
# visible in this notebook — confirm where it comes from.
excel_file = 'all_sentences_with_original.xlsx'
df = pd.read_excel(excel_file)

# Copy the entity columns ('e1', 'e2') into the working columns used below
df['Entity1'] = df['e1']
df['Entity2'] = df['e2']

# Filter out rows where Entity1 or Entity2 is empty
df = df.dropna(subset=['Entity1', 'Entity2'])

# Initialize inflect engine (used by normalize_entity to singularize words)
p = inflect.engine()

# Normalize single-word entities to singular form
def normalize_entity(entity):
    """Return `entity` as a stripped string, singularized when it is exactly one word.

    Multi-word entities are returned unchanged (apart from stripping), and so
    are empty / whitespace-only values, which are never handed to inflect.
    If inflect does not return a string for the word, the stripped entity is
    kept as is.
    """
    entity = str(entity).strip()  # coerce to string and drop surrounding whitespace
    # Only exactly-one-word entities are singularized. Zero words (an empty
    # string) is excluded as well so that "" never reaches inflect.
    # NOTE(review): recent inflect releases appear to reject empty words — confirm.
    if len(entity.split()) != 1:
        return entity
    singular = p.singular_noun(entity)
    return singular if isinstance(singular, str) else entity  # replace only when a valid singular came back

# Apply normalization to both entity columns
df['Entity1'] = df['Entity1'].apply(normalize_entity)
df['Entity2'] = df['Entity2'].apply(normalize_entity)

# Select relevant columns for Cytoscape and rename columns to source and target
cytoscape_df = df[['Entity1', 'Entity2', 'Predicted Label']]
cytoscape_df.columns = ['source', 'target', 'label']

# Filter out rows where label is 'others'
cytoscape_df = cytoscape_df[cytoscape_df['label'] != 'others']

# Save the data to a CSV file
cytoscape_df.to_csv('cytoscape_data_causal_V040520.csv', index=False)

**False Positives and False Negatives**

In [None]:
import pandas as pd

# Read the Excel file with the per-sentence predictions
results_df = pd.read_excel('all_sentences_with_original.xlsx')

# False positives: predicted 'cause-effect' while the true label is 'others'.
# False negatives: true label 'cause-effect' while predicted 'others'.
false_positives = results_df[
    (results_df['Predicted Label'] == 'cause-effect') &
    (results_df['True Label'] == 'others')
]
print(f"تعداد کل موارد FP شناسایی شده: {len(false_positives)}")
false_negatives = results_df[
    (results_df['True Label'] == 'cause-effect') &
    (results_df['Predicted Label'] == 'others')
]

# Create a new Excel workbook with separate sheets
with pd.ExcelWriter('error_analysis.xlsx') as writer:
    # All rows in the first sheet
    results_df.to_excel(writer, sheet_name='All Data', index=False)

    # False positives in the second sheet
    false_positives.to_excel(writer, sheet_name='False Positives', index=False)

    # False negatives in the third sheet
    false_negatives.to_excel(writer, sheet_name='False Negatives', index=False)

print("✅ فایل error_analysis.xlsx با موفقیت ایجاد شد!")
print(f"🔍 تعداد False Positives: {len(false_positives)}")
print(f"🔍 تعداد False Negatives: {len(false_negatives)}")