In [None]:
# System libraries
import re
import os
import unicodedata
import itertools as itt

# Library for file manipulation
import pandas as pd
import numpy as np

# Data visualization
import plotly
import seaborn as sns
import matplotlib.pylab as pl
import matplotlib as m
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from matplotlib import pyplot as plt
from IPython.display import SVG

# Configuration for graphs width and layout of graphs
sns.set_theme(style='whitegrid')
palette='viridis'

# Warnings remove warnings
import warnings
warnings.filterwarnings("ignore")

# Python version
from platform import python_version
print('Python version in this Jupyter Notebook:', python_version())

# Load library versions
import watermark

# Library versions
%reload_ext watermark
%watermark -a "Library versions" --iversions

In [None]:
# Load the full dataset from the hard-coded local path into `df`
file_path_csv = 'E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\Complete_Updated_Autoimmune_Disorder_Dataset.csv'
df = pd.read_csv(file_path_csv)

## 1. Exploratory Data Analysis

In [None]:
# Preview the first 5 rows
df.head()

In [None]:
# Preview the last 5 rows
df.tail()

In [None]:
df.describe()

In [None]:
# Rows and columns
df.shape

In [None]:
# Data type
df.dtypes

In [None]:
df['Diagnosis'].unique()

##### Tablodaki sütun isimlerinin açıklamaları şu şekildedir:

**Patient_ID:** Hastanın kimlik numarası.<br/>
**Age:** Hastanın yaşı.<br/>
**Gender:** Hastanın cinsiyeti (Male/Female).<br/>
**Diagnosis:** Hastalığın teşhisi.<br/>
**Sickness_Duration_Months:** Hastalığın süresi (ay olarak).<br/>
**RBC_Count:** Kırmızı kan hücresi (alyuvar) sayısı.<br/>
**Hemoglobin:** Kanın hemoglobin seviyesi.<br/>
**Hematocrit:** Kan hacminde kırmızı kan hücrelerinin oranı.<br/>
**MCV:** Ortalama alyuvar hacmi.<br/>
**MCH:** Ortalama alyuvar hemoglobin miktarı.<br/>
**MCHC:** Alyuvarlarda hemoglobin konsantrasyonu.<br/>
**RDW:** Alyuvar dağılım genişliği.<br/>
**Reticulocyte_Count:** Retikülosit (genç alyuvar) sayısı.<br/>
**WBC_Count:** Beyaz kan hücresi (akyuvar) sayısı.<br/>
**Neutrophils:** Nötrofil yüzdesi.<br/>
**Lymphocytes:** Lenfosit yüzdesi.<br/>
**Monocytes:** Monosit yüzdesi.<br/>
**Eosinophils:** Eozinofil yüzdesi.<br/>
**Basophils:** Bazofil yüzdesi.<br/>
**PLT_Count:** Trombosit (platelet) sayısı.<br/>
**MPV:** Ortalama trombosit hacmi.<br/>
**ANA:** Antinükleer antikor testi sonucu.<br/>
**Esbach:** Esbach testi sonucu (protein kaçağı için).<br/>
**MBL_Level:** Mannoz bağlayıcı lektin seviyesi.<br/>
**ESR:** Eritrosit sedimentasyon hızı.<br/>
**C3:** Kompleman C3 seviyesi.<br/>
**C4:** Kompleman C4 seviyesi.<br/>
**CRP:** C-reaktif protein seviyesi (iltihap göstergesi).<br/>
**Anti-dsDNA:** Çift sarmallı DNA'ya karşı antikorlar.<br/>
**Anti-Sm:** Sm antijenine karşı antikorlar.<br/>
**Rheumatoid factor:** Romatoid faktör seviyesi.<br/>
**ACPA:** Anti-sitrülinlenmiş protein antikorları.<br/>
**Anti-TPO:** Anti-tiroid peroksidaz antikorları.<br/>
**Anti-Tg:** Anti-tiroglobulin antikorları.<br/>
**Anti-SMA:** Anti-düz kas antikorları.<br/>
**Low-grade fever:** Hafif dereceli ateş.<br/>
**Fatigue or chronic tiredness:** Yorgunluk veya kronik halsizlik.<br/>
**Dizziness:** Baş dönmesi.<br/>
**Weight loss:** Kilo kaybı.<br/>
**Rashes and skin lesions:** Döküntü ve deri lezyonları.<br/>
**Stiffness in the joints:** Eklemlerde sertlik.<br/>
**Brittle hair or hair loss:** Kırılgan saç veya saç dökülmesi.<br/>
**Dry eyes and/or mouth:** Kuru gözler ve/veya ağız.<br/>
**General 'unwell' feeling:** Genel bir "hasta hissetme" durumu.<br/>
**Joint pain:** Eklem ağrısı.<br/>
**Anti_enterocyte_antibodies:** Enterositlere karşı antikorlar.<br/>
**Anti_LKM1:** Karaciğer-böbrek mikrozomal tip 1 (LKM-1) antijenine karşı antikorlar.<br/>
**Anti_RNP:** Ribonükleoproteinlere karşı antikorlar.<br/>
**ASCA:** Anti-Saccharomyces cerevisiae antikorları (Crohn hastalığı ile ilişkili).<br/>
**Anti_Ro_SSA:** Ro/SSA antijenine karşı antikorlar.<br/>
**Anti_CBir1:** CBir1 antijenine karşı antikorlar.<br/>
**Anti_BP230:** BP230 antijenine karşı antikorlar (bullöz hastalıklarla ilişkili).<br/>
**Anti_tTG:** Anti-doku transglutaminaz antikorları.<br/>
**DGP:** Deamidated gliadin peptid antikorları.<br/>
**Anti_BP180:** BP180 antijenine karşı antikorlar (pemfigoid ile ilişkili).<br/>
**ASMA:** Anti-düz kas antikorları (tekrar olabilir).<br/>
**Anti_IF:** İçsel faktörlere (Intrinsic Factor) karşı antikorlar.<br/>
**IgG_IgE_receptor:** IgG veya IgE reseptörüne karşı antikorlar.<br/>
**Anti_SRP:** Sinyal tanıma partikülüne karşı antikorlar.<br/>
**Anti_desmoglein_3:** Desmoglein 3'e karşı antikorlar (pemfigus ile ilişkili).<br/>
**Anti_La_SSB:** La/SSB antijenine karşı antikorlar.<br/>
**Anti_Jo1:** Jo-1 antijenine karşı antikorlar (dermatomiyozit ile ilişkili).<br/>
**ANCA:** Antinötrofil sitoplazmik antikorlar.<br/>
**Anti_centromere:** Sentromer proteinlerine karşı antikorlar.<br/>
**Anti_desmoglein_1:** Desmoglein 1'e karşı antikorlar (pemfigus ile ilişkili).<br/>
**EMA:** Anti-endomisyum antikorları (çölyak hastalığı ile ilişkili).<br/>
**Anti_type_VII_collagen:** Tip VII kollajene karşı antikorlar (epidermolizis bullosa ile ilişkili).<br/>
**C1_inhibitor:** C1 inhibitör seviyesi (kompleman sistemiyle ilişkili).<br/>
**Anti_TIF1:** TIF1 antijenine karşı antikorlar.<br/>
**Anti_epidermal_basement_membrane_IgA:** Epidermal bazal membrana karşı IgA antikorları.<br/>
**Anti_OmpC:** Dış membran protein C'ye karşı antikorlar.<br/>
**pANCA:** Perinükleer antinötrofil sitoplazmik antikorlar.<br/>
**Anti_tissue_transglutaminase:** Anti-doku transglutaminaz (tekrar olabilir).<br/>
**Anti_Scl_70:** Scl-70 antijenine karşı antikorlar (sistemik skleroz ile ilişkili).<br/>
**Anti_Mi2:** Mi-2 antijenine karşı antikorlar.<br/>
**Anti_parietal_cell:** Parietal hücrelere karşı antikorlar.<br/>
**Progesterone_antibodies:** Progesterona karşı antikorlar.<br/>
**Anti_Sm:** Sm antijenine karşı antikorlar (tekrar olabilir).<br/>
**MBL:** Mannoz bağlayıcı lektin (tekrar olabilir).

##### İleri Analizler için Sütun Seçme

In [None]:
# Reload the original file, this time reading five of the columns explicitly as integers
file_path_csv = 'E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\Complete_Updated_Autoimmune_Disorder_Dataset.csv'
integer_columns = ["Anti-Tg", "Anti-TPO", "Dry eyes and mouth", "Joint pain", "ACPA"]
data_filtered = pd.read_csv(file_path_csv, dtype=dict.fromkeys(integer_columns, int))

# Columns kept for the rest of the analysis
selected_columns = [
    'Gender', 'Diagnosis',
    "RBC_Count", "Hemoglobin", "Hematocrit",
    "CRP", "Esbach",
    "Anti-Tg", "Anti-TPO",
    "Dry eyes and mouth", "Joint pain", "ACPA",
    "Fatigue or chronic tiredness",
]

# Independent copy restricted to the selected columns
df = data_filtered.loc[:, selected_columns].copy()

"""
# Seçilen sütunlarla yeni bir CSV dosyası oluşturma
output_path = 'E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\Selected_Columns_Autoimmune_Dataset.csv'
df.to_csv(output_path, index=False)

print(f"Yeni CSV dosyası oluşturuldu: {output_path}")
"""
df

## Graves ve Normal Verileri Ayırma

In [None]:
# Keep only the rows diagnosed as "Graves' disease" or "Normal"
selected_conditions = ["Graves' disease", "Normal"]
# .copy() yields an independent DataFrame: later cells assign to its columns, and doing
# that on a filtered slice of `df` is a chained assignment (SettingWithCopyWarning)
graves_disease_and_normal_data = df[df["Diagnosis"].isin(selected_conditions)].copy()

# Target path for the (currently disabled) CSV export below
file_name_selected = "E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\Graves_and_Normal_String_CBC_Data.csv"

"""
# Veriyi CSV olarak kaydetme
graves_disease_and_normal_data.to_csv(file_name_selected, index=False)

print(f"CSV file created for 'Graves' Disease and 'Normal': {file_name_selected}")
"""

graves_disease_and_normal_data

**Rounded_Float_Columns_Autoimmune_Dataset.csv** dosyasındaki verileri kullanarak Graves' Hastalığı için ayrı bir csv dosyası oluşturdum

## Basic Statistics

In [None]:
# Normalise the Diagnosis labels (trim whitespace, lower-case) so the comparisons below are exact
diagnosis_labels = graves_disease_and_normal_data['Diagnosis'].astype(str).str.strip().str.lower()
graves_disease_and_normal_data['Diagnosis'] = diagnosis_labels

# Split the rows into the Normal and Graves' disease groups
normal_group = graves_disease_and_normal_data[diagnosis_labels == "normal"]
graves_group = graves_disease_and_normal_data[diagnosis_labels == "graves' disease"]

# Report (without stopping) when a group has no rows
for cohort, empty_message in (
    (normal_group, "Hata: Normal grubu için veri bulunamadı!"),
    (graves_group, "Hata: Graves' Disease grubu için veri bulunamadı!"),
):
    if cohort.empty:
        print(empty_message)

# Descriptive statistics for each group
summary_stats_graves = graves_group.describe()
summary_stats_normal = normal_group.describe()

# Show both summary tables
import ace_tools_open as tools
tools.display_dataframe_to_user(name="Graves için Temel İstatistikler", dataframe=summary_stats_graves)
tools.display_dataframe_to_user(name="Normal için Temel İstatistikler", dataframe=summary_stats_normal)


## Correlation Matrix

In [None]:
# Correlation matrix over the numeric columns only
numeric_df = graves_disease_and_normal_data.select_dtypes(include=["number"])
correlation_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix
heatmap_options = dict(annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Korelasyon Matrisi")
plt.show()

## Histplot

In [None]:
# Draw a histogram for every numeric column
numeric_df.hist(figsize=(15, 12), bins=30, edgecolor="black")
plt.suptitle("Sayısal Değişkenlerin Dağılımı", fontsize=16)
plt.show()

### Patients Count for Graves Disease and Normal

In [None]:
selected_conditions = ["Graves' disease", "Normal"]
# .copy() so that the later cells, which overwrite columns of this frame, modify an
# independent DataFrame instead of a filtered slice of `df`
graves_disease_and_normal_data = df[df["Diagnosis"].isin(selected_conditions)].copy()

# Count plot of the Diagnosis classes
plt.figure(figsize=(16, 6))
sns.countplot(x=graves_disease_and_normal_data["Diagnosis"], palette='Set2')

# Title and axis labels
plt.title('Target Variable: Diagnosis', fontsize=18, fontweight='bold')
plt.xlabel("Diagnosis", fontsize=14)
plt.ylabel('Count', fontsize=14)

# Write the count above each bar; bar heights can come back as floats, so format them
# without decimals ("120" rather than "120.0")
for p in plt.gca().patches:
    height = p.get_height()
    plt.gca().annotate(f'{height:.0f}', (p.get_x() + p.get_width() / 2., height),
                       ha='center', va='baseline', fontsize=12, color='black', xytext=(0, 5),
                       textcoords='offset points')

# Remove the grid
plt.grid(False)

# Show the figure
plt.show()


## Graves ve Normal Grubun CBC Değerleri Grafikleri

In [None]:
# Normalize the Diagnosis column (trim whitespace, lower-case)
graves_disease_and_normal_data['Diagnosis'] = graves_disease_and_normal_data['Diagnosis'].astype(str).str.strip().str.lower()

# Output directory: a "Graves_and_Normal_Analysis" sub-folder under the hard-coded
# project path; it is created here if it does not exist yet
output_folder = 'E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\'
output_folder = os.path.join(output_folder, "Graves_and_Normal_Analysis")
os.makedirs(output_folder, exist_ok=True)

# Candidate columns for the Y axis of the per-diagnosis bar plots
y_axis_variables = [
'Gender', 'Diagnosis', "RBC_Count", "Hemoglobin","Hematocrit", "CRP", "Esbach",
"Anti-Tg", "Anti-TPO", "Dry eyes and mouth", "Joint pain","ACPA","Fatigue or chronic tiredness"
]

# Helper that turns a column name into a string usable inside a file name
def clean_filename(name):
    """Return `name` with slashes, backslashes, spaces and the characters : ? * replaced by "_"."""
    cleaned = name
    for unsafe_char in ("/", "\\", " ", ":", "?", "*"):
        cleaned = cleaned.replace(unsafe_char, "_")
    return cleaned

# Set to True to also write each figure (PNG) and its per-diagnosis mean table (XLSX)
# into `output_folder`; off by default, matching the previously disabled export code
SAVE_DIAGNOSIS_BARPLOTS = False

# One bar plot per Y-axis variable: the mean of that variable for each Diagnosis group
for y_var in y_axis_variables:
    if y_var not in graves_disease_and_normal_data.columns:
        continue

    # The bar height is a group mean, so non-numeric columns (the text columns 'Gender'
    # and 'Diagnosis', the latter also being the X axis) cannot be plotted -- skip them
    if not pd.api.types.is_numeric_dtype(graves_disease_and_normal_data[y_var]):
        continue

    # Create the bar plot
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Diagnosis', y=y_var, data=graves_disease_and_normal_data, palette='Set2', ci=None)

    # Add title and labels
    plt.title(f'{y_var} by Diagnosis', fontsize=16, fontweight='bold')
    plt.xlabel('Diagnosis', fontsize=12)
    plt.ylabel(y_var, fontsize=12)

    # Annotate each bar with its value
    for p in plt.gca().patches:
        height = p.get_height()
        if not pd.isna(height):  # Ensure height is not NaN
            plt.gca().annotate(f'{height:.2f}', (p.get_x() + p.get_width() / 2., height),
                               ha='center', va='baseline', fontsize=10, color='black', xytext=(0, 5),
                               textcoords='offset points')

    if SAVE_DIAGNOSIS_BARPLOTS:
        # Clean the variable name for safe file naming
        clean_y_var = clean_filename(y_var)

        # Save the plot as an image
        plot_file_path = os.path.join(output_folder, f"{clean_y_var}_by_Diagnosis.png")
        plt.savefig(plot_file_path, bbox_inches='tight')

        # Save the corresponding data to an Excel file
        data_to_save = graves_disease_and_normal_data.groupby('Diagnosis')[y_var].mean().reset_index()
        excel_file_path = os.path.join(output_folder, f"{clean_y_var}_by_Diagnosis.xlsx")
        data_to_save.to_excel(excel_file_path, index=False)

        print(f"Graph saved at: {plot_file_path}")
        print(f"Data table saved at: {excel_file_path}")

    # Render this figure before the next one is started
    plt.show()

if SAVE_DIAGNOSIS_BARPLOTS:
    print(f"All graphs and individual tables have been saved in the folder: {output_folder}")

# Snapshot that keeps the text labels; a later cell replaces Gender/Diagnosis in
# `graves_disease_and_normal_data` with integer codes
graves_disease_and_normal_data_string = graves_disease_and_normal_data.copy()

In [None]:
graves_disease_and_normal_data_string['Diagnosis'].unique()

## CBC Values for Gender with Standard Deviation

In [None]:
def save_and_plot_gender_groups_with_cbc_values(graves_disease_and_normal_data, target_column, save_tables=False, save_plots=False, output_path=None):
    """
    Summarise the non-binary float columns by Gender (mean and standard deviation).

    Float columns with exactly two distinct non-null values are treated as binary and
    skipped. For every remaining float column the per-Gender mean/std table is computed;
    optionally it is written to Excel and drawn as a bar chart with the standard
    deviation as error bars.

    Parameters:
    - graves_disease_and_normal_data: pandas DataFrame to analyse; grouped by its 'Gender' column.
    - target_column: target column name (e.g. "Diagnosis"); accepted but not used by the body.
    - save_tables: when True, write "<column>_gender_group_table.xlsx" for each column.
    - save_plots: when True, draw one bar chart per column (the code that would save the
      figure to disk is disabled inside the string literal at the end of the loop).
    - output_path: parent directory; a "CBC_Values_by_Gender" sub-folder is created in it.

    Raises:
    - ValueError: if output_path is None or empty.
    """
    frame = graves_disease_and_normal_data

    # Float columns, minus those with exactly two distinct non-null values
    float_columns = [
        name
        for name in frame.select_dtypes(include=['float']).columns
        if frame[name].dropna().nunique() != 2
    ]

    # Resolve and create the output folder
    if not output_path:
        raise ValueError("Output path cannot be None or empty.")
    output_folder = os.path.join(output_path, "CBC_Values_by_Gender")
    os.makedirs(output_folder, exist_ok=True)

    # Paths of the files written by this call
    saved_files = []

    for col in float_columns:
        mean_label, std_label = f'{col}_mean', f'{col}_std'

        # Mean and standard deviation of the column per Gender
        grouped_data = frame.groupby('Gender')[col].agg(['mean', 'std']).reset_index()
        grouped_data.columns = ['Gender', mean_label, std_label]

        # Write the table as an Excel file
        if save_tables:
            output_file = os.path.join(output_folder, f"{col}_gender_group_table.xlsx")
            grouped_data.to_excel(output_file, index=False)
            saved_files.append(output_file)

        # Bar chart of the means with the standard deviation as error bars
        if save_plots:
            plt.figure(figsize=(12, 6))
            plt.bar(
                grouped_data['Gender'],
                grouped_data[mean_label],
                yerr=grouped_data[std_label],
                capsize=5,
                color=['#4C72B0', '#C44E52'],  # one colour per bar
                alpha=0.7,
            )

            # Title and axis labels
            plt.title(f'{col} Levels by Gender', fontsize=16)
            plt.xlabel('Gender', fontsize=12)
            plt.ylabel(f'{col} Level', fontsize=12)
            plt.xticks(fontsize=10)
            plt.tight_layout()

            """
            # Grafik dosyasını kaydetme
            plot_file = os.path.join(output_folder, f"{col}_plot_by_gender.png")
            plt.savefig(plot_file, bbox_inches='tight')
            plt.close()
            saved_files.append(plot_file)

    # İşlem tamamlandığında kaydedilen dosyaların listesini yazdır
    print(f"Files saved to {output_folder}:\n" + "\n".join(saved_files))
    """

# Usage: write the per-gender tables and draw the charts under the project folder
output_path = "E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\"

save_and_plot_gender_groups_with_cbc_values(
    graves_disease_and_normal_data,
    target_column='Diagnosis',
    save_tables=True,
    save_plots=True,
    output_path=output_path
)

## Graves için Her Bir Gruptaki Veri Sayısı

In [None]:
# Normalize the Diagnosis column (trim whitespace, lower-case)
graves_disease_and_normal_data['Diagnosis'] = graves_disease_and_normal_data['Diagnosis'].astype(str).str.strip().str.lower()

# Keep only the Graves' disease group
graves_data = graves_disease_and_normal_data[graves_disease_and_normal_data['Diagnosis'] == "graves' disease"]

# Columns whose non-null values are counted
cbc_variables = ['Gender', 'Diagnosis', "RBC_Count", "Hemoglobin", "Hematocrit", "CRP", "Esbach",
                 "Anti-Tg", "Anti-TPO", "Dry eyes and mouth", "Joint pain", "ACPA", "Fatigue or chronic tiredness"]

# Count the non-null values per gender within the Graves' disease group
cbc_counts = graves_data.groupby('Gender')[cbc_variables].count()

# Print the result
print("Graves' disease grubundaki CBC değişkenlerinin cinsiyete göre dağılımı:")
print(cbc_counts)

## Normal için Her Bir Gruptaki Veri Sayısı

In [None]:
# Normalize the Diagnosis column (trim whitespace, lower-case)
graves_disease_and_normal_data['Diagnosis'] = graves_disease_and_normal_data['Diagnosis'].astype(str).str.strip().str.lower()

# Keep only the Normal group
normal_data = graves_disease_and_normal_data[graves_disease_and_normal_data['Diagnosis'] == "normal"]

# Columns whose non-null values are counted
cbc_variables = ['Gender', 'Diagnosis', "RBC_Count", "Hemoglobin", "Hematocrit", "CRP", "Esbach",
                 "Anti-Tg", "Anti-TPO", "Dry eyes and mouth", "Joint pain", "ACPA", "Fatigue or chronic tiredness"]

# Count the non-null values per gender within the Normal group
cbc_counts = normal_data.groupby('Gender')[cbc_variables].count()

# Print the result
print("Normal disease grubundaki CBC değişkenlerinin cinsiyete göre dağılımı:")
print(cbc_counts)

## Convert from String to Integer

In [None]:
# Work on an independent copy so the column assignments below never write into a
# filtered slice of another DataFrame
graves_disease_and_normal_data = graves_disease_and_normal_data.copy()

# Integer codes for the Gender and Diagnosis text labels
gender_mapping = {'male': 0, 'female': 1}
diagnosis_mapping = {"graves' disease": 0, 'normal': 1}

for column_name, label_mapping in (('Gender', gender_mapping), ('Diagnosis', diagnosis_mapping)):
    column_values = graves_disease_and_normal_data[column_name]

    # Already numeric (e.g. this cell was run before): mapping the codes again as if they
    # were labels would turn every value into NaN, so leave the column as it is
    if pd.api.types.is_numeric_dtype(column_values):
        continue

    # Normalise the labels (string, trimmed, lower-case) and translate them to codes
    normalized_labels = column_values.astype(str).str.strip().str.lower()
    encoded_values = normalized_labels.map(label_mapping)

    # Report labels that have no code, for both columns, showing the offending labels
    unmapped = encoded_values.isnull()
    if unmapped.any():
        print(f"Warning: Unmapped values found in {column_name} column!")
        print(normalized_labels[unmapped].unique())

    graves_disease_and_normal_data[column_name] = encoded_values

# Remove duplicates if any
graves_disease_and_normal_data = graves_disease_and_normal_data.drop_duplicates()

"""
# Save the updated dataset
output_file_path = 'E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\Graves_and_Normal_Integer_CBC_Data.csv'
graves_disease_and_normal_data.to_csv(output_file_path, index=False)
print(f"File saved: {output_file_path}")
"""

graves_disease_and_normal_data

# Save the File

In [None]:
"""
import nbformat
import os
from datetime import datetime
import ipynbname

def save_current_notebook_as_backup(save_directory):
    try:
        # Find current notebook
        notebook_path = ipynbname.path()

        # Create backup name with timestamp
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"cbc_values_statistics_to_gender_{timestamp}.ipynb"

        # Create backup index and save to notebook
        os.makedirs(save_directory, exist_ok=True)
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook_content = nbformat.read(f, as_version=4)

        new_file_path = os.path.join(save_directory, filename)
        with open(new_file_path, 'w', encoding='utf-8') as f:
            nbformat.write(notebook_content, f)

        print(f"Notebook {new_file_path} olarak kaydedildi.")
        return new_file_path
    except Exception as e:
        print(f"Hata oluştu: {e}")

# Usage
save_current_notebook_as_backup("E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\")
"""

## Select Column for Machine Learning

In [None]:
# Columns selected for machine learning
selected_columns_for_ML = ["Gender","Diagnosis","CRP", "Esbach","Anti-Tg","Anti-TPO", 
                           "Dry eyes and mouth", "Joint pain","ACPA","Fatigue or chronic tiredness"]

# Taken from the `_string` snapshot, i.e. the copy made before Gender/Diagnosis were
# replaced by integer codes
ML_filtered_df_str = graves_disease_and_normal_data_string[selected_columns_for_ML].copy()

# Disabled CSV export, kept as a string literal.
# NOTE(review): it refers to `ML_filtered_df`, while this cell defines
# `ML_filtered_df_str` -- rename before re-enabling.
"""
# Seçilen sütunlarla yeni bir CSV dosyası oluşturma
output_path = 'E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\Selected_Columns_for_ML_Autoimmune_Dataset.csv'
ML_filtered_df.to_csv(output_path, index=False)

print(f"Yeni CSV dosyası oluşturuldu: {output_path}")
"""

ML_filtered_df_str

## Statistics for ML Columns

In [None]:
# Diagnosis may hold text labels or, once the "Convert from String to Integer" cell has
# run, integer codes. Re-normalising the codes as text and comparing them with "normal" /
# "graves' disease" matched no rows (every statistic came out NaN), so both
# representations are resolved to the integer codes first.
diagnosis_codes_by_label = {"graves' disease": 0, 'normal': 1}  # same codes as diagnosis_mapping

diagnosis_values = graves_disease_and_normal_data['Diagnosis']
if not pd.api.types.is_numeric_dtype(diagnosis_values):
    graves_disease_and_normal_data['Diagnosis'] = (
        diagnosis_values.astype(str).str.strip().str.lower().map(diagnosis_codes_by_label)
    )

# Variables to analyse (only those present in the data set are kept)
y_axis_variables = ['Gender', 'Diagnosis', "RBC_Count", "Hemoglobin", "Hematocrit", "CRP", "Esbach",
                 "Anti-Tg", "Anti-TPO", "Dry eyes and mouth", "Joint pain", "ACPA", "Fatigue or chronic tiredness"]
y_axis_variables = [col for col in y_axis_variables if col in graves_disease_and_normal_data.columns]

# Coerce every analysed column to numeric (values that cannot be parsed become NaN)
for col in y_axis_variables:
    graves_disease_and_normal_data[col] = pd.to_numeric(graves_disease_and_normal_data[col], errors='coerce')

# Split into the two groups using the integer codes
normal_group = graves_disease_and_normal_data[
    graves_disease_and_normal_data['Diagnosis'] == diagnosis_codes_by_label['normal']
]
graves_group = graves_disease_and_normal_data[
    graves_disease_and_normal_data['Diagnosis'] == diagnosis_codes_by_label["graves' disease"]
]

# Same empty-group report as in the "Basic Statistics" section
if normal_group.empty:
    print("Hata: Normal grubu için veri bulunamadı!")
if graves_group.empty:
    print("Hata: Graves' Disease grubu için veri bulunamadı!")

# Mean and standard deviation of every variable in each group
results = {
    "Variable": [],
    "Normal_Mean": [],
    "Normal_Std": [],
    "Graves_Mean": [],
    "Graves_Std": []
}
for y_var in y_axis_variables:
    results["Variable"].append(y_var)
    results["Normal_Mean"].append(normal_group[y_var].mean())
    results["Normal_Std"].append(normal_group[y_var].std())
    results["Graves_Mean"].append(graves_group[y_var].mean())
    results["Graves_Std"].append(graves_group[y_var].std())

# Assemble the summary table
stats_df = pd.DataFrame(results)

# Show the result
import ace_tools_open as tools
tools.display_dataframe_to_user(name="Graves and Normal Statistics", dataframe=stats_df)


In [None]:
graves_disease_and_normal_data.dtypes

## Columns Chosen for ML, by Gender Group

In [None]:
def choosen_with_cbc_values(graves_disease_and_normal_data, target_column, save_tables=False, save_plots=False, output_path=None):
    """
    Summarise the non-binary float columns by Gender (mean and standard deviation).

    Float columns with exactly two distinct non-null values are treated as binary and
    skipped. For every remaining float column the per-Gender mean/std table is computed;
    optionally it is written to Excel and drawn as a bar chart with the standard
    deviation as error bars.

    Parameters:
    - graves_disease_and_normal_data: pandas DataFrame to analyse; grouped by its 'Gender' column.
    - target_column: target column name (e.g. "Diagnosis"); accepted but not used by the body.
    - save_tables: when True, write "<column>_gender_group_table.xlsx" for each column.
    - save_plots: when True, draw one bar chart per column (the titles, labels and the
      code that would save the figure are disabled inside the string literal at the end
      of the loop).
    - output_path: parent directory; a "CBC_Values_by_Gender" sub-folder is created in it.

    Raises:
    - ValueError: if output_path is None or empty.
    """
    frame = graves_disease_and_normal_data

    # Float columns, minus those with exactly two distinct non-null values
    float_columns = [
        name
        for name in frame.select_dtypes(include=['float']).columns
        if frame[name].dropna().nunique() != 2
    ]

    # Resolve and create the output folder
    if not output_path:
        raise ValueError("Output path cannot be None or empty.")
    output_folder = os.path.join(output_path, "CBC_Values_by_Gender")
    os.makedirs(output_folder, exist_ok=True)

    # Paths of the files written by this call
    saved_files = []

    for col in float_columns:
        mean_label, std_label = f'{col}_mean', f'{col}_std'

        # Mean and standard deviation of the column per Gender
        grouped_data = frame.groupby('Gender')[col].agg(['mean', 'std']).reset_index()
        grouped_data.columns = ['Gender', mean_label, std_label]

        # Write the table as an Excel file
        if save_tables:
            output_file = os.path.join(output_folder, f"{col}_gender_group_table.xlsx")
            grouped_data.to_excel(output_file, index=False)
            saved_files.append(output_file)

        # Bar chart of the means with the standard deviation as error bars
        if save_plots:
            plt.figure(figsize=(12, 6))
            plt.bar(
                grouped_data['Gender'],
                grouped_data[mean_label],
                yerr=grouped_data[std_label],
                capsize=5,
                color=['#4C72B0', '#C44E52'],  # one colour per bar
                alpha=0.7,
            )

            """
            # Başlık ve etiketler
            plt.title(f'{col} Levels by Gender', fontsize=16)
            plt.xlabel('Gender', fontsize=12)
            plt.ylabel(f'{col} Level', fontsize=12)
            plt.xticks(fontsize=10)
            plt.tight_layout()

            # Grafik dosyasını kaydetme
            plot_file = os.path.join(output_folder, f"{col}_plot_by_gender.png")
            plt.savefig(plot_file, bbox_inches='tight')
            plt.close()
            saved_files.append(plot_file)

    # İşlem tamamlandığında kaydedilen dosyaların listesini yazdır
    print(f"Files saved to {output_folder}:\n" + "\n".join(saved_files))
    """

# Usage. NOTE(review): this targets the same "CBC_Values_by_Gender" folder and file names
# as save_and_plot_gender_groups_with_cbc_values, so tables written earlier are overwritten.
output_path = "E:\\Veri Bilimi Topluluğu\\proje-AID\\github\\"

choosen_with_cbc_values(
    graves_disease_and_normal_data,
    target_column='Diagnosis',
    save_tables=True,
    save_plots=True,
    output_path=output_path
)

## Correlation Analysis

In [None]:
# Columns to correlate, limited to those actually present in the DataFrame
candidate_columns = [
    "Diagnosis", "CRP", "Esbach", "Anti-Tg", "Anti-TPO",
    "Dry eyes and mouth", "Joint pain", "ACPA", "Fatigue or chronic tiredness",
]
available_columns = graves_disease_and_normal_data.columns
num_cols = [name for name in candidate_columns if name in available_columns]

# Pairwise correlation table of the selected columns
corr = graves_disease_and_normal_data.loc[:, num_cols].corr()
corr

## Correlation Matrix

In [None]:
# Annotated heatmap of `corr`, with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix', fontsize=16)
plt.show()

**General Description:**<br/>

1. Matrix Structure:

- Each cell in the heatmap represents the correlation coefficient between two variables.<br/>
- Variables are listed along both the x-axis and y-axis.<br/>

2. Color Scheme (cmap="coolwarm"):

The `coolwarm` colormap is used, centered at 0 (`center=0`):<br/>
- Red (Positive Correlation): Indicates that two variables are positively correlated (as one increases, the other also tends to increase).<br/>
- Blue (Negative Correlation): Indicates a negative correlation (as one increases, the other tends to decrease).<br/>
- Light/Gray Shades: Near-zero correlations, implying no strong linear relationship.<br/>

3. Annotations (annot=True):<br/>

- Each cell is annotated with the numerical value of the correlation coefficient (shown with seaborn's default `.2g` format, i.e. two significant digits).

**Key Correlation Coefficients:**<br/>

1. Range of Values:

Values range from -1 to 1:<br/>
   - +1: Perfect positive linear relationship.<br/>
   - -1: Perfect negative linear relationship.<br/>
   -  0: No linear relationship.<br/>
     
2. Strong Correlations:

- Cells with values close to +1 or -1 (intense red or blue shades) indicate strong relationships.<br/>
- Look for clusters of strong correlations to identify groups of variables that are highly related.

3. Weak/No Correlations:

- Cells near 0 (white or light shades) suggest weak or no linear relationship between the variables.

**How to Use the Heatmap:**<br/>

1. Identify Highly Correlated Variables:

- Strong positive correlations (e.g., close to +1) suggest variables that might contain redundant information.
- Strong negative correlations (e.g., close to -1) suggest inverse relationships.<br/>

2. Feature Selection:

- If two variables are strongly correlated, consider removing one from your model to reduce multicollinearity.

3. Data Relationships:

- The heatmap helps you quickly spot trends and relationships in the data that might not be immediately apparent.

# TENSORFLOW

## Extract the data from the csv

In [39]:
"""
raw_csv_data = pd.read_csv('E:\\Veri Bilimi Topluluğu\\proje-AID\\github-Graves Disease\\Graves_and_Normal_Integer_CBC_Data.csv')

# The inputs are all columns in the csv, except for the first one [:,0]
# (which is just the arbitrary customer IDs that bear no useful information),
# and the last one [:,-1] (which is our targets)

unscaled_inputs_all = raw_csv_data.iloc[:, 1:-1].values

# The targets are in the last column. That's how datasets are conventionally organized.
targets_all = raw_csv_data.iloc[:,-1].values
"""

## Balance the dataset

In [40]:
"""
# Count how many targets are 1 (meaning that the customer did convert)
num_one_targets = int(np.sum(targets_all))

# Set a counter for targets that are 0 (meaning that the customer did not convert)
zero_targets_counter = 0

# We want to create a "balanced" dataset, so we will have to remove some input/target pairs.
# Declare a variable that will do that:
indices_to_remove = []

# Count the number of targets that are 0. 
# Once there are as many 0s as 1s, mark entries where the target is 0.
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

# Create two new variables, one that will contain the inputs, and one that will contain the targets.
# We delete all indices that we marked "to remove" in the loop above.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)
"""

## Standardize the inputs

In [41]:
"""
from sklearn import preprocessing

# That's the only place we use sklearn functionality. We will take advantage of its preprocessing capabilities
# It's a simple line of code, which standardizes the inputs, as we explained in one of the lectures.
# At the end of the business case, you can try to run the algorithm WITHOUT this line of code. 
# The result will be interesting.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)
"""

## Shuffle the data

In [42]:
"""
# When the data was collected it was actually arranged by date
# Shuffle the indices of the data, so the data is not arranged in any way when we feed it.
# Since we will be batching, we want the data to be as randomly spread out as possible
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

# Use the shuffled indices to shuffle the inputs and targets.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]
"""

## Split the dataset into train, validation, and test

In [43]:
"""
# Count the total number of samples
samples_count = shuffled_inputs.shape[0]

# Count the samples in each subset, assuming we want 80-10-10 distribution of training, validation, and test.
# Naturally, the numbers are integers.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)

# The 'test' dataset contains all remaining data.
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Create variables that record the inputs and targets for training
# In our shuffled dataset, they are the first "train_samples_count" observations
train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

# Create variables that record the inputs and targets for validation.
# They are the next "validation_samples_count" observations, folllowing the "train_samples_count" we already assigned
validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

# Create variables that record the inputs and targets for test.
# They are everything that is remaining.
test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]

# We balanced our dataset to be 50-50 (for targets 0 and 1), but the training, validation, and test were 
# taken from a shuffled dataset. Check if they are balanced, too. Note that each time you rerun this code, 
# you will get different values, as each time they are shuffled randomly.
# Normally you preprocess ONCE, so you need not rerun this code once it is done.
# If you rerun this whole sheet, the npzs will be overwritten with your newly preprocessed data.

# Print the number of targets that are 1s, the total number of samples, and the proportion for training, validation, and test.
print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)
"""

1841 3537 0.5204975968334747
239 442 0.5407239819004525
245 443 0.5530474040632054


## Save the three datasets in *.npz

In [44]:
# Disabled cell: the np.savez calls below sit inside a bare string literal, so nothing is written.
# When enabled, each subset is stored in its own archive under the keys 'inputs' and 'targets'.
# The file names share the 'AutoImmuneDisease_data_' prefix and differ only in the subset
# suffix (train / validation / test); the loading cell further down opens the same names.
"""
np.savez('AutoImmuneDisease_data_train', inputs=train_inputs, targets=train_targets)
np.savez('AutoImmuneDisease_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('AutoImmuneDisease_data_test', inputs=test_inputs, targets=test_targets)
"""

In [45]:
"""
# let's create a temporary variable npz, where we will store each of the three Audiobooks datasets
npz = np.load('AutoImmuneDisease_data_train.npz')

# we extract the inputs using the keyword under which we saved them
# to ensure that they are all floats, let's also take care of that
train_inputs = npz['inputs'].astype(float)
# targets must be int because of sparse_categorical_crossentropy (we want to be able to smoothly one-hot encode them)
train_targets = npz['targets'].astype(int)

# we load the validation data in the temporary variable
npz = np.load('AutoImmuneDisease_data_validation.npz')
# we can load the inputs and the targets in the same line
validation_inputs, validation_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

# we load the test data in the temporary variable
npz = np.load('AutoImmuneDisease_data_test.npz')
# we create 2 variables that will contain the test inputs and the test targets
test_inputs, test_targets = npz['inputs'].astype(float), npz['targets'].astype(int)
"""

In [46]:
"""
import tensorflow as tf

# Set the input and output sizes
input_size = 10
output_size = 2
# Use same hidden layer size for both hidden layers. Not a necessity.
hidden_layer_size = 50
    
# define how the model will look like
model = tf.keras.Sequential([
    # tf.keras.layers.Dense is basically implementing: output = activation(dot(input, weight) + bias)
    # it takes several arguments, but the most important ones for us are the hidden_layer_size and the activation function
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 2nd hidden layer
    # the final layer is no different, we just make sure to activate it with softmax
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])


### Choose the optimizer and the loss function

# we define the optimizer we'd like to use, 
# the loss function, 
# and the metrics we are interested in obtaining at each iteration
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### Training
# That's where we train the model we have built.

# set the batch size
batch_size = 100

# set a maximum number of training epochs
max_epochs = 100

# set an early stopping mechanism
# let's set patience=2, to be a bit tolerant against random validation loss increases
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# fit the model
# note that this time the train, validation and test data are not iterable
model.fit(train_inputs, # train inputs
          train_targets, # train targets
          batch_size=batch_size, # batch size
          epochs=max_epochs, # epochs that we will train for (assuming early stopping doesn't kick in)
          # callbacks are functions called by a task when a task is completed
          # task here is to check if val_loss is increasing
          callbacks=[early_stopping], # early stopping
          validation_data=(validation_inputs, validation_targets), # validation data
          verbose = 2 # making sure we get enough information about the training process
          )  
"""

Epoch 1/100
36/36 - 2s - 49ms/step - accuracy: 0.6127 - loss: 0.6340 - val_accuracy: 0.6946 - val_loss: 0.5791
Epoch 2/100
36/36 - 0s - 3ms/step - accuracy: 0.7334 - loss: 0.5491 - val_accuracy: 0.7557 - val_loss: 0.5201
Epoch 3/100
36/36 - 0s - 2ms/step - accuracy: 0.7908 - loss: 0.4887 - val_accuracy: 0.8145 - val_loss: 0.4639
Epoch 4/100
36/36 - 0s - 2ms/step - accuracy: 0.8329 - loss: 0.4336 - val_accuracy: 0.8235 - val_loss: 0.4110
Epoch 5/100
36/36 - 0s - 2ms/step - accuracy: 0.8606 - loss: 0.3772 - val_accuracy: 0.8665 - val_loss: 0.3529
Epoch 6/100
36/36 - 0s - 2ms/step - accuracy: 0.8892 - loss: 0.3232 - val_accuracy: 0.8937 - val_loss: 0.3108
Epoch 7/100
36/36 - 0s - 2ms/step - accuracy: 0.9050 - loss: 0.2780 - val_accuracy: 0.9118 - val_loss: 0.2587
Epoch 8/100
36/36 - 0s - 2ms/step - accuracy: 0.9174 - loss: 0.2363 - val_accuracy: 0.9412 - val_loss: 0.2196
Epoch 9/100
36/36 - 0s - 3ms/step - accuracy: 0.9384 - loss: 0.2058 - val_accuracy: 0.9412 - val_loss: 0.1946
Epoch 10/

<keras.src.callbacks.history.History at 0x13f7c785df0>

## Test the model

In [47]:
# Disabled (commented out): evaluates `model` on `test_inputs` / `test_targets` and unpacks
# the result into `test_loss` and `test_accuracy`. The progress line stored below this cell
# comes from an earlier run.
# test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 999us/step - accuracy: 0.9525 - loss: 0.0938


In [48]:
# Disabled (commented out): prints `test_loss` with two decimals and `test_accuracy` as a
# percentage. The text stored below this cell comes from an earlier run.
# print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.10. Test accuracy: 95.26%


# Save the Model

In [49]:
"""
import nbformat
import os
from datetime import datetime
import ipynbname

def save_current_notebook_as_backup(save_directory):
    try:
        # Find current notebook
        notebook_path = ipynbname.path()

        # Create backup name with timestamp
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"AID_Prediction_with_tensor_flow_{timestamp}.ipynb"

        # Create backup index and save to notebook
        os.makedirs(save_directory, exist_ok=True)
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook_content = nbformat.read(f, as_version=4)

        new_file_path = os.path.join(save_directory, filename)
        with open(new_file_path, 'w', encoding='utf-8') as f:
            nbformat.write(notebook_content, f)

        print(f"Notebook {new_file_path} olarak kaydedildi.")
        return new_file_path
    except Exception as e:
        print(f"Hata oluştu: {e}")

# Usage
save_current_notebook_as_backup("E:\\Veri Bilimi Topluluğu\\proje-AID\\github-Graves Disease\\")
"""

Notebook E:\Veri Bilimi Topluluğu\proje-AID\github-Graves Disease\AID_Prediction_with_tensor_flow_20250131_134055.ipynb olarak kaydedildi.


'E:\\Veri Bilimi Topluluğu\\proje-AID\\github-Graves Disease\\AID_Prediction_with_tensor_flow_20250131_134055.ipynb'