In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the adult-income CSV from its Kaggle input location.
csv_path = '/kaggle/input/adult-income-dataset/adult.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
# Explicit column names, one per column in file order; 'income' (last) is the
# column later used as the prediction target.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'educational-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
data.columns = column_names

In [5]:
# Missing values appear as the literal string '?': turn them into NaN, then
# drop every row that has at least one missing field.
data = (
    data
    .replace(to_replace='?', value=np.nan)
    .dropna(axis=0, how='any')
)

In [6]:
# Encode the categorical feature columns as integer codes.
# The same encoder object is refitted for every column, so after the loop it
# only holds the classes of the last column processed.
categorical_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'native-country',
]
le = LabelEncoder()
for column_name in categorical_cols:
    encoded_values = le.fit_transform(data[column_name])
    data[column_name] = encoded_values

In [7]:
# Encode the target column as integers (<=50K -> 0, >50K -> 1).
income_codes = le.fit_transform(data['income'])
data['income'] = income_codes

In [8]:
# Separate the target vector from the feature matrix.
y = data['income']
X = data.drop(columns='income')

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Standardise the features: fit the scaler on the training split only, then
# apply that same fitted transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Candidate classifiers, keyed by the display name used when reporting results.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42, max_iter=500)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42, max_depth=5)
models['Naive Bayes'] = GaussianNB()

In [12]:
# Container for the evaluation metrics: model name -> dict of metric values.
results = dict()

In [13]:
# Evaluate each candidate: 5-fold cross-validation on the scaled training set,
# then one fit on the full training set scored against the held-out test set.
# NOTE(review): X_train_scaled was standardised with statistics from the whole
# training set, so every CV validation fold already influenced the scaler;
# cross-validating a scaler+model Pipeline on the unscaled data would avoid that.
for name, model in models.items():
    print(f"Çalışıyor: {name}", flush=True)
    try:
        # A single multi-metric run scores accuracy and F1 on the same fitted
        # folds, so each fold is trained once rather than once per metric.
        cv_scores = cross_validate(
            model, X_train_scaled, y_train,
            cv=5, scoring=['accuracy', 'f1'], n_jobs=-1,
        )
        cv_accuracy = cv_scores['test_accuracy']
        cv_f1 = cv_scores['test_f1']
        # Test-set accuracy and F1 after fitting on the full training set.
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        test_accuracy = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)
        # Store this model's metrics under its display name.
        results[name] = {
            'CV Accuracy': cv_accuracy.mean(),
            'CV Accuracy Std': cv_accuracy.std(),
            'CV F1 Score': cv_f1.mean(),
            'Test Accuracy': test_accuracy,
            'Test F1 Score': test_f1
        }
        print(f"{name} tamamlandı!", flush=True)
    except Exception as e:
        # Best-effort: report the failure and carry on with the remaining models.
        print(f"Hata {name}: {str(e)}", flush=True)

Çalışıyor: Logistic Regression
Logistic Regression tamamlandı!
Çalışıyor: Decision Tree
Decision Tree tamamlandı!
Çalışıyor: Naive Bayes
Naive Bayes tamamlandı!


In [14]:
# Show the collected metrics as a table with one row per model;
# nothing is built or printed when no model produced results.
if results:
    metrics_by_model = pd.DataFrame(results)
    results_df = metrics_by_model.transpose()
    rounded_results = results_df.round(4)
    print("\nAlgoritma Başarı Oranları:", flush=True)
    print(rounded_results, flush=True)


Algoritma Başarı Oranları:
                     CV Accuracy  CV Accuracy Std  CV F1 Score  Test Accuracy  \
Logistic Regression       0.8202           0.0047       0.5602         0.8211   
Decision Tree             0.8463           0.0021       0.6331         0.8517   
Naive Bayes               0.7985           0.0029       0.4593         0.7989   

                     Test F1 Score  
Logistic Regression         0.5488  
Decision Tree               0.6390  
Naive Bayes                 0.4453  


In [15]:
# Report the model with the highest test-set accuracy.
# results_df is only created when at least one model was evaluated, so the same
# guard is applied here to avoid a NameError (or a stale table) when every
# model failed.
# NOTE(review): ranking by the test score uses the held-out set for model
# selection; ranking by 'CV Accuracy' would keep the test set untouched.
if results:
    test_accuracy_by_model = results_df['Test Accuracy']
    best_model = test_accuracy_by_model.idxmax()
    best_score = test_accuracy_by_model.max()
    print(f"\nEn Başarılı Algoritma: {best_model} (Test Doğruluğu: {best_score:.4f})")
else:
    print("\nHiçbir model başarıyla değerlendirilemedi.")


En Başarılı Algoritma: Decision Tree (Test Doğruluğu: 0.8517)
