Data source: UCI Machine Learning Repository, Adult dataset — https://archive.ics.uci.edu/ml/datasets/Adult

In [1]:
import pandas as pd
from tpot import TPOTClassifier
import numpy as np

In [2]:
# Column labels applied to both CSV files, in file order; "class" is the target column.
names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "class",
]

In [3]:
# Load the training and test splits; column labels are supplied through `names`.
# skiprows=1 discards the first line of adult.test
# (presumably a non-data line -- confirm against the file).
# NOTE(review): the dummy-column names shown further down (e.g. "workclass_ ?") indicate the
# string values keep a leading space after parsing.
train = pd.read_csv("adult.data", names=names)
test = pd.read_csv("adult.test", names=names, skiprows=1)

In [4]:
# Preview the first rows of the training frame to sanity-check the parse.
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Inspect column dtypes: `object` columns are the categoricals that still need encoding.
train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object

In [6]:
def get_one_hot(dataframe, col_name):
    """Return a new frame in which ``col_name`` is replaced by one-hot columns.

    The indicator columns are named ``"<col_name>_<value>"`` and are appended
    after the remaining original columns. The input frame is left untouched,
    so the caller can safely keep using (or re-encoding) it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Label of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` without ``col_name`` and with the indicator
        columns joined on the index.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``dataframe``.
    """
    indicators = pd.get_dummies(dataframe[col_name], prefix=col_name)
    # drop() without inplace=True returns a new frame instead of silently
    # removing the column from the object the caller passed in.
    return dataframe.drop(columns=col_name).join(indicators)

In [7]:
# One-hot encode every multi-valued categorical column in both splits (training, test).
# The tuple order is kept as-is because it decides the order in which the
# indicator columns are appended to each frame.
for column in (
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "native-country",
    "race",
):
    train = get_one_hot(train, column)
    test = get_one_hot(test, column)

In [8]:
# Encode the two binary columns ("sex" and the "class" target) as integer category codes.
#
# The label -> code mapping is learned from the training split and reused for the test
# split, so a given label always receives the same code in both frames. Encoding each
# frame on its own only agrees when both frames happen to contain the same sorted set
# of labels.
for col in ("sex", "class"):
    # Normalize before comparing: strip surrounding whitespace and a trailing ".".
    # NOTE(review): the UCI adult.test file is documented to write class labels with a
    # trailing period ("<=50K.") -- confirm against your copy of the data.
    train_labels = train[col].str.strip().str.rstrip(".").astype("category")
    test_labels = test[col].str.strip().str.rstrip(".")

    # Fail loudly instead of silently encoding an unknown test label as -1.
    unseen = set(test_labels.dropna()) - set(train_labels.cat.categories)
    if unseen:
        raise ValueError(
            "column %r: test labels not present in train: %s" % (col, sorted(unseen))
        )

    train[col] = train_labels.cat.codes
    test[col] = pd.Categorical(
        test_labels, categories=train_labels.cat.categories
    ).codes

In [9]:
# Re-check dtypes after encoding: every remaining column should now be numeric.
train.dtypes

age                                           int64
fnlwgt                                        int64
education-num                                 int64
sex                                            int8
capital-gain                                  int64
capital-loss                                  int64
hours-per-week                                int64
class                                          int8
workclass_ ?                                  uint8
workclass_ Federal-gov                        uint8
workclass_ Local-gov                          uint8
workclass_ Never-worked                       uint8
workclass_ Private                            uint8
workclass_ Self-emp-inc                       uint8
workclass_ Self-emp-not-inc                   uint8
workclass_ State-gov                          uint8
workclass_ Without-pay                        uint8
education_ 10th                               uint8
education_ 11th                               uint8
education_ 1

In [10]:
# Align the test frame with the training frame in a single step:
#   * columns that exist only in train are added to test, filled with 0;
#   * the result carries exactly train's columns, in train's order.
test = test.reindex(columns=train.columns, fill_value=0)

In [11]:
# Separate the target column ("class") from the feature matrix for each split.
X_train, y_train = train.drop(columns=["class"]), train["class"]
X_test, y_test = test.drop(columns=["class"]), test["class"]


In [15]:
# Candidate values per hyper-parameter, one dict per estimator.

# GaussianNB is listed with no hyper-parameter candidates.
_gaussian_nb_space = {}

_decision_tree_space = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(1, 11),
    "min_samples_split": range(2, 21),
    "min_samples_leaf": range(1, 21),
}

_k_neighbors_space = {
    "n_neighbors": range(1, 101),
    "weights": ["uniform", "distance"],
    "p": [1, 2],
}

_random_forest_space = {
    "n_estimators": [10],
    "criterion": ["gini", "entropy"],
    # fractions 0.05, 0.10, ..., 1.00
    "max_features": np.arange(0.05, 1.01, 0.05),
    "min_samples_split": range(2, 21),
    "min_samples_leaf": range(1, 21),
    "bootstrap": [True, False],
}

_linear_svc_space = {
    "penalty": ["l1", "l2"],
    "loss": ["hinge", "squared_hinge"],
    "dual": [True, False],
    "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
}

# Search space passed to TPOT via `config_dict`:
# estimator import path -> {hyper-parameter name: candidate values}.
classifier_config = {
    "sklearn.naive_bayes.GaussianNB": _gaussian_nb_space,
    "sklearn.tree.DecisionTreeClassifier": _decision_tree_space,
    "sklearn.neighbors.KNeighborsClassifier": _k_neighbors_space,
    "sklearn.ensemble.RandomForestClassifier": _random_forest_space,
    "sklearn.svm.LinearSVC": _linear_svc_space,
}

In [16]:
# Configure a small TPOT search (5 generations, population of 5) restricted to the
# estimators in `classifier_config`. random_state is fixed so repeated runs are comparable.
classifier = TPOTClassifier(
    generations=5,
    population_size=5,
    verbosity=2,
    random_state=42,
    periodic_checkpoint_folder="../output/",
    config_dict=classifier_config,
)

In [17]:
# Run the pipeline search on the training split. As the cell's last expression,
# the fitted estimator is also displayed.
classifier.fit(features=X_train, target=y_train)

Optimization Progress:  33%|███▎      | 10/30 [03:32<05:59, 18.00s/pipeline]

Generation 1 - Current best internal CV score: 0.8630263759006273


Optimization Progress:  53%|█████▎    | 16/30 [05:41<03:02, 13.01s/pipeline]

Generation 2 - Current best internal CV score: 0.8630263759006273


Optimization Progress:  67%|██████▋   | 20/30 [06:33<01:59, 11.96s/pipeline]

Generation 3 - Current best internal CV score: 0.8630263759006273


Optimization Progress:  87%|████████▋ | 26/30 [08:12<00:51, 12.75s/pipeline]

Generation 4 - Current best internal CV score: 0.8630263759006273


                                                                            

Generation 5 - Current best internal CV score: 0.8630263759006273

Best pipeline: RandomForestClassifier(GaussianNB(input_matrix), bootstrap=True, criterion=gini, max_features=0.5, min_samples_leaf=16, min_samples_split=16, n_estimators=10)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.tree.DecisionTreeClassifier': {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21)}, 'sklearn.neighbors.KNeighborsClassifier': {'n_neighbors': range(1, 101), 'weig... 0.0001, 0.001, 0.01, 0.1], 'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0]}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=5, periodic_checkpoint_folder='../output/',
        population_size=5, random_state=42, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [18]:
# Score the best pipeline found on the held-out test split.
classifier.score(testing_features=X_test, testing_target=y_test)

0.8638290031324857