# Income Predictor Experiment - Tugas Besar 2A IF3170 AI

**Kelompok 23 - Saturnus**
- 13515001 (K-01) - Jonathan Christopher
- 13515008 (K-02) - Kanisius Kenneth Halim
- 13515052 (K-01) - Kevin Jonathan
- 13515064 (K-01) - Tasya
- 13515065 (K-02) - Felix Limanta

## Import data

In [1]:
import pandas
import numpy as np

def _read_rows(path):
    """Read a header-less CSV file and return its rows as a NumPy array."""
    frame = pandas.read_csv(path, header=None)
    return np.array(frame)


training_data = _read_rows('./data/CensusIncome.data.txt')
test_data = _read_rows('./data/CensusIncome.test.txt')

# Names of the 14 input features; the positions are used below to index feature columns.
feature_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
]

# Allowed values of every categorical feature. The position of a value inside
# its list is later used as that value's integer code.
discrete_feature_domains = {
    'workclass': [
        'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
        'Local-gov', 'State-gov', 'Without-pay', 'Never-worked',
    ],
    'education': [
        'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
        'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
        '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool',
    ],
    'marital-status': [
        'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
        'Widowed', 'Married-spouse-absent', 'Married-AF-spouse',
    ],
    'occupation': [
        'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
        'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
        'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing',
        'Transport-moving', 'Priv-house-serv', 'Protective-serv',
        'Armed-Forces',
    ],
    'relationship': [
        'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative',
        'Unmarried',
    ],
    'race': ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'],
    'sex': ['Female', 'Male'],
    'native-country': [
        'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada',
        'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece',
        'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy',
        'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland',
        'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti',
        'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
        'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru',
        'Hong', 'Holand-Netherlands',
    ],
}

# Column positions of the categorical features: [1, 3, 5, 6, 7, 8, 9, 13].
discrete_feature_indices = [
    position
    for position, name in enumerate(feature_names)
    if name in discrete_feature_domains
]

## Preprocess data

### Trim whitespace from data

In [2]:
def _strip_if_text(value):
    """Return `value` without surrounding whitespace when it is a string."""
    if isinstance(value, str):
        return value.strip()
    return value


# Rebuild both data sets as plain lists of rows with every string cell trimmed;
# non-string cells are passed through unchanged.
training_data = [list(map(_strip_if_text, row)) for row in training_data]
test_data = [list(map(_strip_if_text, row)) for row in test_data]

### Separate features and target labels

In [3]:
def _split_features_and_targets(rows):
    """Split rows into (features, targets); the last cell of each row is the target."""
    feature_rows = []
    labels = []
    for *attributes, label in rows:
        feature_rows.append(attributes)
        labels.append(label)
    return np.array(feature_rows), np.array(labels)


training_features, training_targets = _split_features_and_targets(training_data)

test_features, test_targets = _split_features_and_targets(test_data)

### Replace missing feature values with mode

In [4]:
from collections import Counter

def _column_modes(features):
    """Return the most common value of each column, ignoring the '?' marker.

    Raises IndexError if a column contains nothing but '?'.
    """
    return [
        Counter(value for value in column if value != '?').most_common(1)[0][0]
        for column in features.transpose()
    ]


def _fill_missing(features, fill_values):
    """Replace every '?' cell of `features` in place with its column's fill value."""
    for c, fill_value in enumerate(fill_values):
        missing = features[:, c] == '?'
        features[missing, c] = fill_value


# The fill values are learned from the training set only.
training_features_modes = _column_modes(training_features)
_fill_missing(training_features, training_features_modes)

# The test set is filled with the *training* modes. Computing separate modes
# from the test set would let test-set statistics shape the preprocessing and
# could fill the same gap differently in the two sets.
# The old name is kept as an alias in case other cells still refer to it.
test_features_modes = training_features_modes
_fill_missing(test_features, training_features_modes)

### Convert categorical feature values to integers

In [5]:
def _encode_categories_in_place(features):
    """Replace each categorical cell with the position of its value in the feature's domain.

    Raises ValueError (from list.index) when a cell holds a value that is not
    listed in the corresponding domain.
    """
    # Resolve the domain of each categorical column once instead of per cell.
    domains_by_column = {
        column: discrete_feature_domains[feature_names[column]]
        for column in discrete_feature_indices
    }
    for row in features:
        for column in discrete_feature_indices:
            row[column] = domains_by_column[column].index(row[column])


_encode_categories_in_place(training_features)

_encode_categories_in_place(test_features)

### Encode categorical features using one hot encoding

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Fit ONE encoder, on the training set, and apply it to both sets.
# Fitting a second encoder on the test set (as was done before) derives the
# output columns from whichever categories happen to occur in the test data,
# so the test matrix can end up with a different number of columns, or with
# columns that mean something different, than the matrix the model is trained on.
#
# NOTE(review): `categorical_features` is the legacy OneHotEncoder API
# (deprecated in scikit-learn 0.20, removed in 0.22); this cell needs an old
# scikit-learn or a port to ColumnTransformer.
oneHotEncoderTraining = OneHotEncoder(categorical_features=discrete_feature_indices)
oneHotEncoderTraining.fit(training_features)
training_features = oneHotEncoderTraining.transform(training_features).toarray().astype(int)

# The old name is kept as an alias so any other cell that refers to it still works.
oneHotEncoderTest = oneHotEncoderTraining
test_features = oneHotEncoderTest.transform(test_features).toarray().astype(int)

# Both matrices must share the same column layout for a model to be evaluated on the test set.
assert training_features.shape[1] == test_features.shape[1], \
    "training and test feature matrices have different widths after one-hot encoding"

### Scale continuous feature values

In [7]:
from sklearn import preprocessing

# The last six columns are treated as the continuous features (14 features minus
# 8 categorical ones); presumably the one-hot encoder stacks the non-categorical
# columns to the right of the encoded ones - TODO confirm against the encoder docs.
train_encoded, train_continuous = np.hsplit(training_features.astype(np.float64), [-6])

# Remember the training statistics so the test set can be put on the same scale.
# A zero standard deviation (constant column) is replaced by 1 to avoid dividing by zero.
continuous_mean = train_continuous.mean(axis=0)
continuous_std = train_continuous.std(axis=0)
continuous_std[continuous_std == 0.0] = 1.0

training_features = np.concatenate(
    [train_encoded, preprocessing.scale(train_continuous)], axis=1)

# Previously only the training set was standardised, leaving the test features
# on their raw scale. Apply the *training* mean/std to the test set so both
# sets are expressed in the same units.
test_encoded, test_continuous = np.hsplit(test_features.astype(np.float64), [-6])
test_features = np.concatenate(
    [test_encoded, (test_continuous - continuous_mean) / continuous_std], axis=1)

print(training_features[0])

[ 0.          0.          0.          0.          0.          1.          0.
  0.          1.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          1.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          1.          0.          0.
  0.          0.          0.          0.          0.          0.          1.
  0.          0.          1.          0.          0.          0.          0.
  0.          1.          1.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.

## Try different classifiers

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest with a fixed seed, scored with 10-fold cross-validation on the training set.
clf = RandomForestClassifier(random_state=123)
scores = cross_val_score(
    estimator=clf,
    X=training_features,
    y=training_targets,
    cv=10,
)
# Report the mean fold score and twice its standard deviation
# ("Kinerja rata-rata" is Indonesian for "average performance").
print("Kinerja rata-rata: %f (+/- %f)" % (scores.mean(), 2 * scores.std()))

Kinerja rata-rata: 0.847425 (+/- 0.009808)


### K-Nearest Neighbor

In [None]:
from sklearn import neighbors
# 5-nearest-neighbour classifier, scored with 10-fold cross-validation on the training set.
cv_knn = neighbors.KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(
    estimator=cv_knn,
    X=training_features,
    y=training_targets,
    cv=10,
)
# Report the mean fold score and twice its standard deviation
# ("Kinerja rata-rata" is Indonesian for "average performance").
print("Kinerja rata-rata: %f (+/- %f)" % (scores.mean(), 2 * scores.std()))