# Model Generation

In [2]:
import numpy
import pandas
import keras 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.constraints import maxnorm

In [None]:
# Mount Google Drive inside the Colab runtime at /gdrive.
from google.colab import drive
drive.mount('/gdrive')
# IPython shell escape (not plain Python): list the mount point's contents.
!ls /gdrive

In [None]:
# Load the training table; pandas consumes the first CSV row as the header.
dataframe = pandas.read_csv("composite.csv")
# Cells holding a literal '?' are treated as missing and their rows are dropped.
dataframe = dataframe.replace({'?': numpy.nan}).dropna()
# A column that contained '?' is parsed as object (string) dtype and stays that
# way after dropna(), which Keras cannot turn into a tensor. Cast explicitly so
# the network always receives floats; a non-numeric cell fails here with a
# clear ValueError instead of deep inside model.fit().
dataset = dataframe.values.astype("float32")

# Columns 0..62 are the input features; column 63 is used as the 0/1 target.
X = dataset[:, :63]
Y = dataset[:, 63]
# 70/30 train/test split. No random_state is given, so the split differs per run.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.3)

# Fully connected 63 -> 28 -> 20 -> 10 -> 1 network with dropout after the first
# two hidden layers; the sigmoid output pairs with the binary cross-entropy loss.
model = Sequential([
    Dense(28, input_dim=63, activation='relu', kernel_initializer="uniform"),
    Dropout(0.2),
    Dense(20, activation='relu', kernel_constraint=maxnorm(3), kernel_initializer="uniform"),
    Dropout(0.2),
    Dense(10, activation='relu', kernel_initializer="uniform"),
    Dense(1, activation='sigmoid', kernel_initializer="uniform"),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_Train, Y_Train, epochs=300, batch_size=10)

# evaluate() returns [loss, accuracy]; report the accuracy as a percentage.
scores = model.evaluate(X_Test, Y_Test)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# Experimentation

## Data Generation

In [70]:
import random as r
import pandas as pd

# Number of random base profiles each generate_*_data() function draws; every
# profile is expanded into one row per swept value.
NUM_GROUPS = 500

def random_age_gen():
    """Return a one-element list holding a random integer age in [18, 65]."""
    age = r.randint(18, 65)
    return [age]

def random_nominal(size):
    """Return a one-hot list of length `size` with a randomly chosen hot index.

    Exactly one position holds 1 and every other position holds 0.
    """
    hot = r.randint(0, size - 1)
    return [1 if position == hot else 0 for position in range(size)]

def generate_age_data(path="/content/age.csv"):
    """Write the age-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, one row is written per age in
    20, 25, ..., 65, so each run of 10 consecutive rows differs only in
    column 0.

    Row layout (63 columns): age, the fixed values [0, 0, 40, 0], one-hot
    segments of width 7, 16 and 7, a fixed 0, then one-hot segments of width
    14, 6, 5 and 2. The meaning of the fixed values is not visible here.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        constant = (
            [0, 0, 40, 0]
            + random_nominal(7)
            + random_nominal(16)
            + random_nominal(7) + [0]
            + random_nominal(14)
            + random_nominal(6)
            + random_nominal(5)
            + random_nominal(2)
        )
        total_data.extend([age] + constant for age in range(20, 66, 5))
    # pandas asks for newline='' on file objects passed to to_csv, otherwise
    # some platforms translate line endings a second time.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

def generate_workclass_data(path="/content/workclass.csv"):
    """Write the workclass-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, 7 rows are written that are
    identical except for the first 7-wide one-hot segment (columns 5-11),
    which takes each of its 7 positions in turn.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        # Columns before the swept segment: random age + fixed values.
        prefix = random_age_gen() + [0, 0, 40, 0]
        # Columns after the swept segment, drawn once per profile.
        suffix = (
            random_nominal(16)
            + random_nominal(7) + [0]
            + random_nominal(14)
            + random_nominal(6)
            + random_nominal(5)
            + random_nominal(2)
        )
        # Separate index name: the original reused the outer loop's `i` here.
        for hot in range(7):
            onehot = [0] * 7
            onehot[hot] = 1
            total_data.append(prefix + onehot + suffix)
    # pandas asks for newline='' on file objects passed to to_csv.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

def generate_education_data(path="/content/education.csv"):
    """Write the education-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, 16 rows are written that are
    identical except for the 16-wide one-hot segment (columns 12-27), which
    takes each of its 16 positions in turn.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        # Columns before the swept segment.
        prefix = (
            random_age_gen()
            + [0, 0, 40, 0]
            + random_nominal(7)
        )
        # Columns after the swept segment, drawn once per profile.
        suffix = (
            random_nominal(7) + [0]
            + random_nominal(14)
            + random_nominal(6)
            + random_nominal(5)
            + random_nominal(2)
        )
        # Separate index name: the original reused the outer loop's `i` here.
        for hot in range(16):
            onehot = [0] * 16
            onehot[hot] = 1
            total_data.append(prefix + onehot + suffix)
    # pandas asks for newline='' on file objects passed to to_csv.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

def generate_marital_data(path="/content/marital.csv"):
    """Write the marital-status-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, 7 rows are written that are
    identical except for the second 7-wide one-hot segment (columns 28-34),
    which takes each of its 7 positions in turn.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        # Columns before the swept segment.
        prefix = (
            random_age_gen()
            + [0, 0, 40, 0]
            + random_nominal(7)
            + random_nominal(16)
        )
        # Columns after the swept segment, starting with the fixed 0 column.
        suffix = (
            [0]
            + random_nominal(14)
            + random_nominal(6)
            + random_nominal(5)
            + random_nominal(2)
        )
        # Separate index name: the original reused the outer loop's `i` here.
        for hot in range(7):
            onehot = [0] * 7
            onehot[hot] = 1
            total_data.append(prefix + onehot + suffix)
    # pandas asks for newline='' on file objects passed to to_csv.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

def generate_job_data(path="/content/job.csv"):
    """Write the job-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, 14 rows are written that are
    identical except for the 14-wide one-hot segment (columns 36-49), which
    takes each of its 14 positions in turn.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        # Columns before the swept segment, ending with the fixed 0 column.
        prefix = (
            random_age_gen()
            + [0, 0, 40, 0]
            + random_nominal(7)
            + random_nominal(16)
            + random_nominal(7) + [0]
        )
        # Columns after the swept segment, drawn once per profile.
        suffix = (
            random_nominal(6)
            + random_nominal(5)
            + random_nominal(2)
        )
        # Separate index name: the original reused the outer loop's `i` here.
        for hot in range(14):
            onehot = [0] * 14
            onehot[hot] = 1
            total_data.append(prefix + onehot + suffix)
    # pandas asks for newline='' on file objects passed to to_csv.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

def generate_relationship_data(path="/content/relationship.csv"):
    """Write the relationship-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, 6 rows are written that are
    identical except for the 6-wide one-hot segment (columns 50-55), which
    takes each of its 6 positions in turn.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        # Columns before the swept segment.
        prefix = (
            random_age_gen()
            + [0, 0, 40, 0]
            + random_nominal(7)
            + random_nominal(16)
            + random_nominal(7) + [0]
            + random_nominal(14)
        )
        # Columns after the swept segment, drawn once per profile.
        suffix = random_nominal(5) + random_nominal(2)
        # Separate index name: the original reused the outer loop's `i` here.
        for hot in range(6):
            onehot = [0] * 6
            onehot[hot] = 1
            total_data.append(prefix + onehot + suffix)
    # pandas asks for newline='' on file objects passed to to_csv.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

def generate_race_data(path="/content/race.csv"):
    """Write the race-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, 5 rows are written that are
    identical except for the 5-wide one-hot segment (columns 56-60), which
    takes each of its 5 positions in turn.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        # Columns before the swept segment.
        prefix = (
            random_age_gen()
            + [0, 0, 40, 0]
            + random_nominal(7)
            + random_nominal(16)
            + random_nominal(7) + [0]
            + random_nominal(14)
            + random_nominal(6)
        )
        # The only columns after the swept segment: the final 2-wide one-hot.
        suffix = random_nominal(2)
        # Separate index name: the original reused the outer loop's `i` here.
        for hot in range(5):
            onehot = [0] * 5
            onehot[hot] = 1
            total_data.append(prefix + onehot + suffix)
    # pandas asks for newline='' on file objects passed to to_csv.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

def generate_gender_data(path="/content/gender.csv"):
    """Write the gender-sweep probe set to `path` as a headerless CSV.

    For each of NUM_GROUPS random profiles, 2 rows are written that are
    identical except for the final 2-wide one-hot segment (columns 61-62),
    which takes each of its 2 positions in turn.

    Args:
        path: Destination CSV file; defaults to the original Colab location.
    """
    total_data = []
    for _ in range(NUM_GROUPS):
        # Every column except the swept segment, which is the row's tail.
        prefix = (
            random_age_gen()
            + [0, 0, 40, 0]
            + random_nominal(7)
            + random_nominal(16)
            + random_nominal(7) + [0]
            + random_nominal(14)
            + random_nominal(6)
            + random_nominal(5)
        )
        # Separate index name: the original reused the outer loop's `i` here.
        for hot in range(2):
            onehot = [0] * 2
            onehot[hot] = 1
            total_data.append(prefix + onehot)
    # pandas asks for newline='' on file objects passed to to_csv.
    with open(path, 'w', newline='') as f:
        pd.DataFrame(total_data).to_csv(f, index=False, header=False)

# Write every sweep file, one generator after another in a fixed order.
for _build_sweep in (
    generate_age_data,
    generate_workclass_data,
    generate_education_data,
    generate_marital_data,
    generate_job_data,
    generate_relationship_data,
    generate_race_data,
    generate_gender_data,
):
    _build_sweep()

## Predictions

In [71]:
# Default cutoff applied to model outputs: values above it count as class 1.
THRESHOLD = 0.22


def categorize_data(dataset, threshold=None):
    """Binarise an array of scores in place and return it.

    Entries less than or equal to the cutoff become 0 and entries greater
    than it become 1. NaN entries match neither comparison and are left
    untouched.

    Both masks are computed before anything is written. The earlier
    two-pass version re-tested values it had just zeroed, so with a
    negative cutoff those zeros were flipped to 1.

    Args:
        dataset: NumPy array of scores; it is modified in place.
        threshold: Optional cutoff; when None the module-level THRESHOLD
            is used.

    Returns:
        The same array object that was passed in, now holding 0/1 values.
    """
    cutoff = THRESHOLD if threshold is None else threshold
    at_or_below = dataset <= cutoff
    above = dataset > cutoff
    dataset[at_or_below] = 0
    dataset[above] = 1
    return dataset
  
def predict_data(filepath, input_headers):
    """Print, per label, how many probe rows the model scores above THRESHOLD.

    The headerless CSV at `filepath` is read, rows containing a literal '?'
    are dropped, and the rows are treated as interleaved groups: with k
    labels, rows k*j + i belong to label i. Each group is run through the
    global `model`, binarised with categorize_data, and the count of
    non-zero results is printed as "<label>: <count>".

    Args:
        filepath: Path of the CSV produced by one of the generate_*_data
            functions.
        input_headers: Labels, one per swept value, in row order.
    """
    frame = pandas.read_csv(filepath, header=None)
    rows = frame.replace({'?': numpy.nan}).dropna().values

    stride = len(input_headers)
    for offset, label in enumerate(input_headers):
        # Every stride-th row starting at `offset` shares the same swept value.
        group = rows[offset::stride]
        flagged = categorize_data(model.predict(group, verbose = 0))
        print(f"{label}: {numpy.count_nonzero(flagged)}")

In [72]:
# One label per swept age; matches range(20, 66, 5) in generate_age_data.
predict_data("/content/age.csv", ['age20', 'age25', 'age30', 'age35', 'age40', 'age45', 'age50', 'age55', 'age60', 'age65'])

age20: 17
age25: 33
age30: 41
age35: 47
age40: 56
age45: 60
age50: 60
age55: 55
age60: 49
age65: 45


In [73]:
# One label per position of the 16-wide one-hot segment swept by
# generate_education_data.
# NOTE(review): label order is assumed to match the training data's one-hot
# column order - confirm against composite.csv.
predict_data("/content/education.csv", ['10th','11th','12th','1st-4th','5th-6th','7th-8th','9th','Assoc-acdm',
                                        'Assoc-voc','Bachelors','Doctorate','HS-grad','Masters','Preschool',
                                        'Prof-school','Some-college'])

10th: 0
11th: 8
12th: 8
1st-4th: 5
5th-6th: 12
7th-8th: 18
9th: 7
Assoc-acdm: 43
Assoc-voc: 47
Bachelors: 62
Doctorate: 152
HS-grad: 32
Masters: 69
Preschool: 0
Prof-school: 68
Some-college: 53


In [74]:
# One label per position of the final 2-wide one-hot segment swept by
# generate_gender_data.
# NOTE(review): label order is assumed to match the training columns - confirm.
predict_data("/content/gender.csv", ['Female', 'Male'])

Female: 46
Male: 52


In [75]:
# One label per position of the 14-wide one-hot segment swept by
# generate_job_data.
# NOTE(review): label order is assumed to match the training columns - confirm.
predict_data("/content/job.csv", ['Adm-clerical','Armed-Forces','Craft-repair','Exec-managerial','Farming-fishing',
                                  'Handlers-cleaners','Machine-op-inspct','Other-service','Priv-house-serv','Prof-specialty',
                                  'Protective-serv','Sales','Tech-support','Transport-moving'])

Adm-clerical: 46
Armed-Forces: 62
Craft-repair: 36
Exec-managerial: 64
Farming-fishing: 19
Handlers-cleaners: 22
Machine-op-inspct: 19
Other-service: 16
Priv-house-serv: 0
Prof-specialty: 109
Protective-serv: 33
Sales: 41
Tech-support: 53
Transport-moving: 40


In [76]:
# One label per position of the second 7-wide one-hot segment swept by
# generate_marital_data.
# NOTE(review): label order is assumed to match the training columns - confirm.
predict_data("/content/marital.csv", ['Divorced','Married-AF-spouse','Married-civ-spouse',
                                      'Married-spouse-absent','Never-married','Separated','Widowed'])

Divorced: 31
Married-AF-spouse: 69
Married-civ-spouse: 82
Married-spouse-absent: 25
Never-married: 15
Separated: 10
Widowed: 34


In [77]:
# One label per position of the 5-wide one-hot segment swept by
# generate_race_data.
# NOTE(review): label order is assumed to match the training columns - confirm.
predict_data("/content/race.csv", ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'])

Amer-Indian-Eskimo: 0
Asian-Pac-Islander: 56
Black: 37
Other: 72
White: 59


In [78]:
# One label per position of the 6-wide one-hot segment swept by
# generate_relationship_data.
# NOTE(review): label order is assumed to match the training columns - confirm.
predict_data("/content/relationship.csv", ['Husband','Not-in-family','Other-relative','Own-child','Unmarried','Wife'])

Husband: 42
Not-in-family: 43
Other-relative: 5
Own-child: 10
Unmarried: 11
Wife: 126


In [79]:
# One label per position of the first 7-wide one-hot segment swept by
# generate_workclass_data.
# NOTE(review): 'Federal-govpay' looks like a typo for 'Federal-gov' - confirm
# against the column names in composite.csv before changing the label.
predict_data("/content/workclass.csv", ['Federal-govpay','Local-gov','Private','Self-emp-inc','Self-emp-not-inc',
                                        'State-gov','Without-pay'])

Federal-govpay: 60
Local-gov: 58
Private: 30
Self-emp-inc: 49
Self-emp-not-inc: 52
State-gov: 15
Without-pay: 11
