# Model Generation

In [2]:
import numpy
import pandas
import keras 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.constraints import maxnorm

In [None]:
# Mount Google Drive at /gdrive inside the Colab runtime.
from google.colab import drive
drive.mount('/gdrive')
# IPython shell escape (not plain Python): list the contents of the mount point.
!ls /gdrive

In [None]:
# Load the composite dataset; '?' placeholders are converted to NaN so that
# dropna() discards every row containing one (or any other missing value).
dataframe = pandas.read_csv("composite.csv").replace({'?': numpy.nan}).dropna()
dataset = dataframe.values

# The first 63 columns are the network inputs; column 63 is the value it is trained to predict.
X, Y = dataset[:, :63], dataset[:, 63]

# Keep 30% of the rows aside for evaluation. No random_state is passed, so the
# partition is not pinned to a fixed seed here.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.3)

# Fully connected network 63 -> 28 -> 20 -> 10 -> 1 with dropout after the first
# two hidden layers and a max-norm weight constraint on the second one. The
# single sigmoid output yields a score in (0, 1).
model = Sequential([
    Dense(28, input_dim=63, activation='relu', kernel_initializer="uniform"),
    Dropout(0.2),
    Dense(20, activation='relu', kernel_constraint=maxnorm(3), kernel_initializer="uniform"),
    Dropout(0.2),
    Dense(10, activation='relu', kernel_initializer="uniform"),
    Dense(1, activation='sigmoid', kernel_initializer="uniform"),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_Train, Y_Train, epochs=300, batch_size=10)

# evaluate() returns [loss, accuracy]; report the accuracy as a percentage.
scores = model.evaluate(X_Test, Y_Test)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

# Experimentation

In [53]:
# Default cut-off applied to model scores: values above it count as class 1.
THRESHOLD = 0.22


def categorize_data(dataset, threshold=None):
  """Binarize an array in place around a cut-off value.

  Elements less than or equal to the cut-off become 0 and elements greater
  than it become 1. Elements that satisfy neither comparison (NaN) are left
  untouched.

  Args:
    dataset: NumPy array of numeric values; it is modified in place.
    threshold: Cut-off to use. When None (the default) the module-level
      THRESHOLD is looked up at call time.

  Returns:
    The same array object that was passed in, now holding only 0/1 values
    (plus any NaN that was already present).
  """
  cutoff = THRESHOLD if threshold is None else threshold
  # Build both masks from the untouched values before writing anything.
  # Comparing again after the first assignment would re-test the freshly
  # written zeros and, for a cut-off below 0, flip them to 1.
  at_or_below = dataset <= cutoff
  above = dataset > cutoff
  dataset[at_or_below] = 0
  dataset[above] = 1
  return dataset
  
def predict_data(filepath, input_headers):
  """Print, for each header, how many of its rows the model scores above the threshold.

  The header-less CSV at ``filepath`` is read as interleaved groups: the row
  at file position ``p`` belongs to ``input_headers[p % len(input_headers)]``.
  Rows holding a '?' placeholder (or any other missing value) are dropped.
  Each group is passed to the module-level ``model``, the scores are
  binarized with ``categorize_data`` and the number of non-zero results is
  printed as ``"<header>: <count>"``.

  Args:
    filepath: Path of the CSV file to read; it must not contain a header row.
    input_headers: Labels, one per interleaved group, in the order in which
      the groups cycle through the file.

  Returns:
    None. The counts are written to stdout.
  """
  dataframe = pandas.read_csv(filepath, header=None)
  dataframe = dataframe.replace({'?': numpy.nan}).dropna()
  dataset = dataframe.values

  size = len(input_headers)
  if size == 0:
    # No groups requested, so there is nothing to report.
    return

  # dropna() keeps the original row labels, which for this read are the row
  # positions in the file. Grouping on them keeps every surviving row under
  # its own header; striding over the filtered array (dataset[i::size]) would
  # shift every row after a dropped one into the wrong group.
  groups = numpy.asarray(dataframe.index) % size

  for i, header in enumerate(input_headers):
    filtered = dataset[groups == i]
    if len(filtered) == 0:
      # Nothing to score for this header (file too short or all rows dropped).
      print(f"{header}: 0")
      continue
    results = categorize_data(model.predict(filtered, verbose = 0))
    print(f"{header}: {numpy.count_nonzero(results)}")

In [54]:
# For each age label, print how many of its rows in age.csv the model scores above THRESHOLD.
# The labels must be listed in the order in which the groups are interleaved in the file.
predict_data("/content/age.csv", ['age20', 'age25', 'age30', 'age35', 'age40', 'age45', 'age50', 'age55', 'age60', 'age65'])

age20: 16
age25: 26
age30: 35
age35: 43
age40: 48
age45: 53
age50: 55
age55: 51
age60: 49
age65: 46


In [55]:
# For each education label, print how many of its rows in education.csv the model scores above THRESHOLD.
# The labels must be listed in the order in which the groups are interleaved in the file.
predict_data("/content/education.csv", ['10th','11th','12th','1st-4th','5th-6th','7th-8th','9th','Assoc-acdm',
                                        'Assoc-voc','Bachelors','Doctorate','HS-grad','Masters','Preschool',
                                        'Prof-school','Some-college'])

10th: 0
11th: 11
12th: 24
1st-4th: 6
5th-6th: 18
7th-8th: 23
9th: 11
Assoc-acdm: 59
Assoc-voc: 69
Bachelors: 78
Doctorate: 173
HS-grad: 47
Masters: 85
Preschool: 0
Prof-school: 77
Some-college: 61


In [56]:
# For each gender label, print how many of its rows in gender.csv the model scores above THRESHOLD.
predict_data("/content/gender.csv", ['Female', 'Male'])

Female: 34
Male: 38


In [57]:
# For each occupation label, print how many of its rows in job.csv the model scores above THRESHOLD.
# The labels must be listed in the order in which the groups are interleaved in the file.
predict_data("/content/job.csv", ['Adm-clerical','Armed-Forces','Craft-repair','Exec-managerial','Farming-fishing',
                                  'Handlers-cleaners','Machine-op-inspct','Other-service','Priv-house-serv','Prof-specialty',
                                  'Protective-serv','Sales','Tech-support','Transport-moving'])

Adm-clerical: 42
Armed-Forces: 59
Craft-repair: 29
Exec-managerial: 65
Farming-fishing: 15
Handlers-cleaners: 13
Machine-op-inspct: 14
Other-service: 12
Priv-house-serv: 0
Prof-specialty: 103
Protective-serv: 30
Sales: 35
Tech-support: 46
Transport-moving: 38


In [58]:
# For each marital-status label, print how many of its rows in marital.csv the model scores above THRESHOLD.
# The labels must be listed in the order in which the groups are interleaved in the file.
predict_data("/content/marital.csv", ['Divorced','Married-AF-spouse','Married-civ-spouse',
                                      'Married-spouse-absent','Never-married','Separated','Widowed'])

Divorced: 32
Married-AF-spouse: 59
Married-civ-spouse: 68
Married-spouse-absent: 24
Never-married: 16
Separated: 9
Widowed: 33


In [59]:
# For each race label, print how many of its rows in race.csv the model scores above THRESHOLD.
predict_data("/content/race.csv", ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'])

Amer-Indian-Eskimo: 0
Asian-Pac-Islander: 67
Black: 38
Other: 72
White: 61


In [60]:
# For each relationship label, print how many of its rows in relationship.csv the model scores above THRESHOLD.
predict_data("/content/relationship.csv", ['Husband','Not-in-family','Other-relative','Own-child','Unmarried','Wife'])

Husband: 37
Not-in-family: 39
Other-relative: 7
Own-child: 14
Unmarried: 11
Wife: 119


In [61]:
# For each work-class label, print how many of its rows in workclass.csv the model scores above THRESHOLD.
# NOTE(review): 'Federal-govpay' looks like a typo for 'Federal-gov' — confirm. It is only used as
# the printed label, so it does not affect which rows are counted.
predict_data("/content/workclass.csv", ['Federal-govpay','Local-gov','Private','Self-emp-inc','Self-emp-not-inc',
                                        'State-gov','Without-pay'])

Federal-govpay: 50
Local-gov: 55
Private: 31
Self-emp-inc: 40
Self-emp-not-inc: 58
State-gov: 14
Without-pay: 12
