# American Census

In [37]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

## Global config

In [38]:
# Data config
path_to_csv = "adult.csv"

## Load data

In [39]:
pandas_X_train = pd.read_csv("X_train.csv")
pandas_y_train = pd.read_csv("y_train.csv")

pandas_X_test = pd.read_csv("X_test.csv")
print(pandas_X_train.shape)
print(pandas_X_train)

(16280, 11)
       age  education-num  hours-per-week     workclass      education  \
0       30            9.0              40       Private        HS-grad   
1       77           10.0               6           NaN   Some-college   
2       44           12.0              50       Private     Assoc-acdm   
3       53           10.0              50       Private   Some-college   
4       41           10.0              40   Federal-gov   Some-college   
...    ...            ...             ...           ...            ...   
16275   31           11.0              30       Private      Assoc-voc   
16276   37           13.0              40     State-gov      Bachelors   
16277   45           14.0              40     State-gov        Masters   
16278   32            9.0              40       Private        HS-grad   
16279   58            7.0              50       Private           11th   

            marital-status          occupation    relationship  \
0       Married-civ-spouse       

## Helpers

In [40]:
# TODO
X_train = pandas_X_train.to_numpy(copy=True)
y_train = pandas_y_train.to_numpy(copy=True)

X_test = pandas_X_test.to_numpy(copy=True)

## Data exploration

### Question 1

In [41]:
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print(np.asarray((unique_elements, counts_elements)))
print("From above we can derive that there are an unequal number of samples belonging to each class.")
print("Class distribution for salary <50.000$: ", counts_elements[0]/counts_elements.sum())
print("Class distribution for salary >50.000$: ", counts_elements[1]/counts_elements.sum())

[[    0     1]
 [12360  3920]]
From above we can derive that there are an unequal number of samples belonging to each class.
Class distribution for salary <50.000$:  0.7592137592137592
Class distribution for salary >50.000$:  0.24078624078624078


### Question 2

In [43]:
exteded_data = np.c_[X_train, y_train]

num_men = np.where((exteded_data[:, 9] == " Male"))[0].size
num_female = np.where((exteded_data[:, 9] == " Female"))[0].size

females_under_50 = np.where((exteded_data[:, 9] == " Female") & (exteded_data[:, 11] == 0))[0].size
females_over_50 = np.where((exteded_data[:, 9] == " Female") & (exteded_data[:, 11] == 1))[0].size

men_under_50 = np.where((exteded_data[:, 9] == " Male") & (exteded_data[:, 11] == 0))[0].size
men_over_50 = np.where((exteded_data[:, 9] == " Male") & (exteded_data[:, 11] == 1))[0].size

diff_over = men_over_50 - females_over_50
print("Number of men: ", num_men)
print("Number of women: ", num_female)
print()
print("Number of women with salary under $50k: ", females_under_50)
print("Number of women with salary over $50k: ", females_over_50)
print("Number of men with salary under $50k: ", men_under_50)
print("Number of men with salary over $50k: ", men_over_50)
print()
print("For over $50k salary a year, the pay gap between men and women seems rather big (",diff_over,").")
print("Based on training data: Men >$50k: ", round((men_over_50/num_men) * 100, 2) , "%, Women >$50k: ", round(females_over_50/num_female * 100, 2), "%.")
print("Scaled to account for the difference between the number of men and women: Men >$50k: ", round((men_over_50/num_men)/(num_men/num_female) * 100, 2) , "%, Women >$50k: ", round(females_over_50/num_female * 100, 2), "%.")
print("This means that the difference is quite insignificant, and thus this parameter isn't necessarily curcial.")

print()
print("Distribution of races:")
unique_races, counts_races = np.unique(X_train[:, 8], return_counts=True)
zipped = zip(unique_races, counts_races)
print(list(zipped))
print("Since there is a clearly unbalanced dataset of races, the trained model will be highly biased towards the white race! Thus, race is also not quite a good paramter. For example, the white race is found accross",
     round(counts_races[4] / X_train.shape[0] * 100, 2), "% of the training data!")

Number of men:  10883
Number of women:  5397

Number of women with salary under $50k:  4797
Number of women with salary over $50k:  600
Number of men with salary under $50k:  7563
Number of men with salary over $50k:  3320

For over $50k salary a year, the pay gap between men and women seems rather big ( 2720 ).
Based on training data: Men >$50k:  30.51 %, Women >$50k:  11.12 %.
Scaled to account for the difference between the number of men and women: Men >$50k:  15.13 %, Women >$50k:  11.12 %.
This means that the difference is quite insignificant, and thus this parameter isn't necessarily curcial.

Distribution of races:
[(' Amer-Indian-Eskimo', 146), (' Asian-Pac-Islander', 506), (' Black', 1537), (' Other', 138), (' White', 13953)]
Since there is a clearly unbalanced dataset of races, the trained model will be highly biased towards the white race! Thus, race is also not quite a good paramter. For example, the white race is found accross 85.71 % of the training data!


## Data preparations

### Question 1

In [108]:
# TODO

### Question 2

In [8]:
# TODO

### Question 3

In [9]:
# TODO

## Experiments

### Question 1

In [10]:
# TODO

### Question 2

In [11]:
# TODO

### Question 3

In [12]:
# TODO

### Question 4

In [13]:
#TODO

### Question 5

In [14]:
#TODO
prediction = np.array([-1] * len(X_test)) #TODO replace this with you own prediction
pd.DataFrame(prediction).to_csv("GROUP_classes_problem_census.txt", index=False, header=False)