# Workclass Imputation 

### Step 2

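Rank the features by the absolute value of their Pearson correlation coefficient (PCC)
with workclass (a filter method), then use the top-ranked features to predict the missing workclass values.
Note: workclass is a nominal variable, so the PCC against its integer codes is only a rough ranking heuristic.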

In [108]:
import pandas as pd, numpy as np

In [109]:
# Read the training CSV, using its first column as the row index.
train_df = pd.read_csv(
    'train_workclass_predict.csv',
    index_col=0,
)

In [110]:
# Read the test CSV, using its first column as the row index.
test_df = pd.read_csv(
    'test_workclass_predict.csv',
    index_col=0,
)

In [111]:
# Every column except the last two is used as a predictor.
train_features = train_df.iloc[:, :-2]
# Second-to-last column: the workclass labels to be learned.
train_labels = train_df.iloc[:, -2]
# Integer-encode the labels. Reuse the labels' own index so the target stays
# label-aligned with train_features; a bare pd.Series(codes) would get a fresh
# 0..N-1 index that differs from train_df's index whenever the latter has gaps.
train_target = pd.Series(pd.factorize(train_labels)[0], index=train_labels.index)


In [112]:
# Preview only the first rows instead of rendering the whole frame.
train_df.head(10)

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,...,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,>50k,workclass_or,workclass_int
0,39,77516,2174,0,40,0,0,0,0,1,...,0,0,0,0,1,0,0,0,State-gov,0
1,50,83311,0,0,13,0,0,1,0,0,...,0,0,0,0,1,0,0,0,Self-emp-not-inc,1
2,38,215646,0,0,40,1,0,0,0,0,...,0,0,0,0,1,0,0,0,Private,2
3,53,234721,0,0,40,0,0,1,0,0,...,0,0,0,0,1,0,0,0,Private,2
4,28,338409,0,0,40,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Private,2
5,37,284582,0,0,40,0,0,1,0,0,...,0,0,0,0,1,0,0,0,Private,2
6,49,160187,0,0,16,0,0,0,1,0,...,0,0,0,0,0,0,0,0,Private,2
7,52,209642,0,0,45,0,0,1,0,0,...,0,0,0,0,1,0,0,1,Self-emp-not-inc,1
8,31,45781,14084,0,50,0,0,0,0,1,...,0,0,0,0,1,0,0,1,Private,2
9,42,159449,5178,0,40,0,0,1,0,0,...,0,0,0,0,1,0,0,1,Private,2


In [113]:
# Same positional split as for the training frame: the trailing two columns are
# set aside (presumably the workclass label columns -- confirm test_df shares
# train_df's column layout), everything before them is a predictor.
label_col_count = 2
test_features = test_df.iloc[:, :-label_col_count]
test_target = test_df.iloc[:, -label_col_count]

# PCC

In [114]:
def pearson(x,y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Parameters
    ----------
    x, y : array-like of numbers
        Paired observations of equal length. Lists, NumPy arrays and pandas
        Series are all accepted; values are read positionally, so a Series
        with a non-default index is handled correctly.

    Returns
    -------
    float
        The correlation in [-1, 1], or NaN when it is undefined (empty input
        or at least one input with zero variance).

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not contain the same number of values.
    """
    x_vals = np.asarray(x, dtype=float).ravel()
    y_vals = np.asarray(y, dtype=float).ravel()

    if x_vals.shape != y_vals.shape:
        raise ValueError(
            "pearson() needs inputs of equal length, got {} and {}".format(
                x_vals.size, y_vals.size))

    if x_vals.size == 0:
        return float('nan')

    # Centre first: the E[x^2] - E[x]^2 shortcut loses precision (and can even
    # go negative) when the mean is large relative to the spread.
    x_centered = x_vals - x_vals.mean()
    y_centered = y_vals - y_vals.mean()

    denom = np.sqrt(np.dot(x_centered, x_centered) * np.dot(y_centered, y_centered))
    if denom == 0:
        # A constant input has no defined correlation; report NaN explicitly
        # instead of triggering a 0/0 runtime warning.
        return float('nan')

    correlation = np.dot(x_centered, y_centered) / denom

    # Guard against rounding pushing the result marginally outside [-1, 1].
    return float(min(1.0, max(-1.0, correlation)))

In [115]:
def ppc_features(x,y):
    """Rank the columns of ``x`` by absolute Pearson correlation with ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; each column is scored independently.
    y : array-like
        Target values, one per row of ``x``.

    Returns
    -------
    pandas.DataFrame
        One row per feature, strongest correlation first, with columns
        ``Rank`` (1-based), ``Feature #`` (positional column index into ``x``)
        and ``R_score`` (|r|). Features whose correlation is undefined (NaN)
        keep NaN in ``R_score`` and are ranked last. Ties are broken by the
        higher column index first.
    """
    target = np.asarray(y)

    scored = []
    for col_idx in range(x.shape[1]):
        column = np.asarray(x.iloc[:, col_idx])
        score = float(np.abs(pearson(target, column)))  # absolute value of R
        # NaN is unordered, so sorting on it directly gives an arbitrary
        # ranking; use -inf as the sort key to push undefined scores last.
        sort_key = float('-inf') if np.isnan(score) else score
        scored.append((sort_key, col_idx, score))

    scored.sort(reverse=True)

    Filter_Ranks = pd.DataFrame(
        {
            'Rank': list(range(1, len(scored) + 1)),
            'Feature #': [col_idx for _, col_idx, _ in scored],
            'R_score': [score for _, _, score in scored],
        },
        columns=['Rank', 'Feature #', 'R_score'],
    )
    # Pin dtypes so an empty feature set still yields numeric columns.
    Filter_Ranks["Feature #"] = Filter_Ranks["Feature #"].astype(int)
    Filter_Ranks["Rank"] = Filter_Ranks["Rank"].astype(int)
    Filter_Ranks["R_score"] = Filter_Ranks["R_score"].astype(float)

    return Filter_Ranks

In [116]:
# Score every training feature against the integer-encoded workclass target.
trial = ppc_features(x=train_features, y=train_target)

In [117]:
# Positional column indices of the four highest-ranked features.
list(trial["Feature #"].iloc[:4])

[66, 4, 0, 2]

In [118]:
# Show only the top of the ranking rather than rendering every feature row.
trial.head(10)

Unnamed: 0,Rank,Feature #,R_score
0,1,66,0.094732
1,2,4,0.064048
2,3,0,0.063199
3,4,2,0.048097
4,5,9,0.046450
5,6,7,0.041645
6,7,12,0.037236
7,8,15,0.027589
8,9,13,0.024982
9,10,3,0.022893


# Family, Salary, Capital, Hours Per Week

In [119]:
# Keep the 20 highest-ranked features, in rank order, for the training frame.
top_feature_positions = list(trial["Feature #"].iloc[:20])
sub_train_feat = train_features.iloc[:, top_feature_positions]
sub_train_feat.head()

Unnamed: 0,>50k,hours_per_week,age,capital_gain,Never-married,Married-civ-spouse,Husband,Own-child,Not-in-family,capital_loss,Widowed,China,Wife,Other-relative,Cuba,United-States,Columbia,Married-spouse-absent,Black,Mexico
0,0,40,39,2174,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,0,13,50,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,40,38,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,0,40,53,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0
4,0,40,28,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0


In [120]:
# Select the same 20 top-ranked feature positions from the test frame.
top_feature_positions = list(trial["Feature #"].iloc[:20])
sub_test_feat = test_features.iloc[:, top_feature_positions]
sub_test_feat.head()

Unnamed: 0,>50k,hours_per_week,age,capital_gain,Never-married,Married-civ-spouse,Husband,Own-child,Not-in-family,capital_loss,Widowed,China,Wife,Other-relative,Cuba,United-States,Columbia,Married-spouse-absent,Black,Mexico
27,1,60,54,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
61,0,40,32,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
69,0,40,25,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
77,0,2,67,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
106,0,32,17,34095,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0


# Split

In [121]:
from sklearn.model_selection import train_test_split

In [122]:
# Hold out 30% of the labelled rows for validation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    sub_train_feat,
    train_target,
    test_size=0.3,
    random_state=42,
)

In [123]:
#y_train = np.array(y_train).reshape(-1)

In [124]:
# Sanity check: target and feature row counts of the training split must agree.
y_train.shape, X_train.shape

((21507,), (21507, 20))

In [125]:
# Sanity check: target and feature row counts of the held-out split must agree.
y_test.shape, X_test.shape

((9218,), (9218, 20))

# Naive Bayes

In [126]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score

In [127]:
# Gaussian naive Bayes baseline: fit on the training split, predict the held-out split.
NB = GaussianNB().fit(X_train, y_train)
y_pred = NB.predict(X_test)

In [128]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# score is unchanged, but the conventional order avoids silent errors if this
# is swapped for an asymmetric metric such as precision_score.
accuracy_score(y_test, y_pred)

0.22087220655239748

# Logistic Regression train-train

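Logistic regression looks like the way to go: on the held-out split it is far more accurate than Gaussian naive Bayes (about 74% vs. 22%), although 74% is essentially the share of the majority class (Private).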

In [None]:
from sklearn.linear_model import LogisticRegression 

In [105]:
# Logistic regression with default settings: fit on the training split,
# then predict the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [106]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# score is unchanged, but the conventional order avoids silent errors if this
# is swapped for an asymmetric metric such as precision_score.
accuracy_score(y_test, y_pred)

0.74159253634193967

In [107]:
# Inspect the predicted class codes for the held-out split.
pd.Series(data=y_pred)

0       2
1       2
2       1
3       2
4       2
5       2
6       2
7       2
8       2
9       2
10      2
11      2
12      2
13      2
14      2
15      2
16      2
17      2
18      2
19      2
20      2
21      2
22      2
23      2
24      2
25      2
26      2
27      2
28      2
29      2
       ..
9188    2
9189    2
9190    2
9191    2
9192    2
9193    2
9194    2
9195    2
9196    2
9197    2
9198    2
9199    2
9200    2
9201    2
9202    2
9203    2
9204    2
9205    2
9206    2
9207    2
9208    2
9209    2
9210    2
9211    2
9212    2
9213    2
9214    2
9215    2
9216    2
9217    2
dtype: int64

# Logistic Regression train-test

In [50]:
# Refit on every labelled row, then predict the rows in the test frame.
lr = LogisticRegression().fit(sub_train_feat, train_target)
y_pred = lr.predict(sub_test_feat)

In [51]:
# Predicted workclass codes for the rows of sub_test_feat.
y_pred

array([2, 2, 2, ..., 2, 2, 2])

# Conclusion

Logistic regression assigns Private to essentially every row with a missing workclass, so Private appears to be the best-fit imputation value.
Private is also the majority class, with 22,696 of the 30,725 labelled instances,
which amounts to 73.87% of the instances.

In [66]:
# Per-class counts of the known workclass labels, alongside the total row count.
train_labels.value_counts(), len(train_labels)

( Private             22696
  Self-emp-not-inc     2541
  Local-gov            2093
  State-gov            1298
  Self-emp-inc         1116
  Federal-gov           960
  Without-pay            14
  Never-worked            7
 Name: workclass_or, dtype: int64, 30725)

In [67]:
# Derive the Private share from the data instead of hard-coding counts copied
# from a printed output; labels are stripped in case they carry padding whitespace.
private_count = (train_labels.str.strip() == 'Private').sum()
('Private for {}% of workclass instances').format(private_count / len(train_labels) * 100)

'Private for 73.86818551668023% of workclass instances'