# Occupation Imputation 

Find what variable most heavily correlates (Pearson Correlation Coefficient) with occupation
via filter method and predict for missing values

In [153]:
import pandas as pd, numpy as np

In [154]:
train_df = pd.read_csv('train_test_occupation_imp.csv', index_col = 0)

In [155]:
test_df = pd.read_csv('test_test_occupation_imp.csv', index_col = 0)

In [156]:
len(test_df.columns) == len(train_df.columns)

True

In [157]:
train_features = train_df.drop('occupation', axis = 1)
train_labels = train_df.occupation
train_target = pd.Series(pd.factorize(train_labels)[0])


In [158]:
test_features = test_df.drop('occupation', axis = 1)
test_labels = test_df.occupation
test_target = pd.Series(pd.factorize(test_labels)[0])

# PCC

In [159]:
def pearson(x,y):
    
    sum_sq_x = 0
    sum_sq_y = 0 
    sum_coproduct = 0
    mean_x = 0
    mean_y = 0
    
    N = len(x)
    
    for i in range(N):
        
        sum_sq_x += x[i] * x[i]
        sum_sq_y += y[i] * y[i]
        sum_coproduct += x[i] * y[i]
        mean_x += x[i]
        mean_y += y[i]
        
    mean_x = mean_x / N
    mean_y = mean_y / N
    pop_sd_x = np.sqrt((sum_sq_x/N) - (mean_x * mean_x))
    pop_sd_y = np.sqrt((sum_sq_y / N) - (mean_y * mean_y))
    cov_x_y = (sum_coproduct / N) - (mean_x * mean_y)
    correlation = cov_x_y / (pop_sd_x * pop_sd_y)
    
    return correlation

In [160]:
def ppc_features(x,y):

    a = np.array(y)
    filt_feat = []

    for i in range(x.shape[1]):
        b = np.array(x.iloc[:,i])
        val = np.abs(pearson(a,b)) # absolute value of R
        filt_feat.append([val,i])

    filt_sort = sorted(filt_feat, reverse=True)
    
    rank = np.array(list(range(len(filt_sort))))
    rank = [x+1 for x in rank]
    feat_n = []
    r_score = []
    for i in range(len(filt_sort)):
        feat_n.append(filt_sort[i][1])
        r_score.append(filt_sort[i][0])
        
    Filter_Ranks = pd.DataFrame(data=np.column_stack((rank, feat_n, r_score)),columns=['Rank','Feature #','R_score'])
    Filter_Ranks["Feature #"] = Filter_Ranks["Feature #"].astype(int)
    Filter_Ranks["Rank"] = Filter_Ranks["Rank"].astype(int)
    
    return Filter_Ranks

In [161]:
trial = ppc_features(train_features, train_target)



In [162]:
list(trial.iloc[:4,1])

[24, 23, 22, 20]

In [163]:
trial

Unnamed: 0,Rank,Feature #,R_score
0,1,24,0.031987
1,2,23,0.031987
2,3,22,0.029807
3,4,20,0.026621
4,5,15,0.024109
5,6,16,0.024056
6,7,12,0.023425
7,8,4,0.021827
8,9,13,0.020067
9,10,27,0.017893


# Family, Salary, Capital, Hours Per Week

In [164]:
sub_train_feat = train_features.iloc[:,list(trial.iloc[:20,1])]
sub_train_feat.head()

Unnamed: 0,Male,Female,White,Black,Own-child,Unmarried,Husband,hours_per_week,Not-in-family,China,fnlwgt,Wife,Married-civ-spouse,Widowed,Canada,Divorced,capital_loss,Asian-Pac-Islander,El-Salvador,Dominican-Republic
0,1,0,0,1,1,0,0,40,0,0,226802,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,1,50,0,0,89814,0,1,0,0,0,0,0,0,0
2,1,0,1,0,0,0,1,40,0,0,336951,0,1,0,0,0,0,0,0,0
3,1,0,0,1,0,0,1,40,0,0,160323,0,1,0,0,0,0,0,0,0
5,1,0,1,0,0,0,0,30,1,0,198693,0,0,0,0,0,0,0,0,0


In [165]:
sub_test_feat = test_features.iloc[:,list(trial.iloc[:20,1])]
sub_test_feat.head()

Unnamed: 0,Male,Female,White,Black,Own-child,Unmarried,Husband,hours_per_week,Not-in-family,China,fnlwgt,Wife,Married-civ-spouse,Widowed,Canada,Divorced,capital_loss,Asian-Pac-Islander,El-Salvador,Dominican-Republic
4,0,1,1,0,1,0,0,30,0,0,103497,0,0,0,0,0,0,0,0,0
6,1,0,0,1,0,1,0,40,0,0,227026,0,0,0,0,0,0,0,0,0
13,1,0,1,0,0,0,1,35,0,0,299831,0,1,0,0,0,0,0,0,0
22,0,1,1,0,0,0,0,6,1,0,132015,0,0,0,0,1,0,0,0,0
35,1,0,1,0,0,0,1,40,0,0,191846,0,1,0,0,0,0,0,0,0


# Split

In [166]:
from sklearn.model_selection import train_test_split

In [167]:
X_train, X_test, y_train, y_test = train_test_split(sub_train_feat, train_target, random_state=42, test_size = 0.3)

In [168]:
(y_train.shape, X_train.shape)

((10720,), (10720, 20))

In [169]:
(y_test.shape, X_test.shape)

((4595,), (4595, 20))

# KNN

In [170]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score

In [171]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [172]:
accuracy_score(y_pred, y_test)

0.11099020674646355

# Logistic Regression train-train

Seems to be the way to go

In [173]:
from sklearn.linear_model import LogisticRegression 

In [174]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [175]:
accuracy_score(y_pred, y_test)

0.13362350380848748

# Random Forest

In [176]:
from sklearn.ensemble import RandomForestClassifier

In [177]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [178]:
accuracy_score(y_pred, y_test)

0.1616974972796518

# Ensemble with Naive Bayes, Logistic Regression and Random Forest

In [179]:
from scipy.stats import mode

In [180]:
def ensemble_nblrrf(X_train, X_test, y_train):
    
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_pred = np.array(lr.predict(X_test))
 
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    knn_pred = np.array(knn.predict(X_test))
    
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    rf_pred = np.array(rf.predict(X_test))
    
    democratic_y = []
    
    for i in range(len(lr_pred)):
        vote = mode([lr_pred[i], knn_pred[i], rf_pred[i]])[0][0]
        democratic_y.append(vote)
        
    return pd.Series(democratic_y)

In [181]:
dem_y = ensemble_nblrrf(X_train, X_test, y_train)

In [182]:
accuracy_score(dem_y, y_test)

0.13340587595212186

# Conclusion

Data much more uniformly distributed

In [183]:
(train_labels.value_counts(), len(train_labels))

( Prof-specialty       2032
  Exec-managerial      2020
  Craft-repair         2013
  Sales                1854
  Adm-clerical         1841
  Other-service        1628
  Machine-op-inspct    1020
  Transport-moving      758
  Handlers-cleaners     702
  Tech-support          518
  Farming-fishing       496
  Protective-serv       334
  Priv-house-serv        93
  Armed-Forces            6
 Name: occupation, dtype: int64, 15315)

In [184]:
print(('Prof-specialty for {}% of workclass instances').format(np.round(2032/15315*100)))
print(('Exec-managerial for {}% of workclass instances').format(np.round(2020/15315*100)))
print(('Craft-repair for {}% of workclass instances').format(np.round(2013/15315*100)))

Prof-specialty for 13.0% of workclass instances
Exec-managerial for 13.0% of workclass instances
Craft-repair for 13.0% of workclass instances


In [185]:
%matplotlib inline
import matplotlib.pyplot as plt

#plt.hist(test_df.iloc[:,-1], edgecolor = 'black')

In [186]:
pd.factorize(train_labels)

(array([0, 1, 2, ..., 4, 6, 7]),
 Index([' Machine-op-inspct', ' Farming-fishing', ' Protective-serv',
        ' Other-service', ' Prof-specialty', ' Craft-repair', ' Adm-clerical',
        ' Exec-managerial', ' Tech-support', ' Sales', ' Priv-house-serv',
        ' Transport-moving', ' Handlers-cleaners', ' Armed-Forces'],
       dtype='object'))

# Ensemble train-test

In [187]:
dem_y = ensemble_nblrrf(sub_train_feat, sub_test_feat, train_target)

In [188]:
occupation_map = {0:' Machine-op-inspct',1: ' Farming-fishing', 2: ' Protective-serv', 3:' Other-service',
                  4: ' Prof-specialty', 5: ' Craft-repair', 6:' Adm-clerical', 7: ' Exec-managerial',
                  8: ' Tech-support', 9: ' Sales', 10: ' Priv-house-serv', 11: ' Transport-moving',
                  12:' Handlers-cleaners', 13: ' Armed-Forces'}

In [189]:
print(sorted(dem_y.unique()))

[0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12]


# Insert np.zero

In [190]:
test_features.insert(loc = 75, column='occupation', value = pd.Series(np.zeros(len(train_labels)), dtype = int))

In [191]:
test_features.iloc[:, 75] = pd.Series(np.zeros(len(train_labels)), dtype = int)

In [192]:
index_ms = test_features.iloc[:, 75].index.values

In [193]:
test_features.iloc[:, 75][index_ms] = dem_y

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [194]:
test_features.occupation = test_features.occupation.replace(occupation_map)

In [195]:
len(test_features.columns) == len(train_df.columns)

True

In [208]:
train_features.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,...,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,>50k,occupation
0,25,226802,0,0,40,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,Machine-op-inspct
1,38,89814,0,0,50,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,Farming-fishing
2,28,336951,0,0,40,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,Protective-serv
3,44,160323,7688,0,40,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,Machine-op-inspct
5,34,198693,0,0,30,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,Other-service


# Rest of DataFrame

In [207]:
train_features = train_df.drop('occupation', axis = 1)
train_labels = train_df.occupation
train_target = pd.Series(pd.factorize(train_labels)[0])
train_features.insert(loc = 75, column='occupation', value = train_labels)

In [209]:
len(train_features.columns)

76

In [210]:
len(train_features.columns) == len(test_features.columns)

True

# Reorder columns

In [211]:
train_target = train_features['>50k']
train_features = train_features.drop('>50k', axis = 1)
test_target = test_features['>50k']
test_features = test_features.drop('>50k', axis = 1)

In [212]:
test_features.insert(loc = 75, column='>50k', value = test_target)
train_features.insert(loc = 75, column='>50k', value = train_target)

# Append two datasets

In [213]:
imputed_df_unsorted = test_features.append(train_features)

In [214]:
imputed_df = imputed_df_unsorted.sort_index()

In [215]:
imputed_df.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,...,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,occupation,>50k
0,25,226802,0,0,40,0,0,0,0,1,...,0,0,0,1,0,0,0,0,Machine-op-inspct,0
1,38,89814,0,0,50,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Farming-fishing,0
2,28,336951,0,0,40,0,0,1,0,0,...,0,1,0,0,0,0,0,0,Protective-serv,1
3,44,160323,7688,0,40,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Machine-op-inspct,1
4,18,103497,0,0,30,0,0,0,0,1,...,0,0,0,1,0,0,0,0,Other-service,0


In [216]:
imputed_df.iloc[:,-2].to_csv('test_occupation_imputed.csv')

# Dummify and Reinsert

In [229]:
occupation_str = imputed_df.iloc[:,-1]
occupation_str.unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv', ' Machine-op-inspect'], dtype=object)

In [230]:
occupation = pd.get_dummies(occupation_str)

In [231]:
occupation.head()

Unnamed: 0,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Machine-op-inspect,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [232]:
imputed_df = imputed_df.iloc[:,:-2]
target = imputed_df.iloc[:,-2]

In [233]:
imputed_df = imputed_df.merge(occupation, left_index=True, right_index = True)

In [235]:
imputed_df.insert(loc = 89, column='>50k', value = target)

In [236]:
imputed_df.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,...,Machine-op-inspct,Machine-op-inspect,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,>50k
0,39,77516,2174,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,50,83311,0,0,13,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38,215646,0,0,40,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,53,234721,0,0,40,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,338409,0,0,40,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
