# Country Imputation 

### Step 2

Find what variable most heavily correlates (Pearson Correlation Coefficient) with native_country 
via filter method and predict for missing values

In [10]:
import pandas as pd, numpy as np

In [11]:
train_df = pd.read_csv('train_country_predict.csv', index_col = 0)

In [12]:
test_df = pd.read_csv('test_country_predict.csv', index_col = 0)

In [14]:
train_features = train_df.iloc[:,:-2]
train_labels = train_df.iloc[:,-1]
train_target = pd.Series(pd.factorize(train_labels)[0])


In [15]:
test_features = test_df.iloc[:,:-2]
test_target = test_df.iloc[:,-2]

# PCC

In [16]:
def pearson(x,y):
    
    sum_sq_x = 0
    sum_sq_y = 0 
    sum_coproduct = 0
    mean_x = 0
    mean_y = 0
    
    N = len(x)
    
    for i in range(N):
        
        sum_sq_x += x[i] * x[i]
        sum_sq_y += y[i] * y[i]
        sum_coproduct += x[i] * y[i]
        mean_x += x[i]
        mean_y += y[i]
        
    mean_x = mean_x / N
    mean_y = mean_y / N
    pop_sd_x = np.sqrt((sum_sq_x/N) - (mean_x * mean_x))
    pop_sd_y = np.sqrt((sum_sq_y / N) - (mean_y * mean_y))
    cov_x_y = (sum_coproduct / N) - (mean_x * mean_y)
    correlation = cov_x_y / (pop_sd_x * pop_sd_y)
    
    return correlation

In [17]:
def ppc_features(x,y):

    a = np.array(y)
    filt_feat = []

    for i in range(x.shape[1]):
        b = np.array(x.iloc[:,i])
        val = np.abs(pearson(a,b)) # absolute value of R
        filt_feat.append([val,i])

    filt_sort = sorted(filt_feat, reverse=True)
    
    rank = np.array(list(range(len(filt_sort))))
    rank = [x+1 for x in rank]
    feat_n = []
    r_score = []
    for i in range(len(filt_sort)):
        feat_n.append(filt_sort[i][1])
        r_score.append(filt_sort[i][0])
        
    Filter_Ranks = pd.DataFrame(data=np.column_stack((rank, feat_n, r_score)),columns=['Rank','Feature #','R_score'])
    Filter_Ranks["Feature #"] = Filter_Ranks["Feature #"].astype(int)
    Filter_Ranks["Rank"] = Filter_Ranks["Rank"].astype(int)
    
    return Filter_Ranks

In [18]:
trial = ppc_features(train_features, train_target)



In [19]:
trial

Unnamed: 0,Rank,Feature #,R_score
0,1,5,
1,2,20,
2,3,21,
3,4,41,0.364816
4,5,44,0.182171
5,6,43,0.105897
6,7,16,0.085343
7,8,36,0.082436
8,9,28,0.061449
9,10,26,0.049524


# Isolate race variables

In [20]:
sub_train_feat = train_features.iloc[:,[40,41,42,43,44]]
sub_train_feat.head()

Unnamed: 0,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,1,0,0
4,0,0,1,0,0
5,0,0,0,0,1


In [21]:
sub_test_feat = test_features.iloc[:,[40,41,42,43,44]]
sub_test_feat.head()

Unnamed: 0,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
14,0,1,0,0,0
38,0,0,0,0,1
51,0,0,0,0,1
61,0,0,0,0,1
93,0,1,0,0,0


# Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, precision_score

In [23]:
NB = GaussianNB()
NB.fit(sub_train_feat,train_target)
y_pred = NB.predict(sub_train_feat)

In [24]:
accuracy_score(y_pred, train_target)

0.004560260586319218

# Logistic Regression train-train

Seems to be the way to go

In [25]:
lr = LogisticRegression()
lr.fit(sub_train_feat,train_target)
y_pred = lr.predict(sub_train_feat)

In [26]:
accuracy_score(y_pred, train_target)

0.90951863916033293

In [27]:
precision_score(y_pred, train_target, average = 'micro')

0.90951863916033293

In [30]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

# Logistic Regression train-test

In [31]:
lr = LogisticRegression()
lr.fit(sub_train_feat,train_target)
y_pred = lr.predict(sub_test_feat)

In [32]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

# Conclusion

Seems like the United States is the best fit country for all missing values
It also seems to be the majority count with 25130 instances over 27630
which amounts to 91.85 % of the instances

In [40]:
(train_labels.value_counts()[0], len(train_labels))

(25130, 27630)

In [51]:
('United States accounts for {}% of native-country instances').format(np.round(25130/27360*100))

'United States accounts for 92.0% of native-country instances'