In [137]:
import csv
import sys
import random
import numpy as np

In [138]:
print(sys.version)

3.7.4 (v3.7.4:e09359112e, Jul  8 2019, 14:54:52) 
[Clang 6.0 (clang-600.0.57)]


In [139]:
male_data = np.genfromtxt('Data/MALE.csv', delimiter=',', skip_header=1)
female_data = np.genfromtxt('Data/FEMALE.csv', delimiter=',', skip_header=1)
mixed_data = np.genfromtxt('Data/MIXED.csv', delimiter=',', skip_header=1)

male_x = male_data[:,:-1]
male_y = male_data[:,-1:]
female_x = female_data[:,:-1]
female_y = female_data[:,-1:]
mixed_x = mixed_data[:,:-1]
mixed_y = mixed_data[:,-1:]

In [140]:
from sklearn.model_selection import train_test_split

male_x_train, male_x_test, male_y_train, male_y_test = \
train_test_split(male_x, male_y, test_size=0.10, random_state=random.randint(1,101))

male_x_dev = male_x_test[:100]
male_y_dev = male_y_test[:100]
male_x_test = male_x_test[100:]
male_y_test = male_y_test[100:]

female_x_train, female_x_test, female_y_train, female_y_test = \
train_test_split(female_x, female_y, test_size=0.10, random_state=random.randint(1,101))

female_x_dev = female_x_test[:100]
female_y_dev = female_y_test[:100]
female_x_test = female_x_test[100:]
female_y_test = female_y_test[100:]


mixed_x_train, mixed_x_test, mixed_y_train, mixed_y_test = \
train_test_split(mixed_x, mixed_y, test_size=0.10, random_state=random.randint(1,101))

mixed_x_dev = mixed_x_test[:100]
mixed_y_dev = mixed_y_test[:100]
mixed_x_test = mixed_x_test[100:]
mixed_y_test = mixed_y_test[100:]

In [141]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Ridge
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
neural_net = MLPClassifier(hidden_layer_sizes=(16,16,16), max_iter=250, activation = 'relu', learning_rate_init=.001)

In [142]:
@ignore_warnings(category=ConvergenceWarning)
def baseline(training_x, training_y, test_x):
    neural_net.fit(training_x, training_y)
    ridge.fit(training_x, training_y)
    rr_predictions = ridge.predict(test_x)
    nn_predictions = neural_net.predict(test_x)
    return rr_predictions, nn_predictions

def mse(rr_predictions, nn_predictions, test_y):
    mse_rr1 = mean_squared_error(test_y, rr_predictions)
    mse_nn1 = mean_squared_error(test_y, nn_predictions)
    print("Ridge regression mse: " + str(mse_rr1))
    print("Neural net mse: " + str(mse_nn1))

In [143]:
def get_subset(data):
    return data[:100]

In [144]:
# three possible splits for f-fold cross validation:
# { (male, female), (male, mixed), (female, mixed) }
def src_data_format(x1, x2, y1, y2):
    x = np.concatenate([x1, x2])
    y = np.concatenate([y1, y2]).ravel()
    return x, y

In [145]:
def tgt_data_format(x, y):
    return x[:100], y[:100].ravel()

In [146]:
def all_data_format(x1, x2, x3, y1, y2, y3):
    x3 = get_subset(x3)
    y3 = get_subset(y3)
    x = np.concatenate([x1, x2, x3])
    y = np.concatenate([y1, y2, y3]).ravel()
    return x, y

In [147]:
def weighted(x1, x2, x3, y1, y2, y3):
    source = np.concatenate([x1, x2])
    target = get_subset(x3)
    N = len(source)
    M = len(target)
    weight = int(N/M)
    x = np.concatenate([source, target])
    for i in range(1, weight):
        x = np.concatenate([x, target])
    return x

def update_weighted_y(x, y, y_data):
    while(len(y) < len(x)):
        y = np.concatenate([y, y_data[:100].ravel()])
    return y

In [148]:
def linint(srconly_pred, tgtonly_pred, target_dev_x, target_dev_y, test_dev_x):
    combined_models = np.column_stack((srconly_pred,tgtonly_pred))
    neural_net.fit(combined_models, target_dev_y.ravel())
    ridge.fit(combined_models, target_dev_y)
    
    rr_predictions, nn_predictions = baseline(target_dev_x, target_dev_y, test_dev_x)
    return rr_predictions, nn_predictions
    


In [149]:
# hyper param tuning for ridge regression
x, y = src_data_format(male_x_train, female_x_train, male_y_train, female_y_train)
params = {"alpha": [.0001, .001, .01, .1, 1, 10, 100, 400]}
grid = GridSearchCV(ridge, params)
grid.fit(x, y)

x, y = src_data_format(male_x_dev, female_x_dev, male_y_dev, female_y_dev)
ridge.alpha = grid.best_params_['alpha']



In [150]:
# hyper param tuning for the neural net
x, y = src_data_format(male_x_train, female_x_train, male_y_train, female_y_train)
# tune shuffle, learning_rate_init, hidden_layer_sizes, max_iter
params = {"learning_rate_init": [.0001, .001], "hidden_layer_sizes" : [(16,16,16,16,16),(16,16,16), (64,64,64)]}
#grid = GridSearchCV(neural_net, params)
#grid.fit(x, y)

x, y = src_data_format(male_x_dev, female_x_dev, male_y_dev, female_y_dev)
#print(grid.best_params_)

In [151]:
# mixed is target

x1, y1 = src_data_format(male_x_train, female_x_train, male_y_train, female_y_train)
rr_predictions_src, nn_predictions_src = baseline(x1, y1, mixed_x_dev)

x1, y1 = tgt_data_format(mixed_x_train, mixed_y_train)
rr_predictions_tgt, nn_predictions_tgt = baseline(x1, y1, mixed_x_dev)

print("Mixed target Ridge Regression LININT model:")
rr_predictions, nn_predictions = linint(rr_predictions_src, rr_predictions_tgt, mixed_x_dev, mixed_y_dev, mixed_x_test)
mse(rr_predictions, nn_predictions, mixed_y_test)

print("Mixed target Neural Net LININT model:")
rr_predictions, nn_predictions = linint(nn_predictions_src, nn_predictions_tgt, mixed_x_dev, mixed_y_dev, mixed_x_test)
mse(rr_predictions, nn_predictions, mixed_y_test)


Mixed target Ridge Regression LININT model:


  y = column_or_1d(y, warn=True)


Ridge regression mse: 148.25096155009936
Neural net mse: 187.99683042789223
Mixed target Neural Net LININT model:


  y = column_or_1d(y, warn=True)


Ridge regression mse: 148.25096155009936
Neural net mse: 206.42630744849444


In [152]:
x1, y1 = src_data_format(male_x_train, female_x_train, male_y_train, female_y_train)
x2, y2 = src_data_format(male_x_train, mixed_x_train, male_y_train, mixed_y_train)
x3, y3 = src_data_format(female_x_train, mixed_x_train, female_y_train, mixed_y_train)

In [153]:
print("Mixed target SRCONLY model:")
rr_predictions, nn_predictions = baseline(x1, y1, mixed_x_test)
mse(rr_predictions, nn_predictions, mixed_y_test)

print("Female target SRCONLY model:")
rr_predictions, nn_predictions = baseline(x2, y2, female_x_test)
mse(rr_predictions, nn_predictions, female_y_test)

print("Male target SRCONLY model:")
rr_predictions, nn_predictions = baseline(x3, y3, male_x_test)
mse(rr_predictions, nn_predictions, male_y_test)

Mixed target SRCONLY model:
Ridge regression mse: 147.59657530306265
Neural net mse: 152.06814580031696
Female target SRCONLY model:
Ridge regression mse: 158.66084038432143
Neural net mse: 150.30205278592376
Male target SRCONLY model:
Ridge regression mse: 166.49754333672334
Neural net mse: 155.18045112781954


In [155]:
x1, y1 = tgt_data_format(male_x_train, male_y_train)
x2, y2 = tgt_data_format(female_x_train, female_y_train)
x3, y3 = tgt_data_format(mixed_x_train, mixed_y_train)

In [156]:
print("Male target TGTONLY model:")
rr_predictions, nn_predictions = baseline(x1, y1, male_x_test)
mse(rr_predictions, nn_predictions, male_y_test)

print("Female target SRCONLY model:")
rr_predictions, nn_predictions = baseline(x2, y2, female_x_test)
mse(rr_predictions, nn_predictions, female_y_test)

print("Mixed target SRCONLY model:")
rr_predictions, nn_predictions = baseline(x3, y3, mixed_x_test)
mse(rr_predictions, nn_predictions, mixed_y_test)

Male target TGTONLY model:
Ridge regression mse: 165.91136715650268
Neural net mse: 234.80075187969925
Female target SRCONLY model:
Ridge regression mse: 163.9702525582729
Neural net mse: 238.6891495601173
Mixed target SRCONLY model:
Ridge regression mse: 156.38891458220849
Neural net mse: 190.49603803486528


In [157]:
x1, y1 = all_data_format(male_x_train, female_x_train, mixed_x_train, male_y_train, female_y_train, mixed_y_train)
x2, y2 = all_data_format(male_x_train, mixed_x_train, female_x_train, male_y_train, mixed_y_train, female_y_train)
x3, y3 = all_data_format(female_x_train, mixed_x_train, male_x_train, female_y_train, mixed_y_train, male_y_train)

In [158]:
print("Mixed target ALL model:")
rr_predictions, nn_predictions = baseline(x1, y1, mixed_x_test)
mse(rr_predictions, nn_predictions, mixed_y_test)

print("Female target ALL model:")
rr_predictions, nn_predictions = baseline(x2, y2, female_x_test)
mse(rr_predictions, nn_predictions, female_y_test)

print("Male target ALL model:")
rr_predictions, nn_predictions = baseline(x3, y3, male_x_test)
mse(rr_predictions, nn_predictions, male_y_test)

Mixed target ALL model:
Ridge regression mse: 147.61480679167093
Neural net mse: 132.3011093502377
Female target ALL model:
Ridge regression mse: 158.5408140352123
Neural net mse: 148.59530791788856
Male target ALL model:
Ridge regression mse: 166.4461244459143
Neural net mse: 156.9360902255639


In [159]:
# change the data groups to the all variables
x1 = weighted(male_x_train, female_x_train, mixed_x_train, male_y_train, female_y_train, mixed_y_train)
x2 = weighted(male_x_train, mixed_x_train, female_x_train, male_y_train, mixed_y_train, female_y_train)
x3 = weighted(female_x_train, mixed_x_train, male_x_train, female_y_train, mixed_y_train, male_y_train)

y1 = update_weighted_y(x1, y1, mixed_y_train)
y2 = update_weighted_y(x2, y2, female_y_train)
y3 = update_weighted_y(x3, y3, male_y_train)

In [160]:
print("Mixed target WEIGHTED model:")
rr_predictions, nn_predictions = baseline(x1, y1, mixed_x_test)
mse(rr_predictions, nn_predictions, mixed_y_test)

print("Female target WEIGHTED model:")
rr_predictions, nn_predictions = baseline(x2, y2, female_x_test)
mse(rr_predictions, nn_predictions, female_y_test)

print("Male target WEIGHTED model:")
rr_predictions, nn_predictions = baseline(x3, y3, male_x_test)
mse(rr_predictions, nn_predictions, male_y_test)

Mixed target WEIGHTED model:
Ridge regression mse: 150.33919195355296
Neural net mse: 219.973058637084
Female target WEIGHTED model:
Ridge regression mse: 158.8998908703008
Neural net mse: 222.10263929618768
Male target WEIGHTED model:
Ridge regression mse: 167.20882162841534
Neural net mse: 197.58270676691728


In [161]:
# number is 1 2 3 denoting which positition x lies in the <x, i, i, i> vector
# male would be <x,x,0,0>, female <x,0,x,0> and mixed <x,0,0,x> 
def feda(source_x, number):
    new_data = []
    for row in source_x:
        expanded_row = []
        for i in row:
            expanded_row.append(i)
            if number == 1:
                expanded_row.append(i)
                expanded_row.append(0)
                expanded_row.append(0)
            elif number == 2:
                expanded_row.append(0)
                expanded_row.append(i)
                expanded_row.append(0)
            elif number == 3:
                expanded_row.append(0)
                expanded_row.append(0)
                expanded_row.append(i)
        new_data.append(expanded_row)
    data = np.array(new_data)
    return data
            

In [162]:
x1, y1 = tgt_data_format(male_x_train, male_y_train)
x2, y2 = tgt_data_format(female_x_train, female_y_train)
x3, y3 = tgt_data_format(mixed_x_train, mixed_y_train)

print("Male target FEDA TGTONLY model:")
rr_predictions, nn_predictions = baseline(feda(x1,1), y1, feda(male_x_test, 1))
mse(rr_predictions, nn_predictions, male_y_test)

print("Female target FEDA TGTONLY model:")
rr_predictions, nn_predictions =  baseline(feda(x2, 2), y2, feda(female_x_test, 2))
mse(rr_predictions, nn_predictions, female_y_test)

print("Mixed target FEDA TGTONLY model:")
rr_predictions, nn_predictions = baseline(feda(x3, 3), y3, feda(mixed_x_test, 3))
mse(rr_predictions, nn_predictions, mixed_y_test)

Male target FEDA TGTONLY model:
Ridge regression mse: 165.47219867878778
Neural net mse: 218.50375939849624
Female target FEDA TGTONLY model:
Ridge regression mse: 164.977530021214
Neural net mse: 252.6656891495601
Mixed target FEDA TGTONLY model:
Ridge regression mse: 155.90804687027676
Neural net mse: 174.84310618066561


In [163]:
feda_male = feda(male_x_train, 1)
feda_female = feda(female_x_train, 2)
feda_mixed = feda(mixed_x_train, 3)

print("Mixed target FEDA SRCONLY:")
x1, y1 = src_data_format(feda_male, feda_female, male_y_train, female_y_train)
x1_test = feda(mixed_x_test, 3)
rr_predictions, nn_predictions = baseline(x1, y1, x1_test)
mse(rr_predictions, nn_predictions, mixed_y_test)

print("Female target FEDA SRCONLY:")
x2, y2 = src_data_format(feda_male, feda_mixed, male_y_train, mixed_y_train)
x2_test = feda(female_x_test, 2)
rr_predictions, nn_predictions = baseline(x2, y2, x2_test)
mse(rr_predictions, nn_predictions, female_y_test)

print("Male target FEDA SRCONLY:")
x3, y3 = src_data_format(feda_female, feda_mixed, female_y_train, mixed_y_train)
x3_test = feda(male_x_test, 1)
rr_predictions, nn_predictions = baseline(x1, y1, x3_test)
mse(rr_predictions, nn_predictions, male_y_test)

Mixed target FEDA SRCONLY:
Ridge regression mse: 152.1812932030653
Neural net mse: 340.56576862123615
Female target FEDA SRCONLY:
Ridge regression mse: 200.20870192579406
Neural net mse: 186.00879765395894
Male target FEDA SRCONLY:
Ridge regression mse: 161.89386231323084
Neural net mse: 154.01127819548873


In [164]:
def find_similarity(source_x, source_y, target_x, threshold):
    x = []
    y = []
    for i in range(0, len(source_x)):
        src_row = source_x[i]
        similarity = 0
        for tgt_row in target_x[:100]:
            similarity += sum((src_row - tgt_row)**2)
        similarity /= 100
        if similarity < threshold:
            x.append(src_row)
            y.append(source_y[i])
    print(str(len(y)) + " samples were found to be similar to the target data.")
    return x, y

x_new1, y_new1 = find_similarity(male_x_train, male_y_train, mixed_x_train, 400)
x_new2, y_new2 = find_similarity(female_x_train, female_y_train, mixed_x_train,  400)

x1 = np.concatenate([mixed_x_train[:100], x_new1, x_new2])
y1 = np.concatenate([mixed_y_train[:100], y_new1, y_new2])

print("TGT ONLY:")
elm_poly.train(np.column_stack([mixed_y_train[:100], mixed_x_train[:100]]))
print(elm_poly.test(np.column_stack((mixed_y_test,mixed_x_test))).get("mse"))

print("TGT with threshold source:")
elm_poly.train(np.column_stack((y1,x1)))
print(elm_poly.test(np.column_stack((mixed_y_test,mixed_x_test))).get("mse"))
      
x1 = np.concatenate([mixed_x_train[:100], male_x_train, female_x_train])
y1 = np.concatenate([mixed_y_train[:100], male_y_train, female_y_train])
print("ALL:")
elm_poly.train(np.column_stack((y1,x1)))
print(elm_poly.test(np.column_stack((mixed_y_test,mixed_x_test))).get("mse"))

1605 samples were found to be similar to the target data.
1598 samples were found to be similar to the target data.
TGT ONLY:
146.62719262808912
TGT with threshold source:
114.57945216538148
ALL:
114.83594574333435


In [61]:
from elm import ELM
from sklearn.model_selection import ShuffleSplit, KFold, cross_val_score

x1, y1 = src_data_format(male_x_train, female_x_train, male_y_train, female_y_train)
x2, y2 = src_data_format(male_x_train, mixed_x_train, male_y_train, mixed_y_train)
x3, y3 = src_data_format(female_x_train, mixed_x_train, female_y_train, mixed_y_train)
hid_nums = [10, 50, 100, 250, 400]
'''
for number in hid_nums:
    print(number, end=' ')
    e = ELM(number)
    ave = 0
    for i in range(10):
        cv = KFold(n_splits=5, shuffle=True)
        scores = cross_val_score(e, x2.astype(int), y2.astype(int),cv=cv, scoring='accuracy', n_jobs=-1)
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.3f " % (ave))
'''
elm = ELM(hid_num=250).fit(x1.astype(int), y1.astype(int))
preds = elm.predict(mixed_x_test.astype(int))
error = mean_squared_error(mixed_y_test.astype(int), preds)
print("ELM mse: " + str(error))

#print("ELM Accuracy %0.3f " % elm.score(mixed_x_test.astype(int), mixed_y_test.astype(int)))

ELM mse: 208.42472266244056


In [165]:
import elm.elmk
# first argument is a kernel function 
# second iscoefficient C of regularization,
# third is a list of arguments for the kernel function.
# safe values -> params = ["poly", 0.9993040203345642, [2.470566656543985, 2.4474257958252617]]
params = ["poly", 1.0722153359905189, [3.06744136499759, 2.3780118881296826]]
elm_poly = elm.ELMKernel(params)
params = ["linear", 250, []]
elm_linear = elm.ELMKernel(params)
params = ["rbf", 5.092921315186362, [-14.097889701989264]]
elm_rbf = elm.ELMKernel(params)

In [None]:
total_data = np.concatenate([np.column_stack((male_y_test,male_x_test)),\
                             np.column_stack((female_y_test,female_x_test)),\
                             np.column_stack((mixed_y_test,mixed_x_test))])
elm_poly.search_param(total_data, kf = ["rbf"])
elm_poly.search_param(total_data, kf = ["linear"])
elm_poly.search_param(total_data, kf = ["poly"])

elmk
##### Start search #####




In [129]:
x1, y1 = src_data_format(male_x_train, female_x_train, male_y_train, female_y_train)
x2, y2 = src_data_format(male_x_train, mixed_x_train, male_y_train, mixed_y_train)
x3, y3 = src_data_format(female_x_train, mixed_x_train, female_y_train, mixed_y_train)
elm_rbf.train(np.column_stack((y2,x2)))
print(elm_rbf.test(np.column_stack((female_y_test,female_x_test))).get("mse"))

164.17033775302636
