Update of scikit libraries #60

Merged: 10 commits, Jun 7, 2017
28 changes: 27 additions & 1 deletion config.yaml
@@ -809,6 +809,15 @@ methods:
# Scikit-Learn: machine learning in Python
library: scikit
methods:
ICA:
run: ['metric']
iteration: 3
script: methods/scikit/ica.py
format: [csv, txt]
datasets:
- files: ['datasets/iris.csv', 'datasets/wine.csv',
'datasets/cities.csv', 'datasets/diabetes_X.csv']

PCA:
run: ['metric']
iteration: 3
@@ -1173,7 +1182,6 @@ methods:
'datasets/vehicle.csv', 'datasets/USCensus1990.csv',
'datasets/optdigits.csv', 'datasets/isolet.csv',
'datasets/TomsHardware.csv', 'datasets/covtype.csv']
options: '-s 42'
LinearRegression:
run: ['metric']
iteration: 3
@@ -1214,6 +1222,24 @@ methods:
- files: [ ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'] ]
options: '-t 50.0'
LogisticRegression:
run: ['metric']
iteration: 3
script: methods/scikit/logistic_regression.py
format: [csv, txt]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
---
# mlpy is a Python module for Machine Learning built on top of NumPy/SciPy
# and the GNU Scientific Libraries.
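For reference, the new ICA entry can be read back out of config.yaml the same way as any other method block. A minimal PyYAML sketch (illustrative only; the benchmark suite's own config loader is not part of this diff):

# Illustrative sketch: read the scikit ICA entry added above with PyYAML.
# This is not the benchmark suite's own parser.
import yaml

with open("config.yaml") as f:
    # config.yaml holds multiple YAML documents separated by '---'.
    for doc in yaml.safe_load_all(f):
        if doc and doc.get("library") == "scikit":
            ica = doc["methods"]["ICA"]
            print(ica["script"])                # methods/scikit/ica.py
            print(ica["datasets"][0]["files"])  # ['datasets/iris.csv', ...]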
36 changes: 27 additions & 9 deletions methods/scikit/LSHForest.py
@@ -45,8 +45,8 @@ def __init__(self, dataset, timeout=0, verbose=True):
self.dataset = dataset
self.timeout = timeout
self.model = None
self.n = 10
self.k = 5
self.n_estimators = 10
self.n_neighbors = 5

'''
Build the model for the Approximate Nearest Neighbors.
@@ -57,8 +57,12 @@ def __init__(self, dataset, timeout=0, verbose=True):
'''
def BuildModel(self, data, labels):
# Create and train the classifier.
lshf = LSHForest(n_estimators = self.n,
n_neighbors = self.k)
lshf = LSHForest(n_estimators = self.n_estimators,
min_hash_match = self.min_hash_match,
n_candidates = self.n_candidates,
radius_cutoff_ratio = self.radius_cutoff_ratio,
radius = self.radius,
n_neighbors = self.n_neighbors)
lshf.fit(data)
return lshf

@@ -76,16 +80,30 @@ def RunAnnScikit(q):
Log.Info("Loading dataset", self.verbose)
trainData, labels = SplitTrainData(self.dataset)
testData = LoadDataset(self.dataset[1])
n = re.search("-n (\d+)", options) #Number of Estimators.
k = re.search("-k (\d+)", options) #Number of Neighbors.
self.n = 10 if not n else int(n.group(1))
self.k = 5 if not k else int(k.group(1))
#Number of trees in the LSH Forest.
n_estimators = re.search("-n (\d+)", options)
#Number of neighbors to be returned from the query function.
n_neighbors = re.search("-k (\d+)", options)
#Lowest hash length to be searched when candidate selection is performed.
min_hash_match = re.search("-H (\d+)", options)
#Minimum number of candidates evaluated per estimator.
n_candidates = re.search("--n_candidates (\d+)", options)
#Radius of the neighborhood used for range queries.
radius = re.search("--radius ([\d.]+)", options)
#Cutoff ratio for radius neighbors; a value in the range [0, 1).
radius_cutoff_ratio = re.search("--radius_cutoff_ratio ([\d.]+)", options)
self.n_estimators = 10 if not n_estimators else int(n_estimators.group(1))
self.n_neighbors = 5 if not n_neighbors else int(n_neighbors.group(1))
self.min_hash_match = 4 if not min_hash_match else int(min_hash_match.group(1))
self.n_candidates = 10 if not n_candidates else int(n_candidates.group(1))
self.radius = 1.0 if not radius else float(radius.group(1))
self.radius_cutoff_ratio = 0.9 if not radius_cutoff_ratio else float(radius_cutoff_ratio.group(1))
try:
with totalTimer:
self.model = self.BuildModel(trainData, labels)
# Run Approximate on the test dataset.
distances,indices = self.model.kneighbors(testData,
n_neighbors = self.k)
n_neighbors = self.n_neighbors)
except Exception as e:
Log.Debug(str(e))
q.put(-1)
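Taken together, the parsing above wires six flags (-n, -k, -H, --n_candidates, --radius, --radius_cutoff_ratio) through to scikit-learn's LSHForest. A standalone sketch of the resulting call at this file's defaults (LSHForest was current at the time of this PR; it was deprecated in scikit-learn 0.19 and later removed):

# Sketch of the LSHForest call BuildModel assembles, at this PR's defaults.
import numpy as np
from sklearn.neighbors import LSHForest  # removed in scikit-learn 0.21

X = np.random.rand(100, 5)
lshf = LSHForest(n_estimators=10,          # -n: trees in the forest
                 min_hash_match=4,         # -H: lowest hash length searched
                 n_candidates=10,          # --n_candidates
                 radius_cutoff_ratio=0.9,  # --radius_cutoff_ratio
                 radius=1.0,               # --radius
                 n_neighbors=5)            # -k: neighbors returned per query
lshf.fit(X)
distances, indices = lshf.kneighbors(X[:3], n_neighbors=5)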
42 changes: 38 additions & 4 deletions methods/scikit/allknn.py
@@ -64,7 +64,14 @@ def RunAllKnnScikit(q):
# Get all the parameters.
k = re.search("-k (\d+)", options)
leafSize = re.search("-l (\d+)", options)

radius = re.search("--radius ([\d.]+)", options)
tree_type = re.search("-t (\S+)", options)
metric = re.search("--metric (\S+)", options)
# Parameter for the Minkowski metric: p=1 is equivalent to the Manhattan
# distance and p=2 to the Euclidean distance; minkowski_distance is used
# for arbitrary p.
p = re.search("-p (\d+)", options)
n_jobs = re.search("--n_jobs (\d+)", options)
if not k:
Log.Fatal("Required option: Number of furthest neighbors to find.")
q.put(-1)
@@ -78,18 +85,45 @@
return -1

if not leafSize:
l = 20
leafSize = 20
elif int(leafSize.group(1)) < 0:
Log.Fatal("Invalid leaf size: " + str(leafSize.group(1)) + ". Must" +
" be greater than or equal to 0.")
q.put(-1)
return -1
else:
l = int(leafSize.group(1))
leafSize = int(leafSize.group(1))
if not tree_type:
tree_type = 'kd_tree'
else:
tree_type = str(tree_type.group(1))
if tree_type not in ['auto', 'ball_tree', 'kd_tree', 'brute']:
Log.Fatal("Invalid tree type: " + tree_type
+ ". Must be either auto, ball_tree, kd_tree or brute.")
q.put(-1)
return -1
radius = 1.0 if not radius else float(radius.group(1))
p = 2 if not p else int(p.group(1))
if not metric:
metric = 'minkowski'
else:
metric = str(metric.group(1))
if metric not in ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']:
Log.Fatal("Invalid metric type: " + metric
+ ". Must be either cityblock, cosine, euclidean, l1, l2 or manhattan.")
q.put(-1)
return -1
n_jobs = 1 if not n_jobs else int(n_jobs.group(1))

try:
# Perform All K-Nearest-Neighbors.
model = NearestNeighbors(n_neighbors=k, algorithm='kd_tree', leaf_size=l)
model = NearestNeighbors(n_neighbors=k,
algorithm=tree_type,
leaf_size=leafSize,
radius=radius,
metric=metric,
p=p,
n_jobs=n_jobs)
model.fit(referenceData)

if len(self.dataset) == 2:
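For review purposes, the fully expanded NearestNeighbors call with the defaults chosen above. One caveat: kd_tree, the default algorithm here, does not support the 'cosine' metric, so that combination would need algorithm='brute'.

# Sketch: NearestNeighbors configured with this file's default option values.
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.rand(200, 4)
model = NearestNeighbors(n_neighbors=3,        # -k
                         algorithm='kd_tree',  # -t
                         leaf_size=20,         # -l
                         radius=1.0,           # --radius
                         metric='minkowski',   # --metric
                         p=2,                  # -p (p=2 is Euclidean)
                         n_jobs=1)             # --n_jobs
model.fit(X)
distances, indices = model.kneighbors(X)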
36 changes: 33 additions & 3 deletions methods/scikit/elastic_net.py
@@ -47,8 +47,18 @@ def __init__(self, dataset, timeout=0, verbose=True):
self.dataset = dataset
self.timeout = timeout
self.model = None
self.rho = 0.5
self.rho = 1.0
self.alpha = 0.5
self.fit_intercept = True
self.normalize = False
self.precompute = False
self.max_iter = 1000
self.copy_X = True
self.tol = 0.0001
self.warm_start = False
self.positive = False
self.selection = 'cyclic'


'''
Build the model for the Elastic Net Classifier.
@@ -60,7 +70,16 @@ def BuildModel(self, data, labels):
def BuildModel(self, data, labels):
# Create and train the classifier.
elasticNet = SElasticNet(alpha=self.rho,
l1_ratio=self.alpha)
l1_ratio=self.alpha,
fit_intercept = self.fit_intercept,
normalize = self.normalize,
precompute = self.precompute,
max_iter = self.max_iter,
copy_X = self.copy_X,
tol = self.tol,
warm_start = self.warm_start,
positive = self.positive,
selection = self.selection)
elasticNet.fit(data, labels)
return elasticNet

@@ -81,9 +100,20 @@ def RunElasticNetScikit(q):

r = re.search("-r ([\d.]+)", options)
a = re.search("-a ([\d.]+)", options)
max_iter = re.search("--max_iter (\d+)", options)
tol = re.search("--tol ([\d.]+)", options)
selection = re.search("--selection (\S+)", options)

self.rho = 0.5 if not r else int(r.group(1))
self.rho = 1.0 if not r else float(r.group(1))
self.alpha = 0.5 if not a else float(a.group(1))
self.max_iter = 1000 if not max_iter else int(max_iter.group(1))
self.tol = 0.0001 if not tol else float(tol.group(1))
self.selection = 'cyclic' if not selection else str(selection.group(1))
if self.selection not in ['cyclic', 'random']:
Log.Fatal("Invalid selection: " + self.selection
+ ". Must be either cyclic or random.")
q.put(-1)
return -1

try:
with totalTimer:
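One naming pitfall in this file: self.rho is passed as scikit-learn's alpha (overall regularization strength) and self.alpha as l1_ratio (the L1/L2 mix). A minimal standalone equivalent of the call, at the defaults set above:

# Sketch: the ElasticNet call BuildModel assembles. Note the name swap:
# the benchmark's rho -> scikit-learn's alpha, alpha -> l1_ratio.
import numpy as np
from sklearn.linear_model import ElasticNet

rng = np.random.RandomState(42)
X = rng.rand(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.rand(100)

model = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True,
                   precompute=False, max_iter=1000, copy_X=True,
                   tol=1e-4, warm_start=False, positive=False,
                   selection='cyclic')
model.fit(X, y)
print(model.coef_)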
18 changes: 12 additions & 6 deletions methods/scikit/gmm.py
@@ -55,18 +55,24 @@ def RunGMMScikit(q):

# Get all the parameters.
g = re.search("-g (\d+)", options)
n = re.search("-n (\d+)", options)
s = re.search("-n (\d+)", options)

s = re.search("-s (\d+)", options)
tol = re.search("-T ([\d.]+)", options)
n_init = re.search("-t (\d+)", options)
max_iter = re.search("-n (\d+)", options)
g = 1 if not g else int(g.group(1))
n = 250 if not n else int(n.group(1))
s = 0 if not s else int(s.group(1))
tol = 0.001 if not tol else float(tol.group(1))
max_iter = 100 if not max_iter else int(max_iter.group(1))
n_init = 1 if not n_init else int(n_init.group(1))

try:
# Create the Gaussian Mixture Model
# Some params changed to match mlpack defaults.
model = mixture.GaussianMixture(n_components=g, covariance_type='full',
random_state=s, n_iter=n, n_init=10, thresh=1e-10)
model = mixture.GaussianMixture(n_components=g,
random_state=s,
n_init=n_init,
tol=tol,
max_iter = max_iter)
with totalTimer:
model.fit(dataPoints)
except Exception as e:
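This hunk is essentially the scikit-learn 0.18 API migration: mixture.GMM became mixture.GaussianMixture, and its n_iter/thresh arguments became max_iter/tol. A minimal sketch of the new call at the defaults parsed above:

# Sketch: GaussianMixture as constructed above (0.18+ API).
import numpy as np
from sklearn import mixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (150, 2)), rng.normal(5, 1, (150, 2))])

model = mixture.GaussianMixture(n_components=2,  # -g
                                random_state=0,  # -s
                                n_init=1,        # -t
                                tol=1e-3,        # -T
                                max_iter=100)    # -n
model.fit(X)
print(model.means_)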
26 changes: 25 additions & 1 deletion methods/scikit/ica.py
@@ -54,12 +54,36 @@ def RunICAScikit(q):
data = np.genfromtxt(self.dataset, delimiter=',')

s = re.search('-s (\d+)', options)
n_components = re.search('--n_components (\d+)', options)
algorithm = re.search('--algorithm (\S+)', options)
fun = re.search('--fun (\S+)', options)
max_iter = re.search('--max_iter (\d+)', options)
tol = re.search('--tol ([\d.]+)', options)

s = 0 if not s else int(s.group(1))
n_components = None if not n_components else int(n_components.group(1))
algorithm = 'parallel' if not algorithm else str(algorithm.group(1))
if algorithm not in ['parallel', 'deflation']:
Log.Fatal("Invalid value for algorithm: " + algorithm
+ ". Must be either parallel or deflation.")
q.put(-1)
return -1
fun = 'logcosh' if not fun else str(fun.group(1))
if fun not in ['logcosh', 'exp', 'cube']:
Log.Fatal("Invalid value for fun: " + fun
+ ". Must be either logcosh, exp or cube.")
q.put(-1)
return -1
max_iter = 200 if not max_iter else int(max_iter.group(1))
tol = 0.0001 if not tol else float(tol.group(1))

try:
# Perform ICA.
with totalTimer:
model = FastICA(random_state=s)
model = FastICA(n_components = n_components,
algorithm = algorithm,
fun = fun,
max_iter = max_iter,
tol = tol,
random_state = s)
ic = model.fit(data).transform(data)
except Exception as e:
q.put(-1)
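A standalone sketch of the FastICA call with the option defaults above; fit_transform is equivalent to the fit(data).transform(data) chain used in the script:

# Sketch: FastICA at this file's defaults, on a toy two-source mixture.
import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
S = rng.standard_t(1.5, size=(200, 2))      # non-Gaussian sources
X = S @ np.array([[1.0, 0.5], [0.5, 1.0]])  # mixed observations

model = FastICA(n_components=None,     # --n_components
                algorithm='parallel',  # --algorithm
                fun='logcosh',         # --fun
                max_iter=200,          # --max_iter
                tol=1e-4,              # --tol
                random_state=0)        # -s
ic = model.fit_transform(X)            # same as fit(X).transform(X)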
4 changes: 4 additions & 0 deletions methods/scikit/kernel_pca.py
@@ -84,6 +84,10 @@ def RunKPCAScikit(q):
degree = 1 if not degree else int(degree.group(1))

model = KernelPCA(n_components=d, kernel="poly", degree=degree)
elif kernel.group(1) == "cosine":
model = KernelPCA(n_components=d, kernel="cosine", degree=degree)
elif kernel.group(1) == "gaussian":
model = KernelPCA(n_components=d, kernel="rbf", degree=degree)
else:
Log.Fatal("Invalid kernel type (" + kernel.group(1) + "); valid " +
"choices are 'linear', 'hyptan' and 'polynomial'.")
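The two branches added here map the benchmark's kernel names onto scikit-learn's: 'cosine' passes straight through, while 'gaussian' becomes the 'rbf' kernel; degree is accepted but only meaningful for 'poly'. A quick sketch:

# Sketch: the two KernelPCA kernels this diff adds.
import numpy as np
from sklearn.decomposition import KernelPCA

X = np.random.rand(100, 6)
X_cosine = KernelPCA(n_components=2, kernel="cosine").fit_transform(X)
X_rbf = KernelPCA(n_components=2, kernel="rbf").fit_transform(X)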
30 changes: 24 additions & 6 deletions methods/scikit/kmeans.py
@@ -64,6 +64,7 @@ def RunKMeansScikit(q):
clusters = re.search("-c (\d+)", options)
maxIterations = re.search("-m (\d+)", options)
seed = re.search("-s (\d+)", options)
algorithm = re.search("-a (\S+)", options)

# Now do validation of options.
if not clusters and len(self.dataset) != 2:
@@ -77,18 +78,35 @@
return -1

m = 1000 if not maxIterations else int(maxIterations.group(1))

algorithm = 'auto' if not algorithm else str(algorithm.group(1))

if algorithm not in ['naive', 'elkan', 'auto']:
Log.Fatal("Invalid value for algorithm: " + algorithm
+ ". Must be either auto, elkan or naive.")
q.put(-1)
return -1
# scikit-learn has no 'naive' option; map it to 'auto'.
if algorithm == 'naive':
algorithm = 'auto'
try:
# Create the KMeans object and perform K-Means clustering.
with totalTimer:
if len(self.dataset) == 2:
kmeans = KMeans(n_clusters=int(clusters.group(1)), init=centroids,
n_init=1, max_iter=m)
kmeans = KMeans(n_clusters=int(clusters.group(1)),
init=centroids,
n_init=1,
max_iter=m,
algorithm=algorithm)
elif seed:
kmeans = KMeans(n_clusters=int(clusters.group(1)), init='random',
n_init=1, max_iter=m, random_state=int(seed.group(1)))
kmeans = KMeans(n_clusters=int(clusters.group(1)),
init='random',
n_init=1,
max_iter=m,
random_state=int(seed.group(1)),
algorithm=algorithm)
else:
kmeans = KMeans(n_clusters=int(clusters.group(1)), n_init=1, max_iter=m)
kmeans = KMeans(n_clusters=int(clusters.group(1)),
n_init=1,
max_iter=m,
algorithm=algorithm)

kmeans.fit(data)
labels = kmeans.labels_
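The new -a flag threads an algorithm choice through all three KMeans construction paths. scikit-learn's own names (since 0.18) are 'auto', 'full' (plain Lloyd iteration) and 'elkan'; this file accepts 'naive' and maps it to 'auto'. A sketch of one of the resulting calls:

# Sketch: KMeans with the algorithm option added above.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(42)
X = rng.rand(150, 2)
kmeans = KMeans(n_clusters=3, n_init=1, max_iter=1000,
                random_state=42, algorithm='elkan')
kmeans.fit(X)
print(kmeans.labels_[:10])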
9 changes: 8 additions & 1 deletion methods/scikit/logistic_regression.py
@@ -47,6 +47,8 @@ def __init__(self, dataset, timeout=0, verbose=True):
self.dataset = dataset
self.timeout = timeout
self.model = None
self.e = 1e-4
self.n = 100

'''
Build the model for the Logistic Regression.
@@ -57,7 +59,8 @@
'''
def BuildModel(self, data, responses):
# Create and train the classifier.
lr = SLogisticRegression()
lr = SLogisticRegression(max_iter = self.n,
tol = self.e)
lr.fit(data, responses)
return lr

@@ -80,6 +83,10 @@ def RunLogisticRegressionScikit(q):

# Use the last row of the training set as the responses.
X, y = SplitTrainData(self.dataset)
e = re.search("-e ([\d.e-]+)", options) # Tolerance.
n = re.search("-n (\d+)", options) # Maximum number of iterations.
self.e = 1e-4 if not e else float(e.group(1))
self.n = 100 if not n else int(n.group(1))

try:
with totalTimer:
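With the two new options in place, the classifier is built as below; -n sets max_iter and -e sets tol, per the parsing added in this diff.

# Sketch: LogisticRegression at the -n (max_iter) and -e (tol) defaults.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

lr = LogisticRegression(max_iter=100, tol=1e-4)
lr.fit(X, y)
print(lr.score(X, y))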