In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import random
import numpy as np



In [2]:

column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
data = pd.read_csv("abalone.data", names=column_names)
print("Number of samples: %d" % len(data))

# for more complicated cases use sklearn.feature_extraction.DictVectorizer
for label in "MFI":
    data[label] = data["sex"] == label
del data["sex"]
data.head()

Number of samples: 4177


Unnamed: 0,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings,M,F,I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,True,False,False
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,True,False,False
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,False,True,False
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,True,False,False
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,False,False,True


In [3]:
data["M"]=pd.Series(np.where(data.M.values == True, int(1), int(0)),data.index)
data["F"]=pd.Series(np.where(data.F.values == True, int(1), int(0)),data.index)
data["I"]=pd.Series(np.where(data.I.values == True, int(1), int(0)),data.index)
data.head()


Unnamed: 0,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings,M,F,I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1,0,0
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1,0,0
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,1,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1,0,0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,0,1


In [4]:
y = data.rings.values

del data["rings"] # remove rings from data, so we can convert all the dataframe to a numpy 2D array.
X = data.values.astype(np.float)


In [None]:
from sklearn import cross_validation
train_X, test_X, train_y, test_y = cross_validation.train_test_split(X, y) # splits 75%/25% by default

In [5]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y) # splits 75%/25% by default

In [6]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.20


In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.46
Accuracy of K-NN classifier on test set: 0.22


In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(X_train, y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
     .format(lda.score(X_test, y_test)))



Accuracy of LDA classifier on training set: 0.27
Accuracy of LDA classifier on test set: 0.27




In [None]:
print ("f1 on test data is       {}".format(f1_score(test_y, predicted_test_y_log, average='macro')))
print ("Accuracy on test data is {}".format(accuracy_score(test_y, predicted_test_y_log)))
# confusion_matrix(test_y, predicted_test_y)

In [9]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.11
Accuracy of GNB classifier on test set: 0.10


In [6]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.24
Accuracy of SVM classifier on test set: 0.22


In [11]:
print(X_test.shape)

(1045, 10)


In [29]:
def optimize_random(x_train,y_train,percetage,mnb,change_plan):
    number_change_requested = int(percetage/100*x_train.shape[0])
    print("{} percentage error is equal to {} change \n".format(percetage,number_change_requested))
    used_row=[]
    col_history=[]
    occurred_change = 0
    all_changed = 1
    x_train_changed = np.copy(x_train)
    
    for i in range(len(change_plan["number"])):
        occurred_change = 0
        indices = [t for t, x in enumerate(y_train) if x == change_plan["key"][i][0]]

        for p in range (len(indices)):
            if y_train[indices[p]] == mnb.predict([x_train[indices[p]]]):
                
                
                while (len(col_history)<=x_train.shape[1]): #range 4
                    col = random.randint(0,x_train.shape[1]-1)
                    while col in col_history:
                        col = random.randint(0,x_train.shape[1]-1)
                        if (len(col_history)==x_train.shape[1]):
                            break
                    col_history.append(col)

                    
                    if occurred_change == change_plan["number"][i]:
                        col_history=[]
                        break
                    
                    x_train_changed[indices[p]][col] = 0

                    if (change_plan["key"][i][1] == mnb.predict([x_train_changed[indices[p]]])[0]):
                        
                        print(x_train[indices[p]],mnb.predict([x_train[indices[p]]])[0])
                        print(x_train_changed[indices[p]],mnb.predict([x_train_changed[indices[p]]])[0])
                        print(" \n change number {} \n".format(all_changed))
                        
                        occurred_change=occurred_change + 1
                        all_changed=all_changed + 1
                        col_history=[]
                        break
                
                    else:
                        x_train_changed[indices[p]]= np.copy(x_train[indices[p]])
                
    if (all_changed<number_change_requested-1):
        print("your request doesn't complete! please change your plan")
    return np.copy(x_train_changed)


In [35]:
change_plan={"key":[[9,7]],"number":[104]}
optimize_out= optimize_random(X_test,y_test,10,svm,change_plan)

10 percentage error is equal to 104 change 

[ 0.45    0.34    0.13    0.3715  0.1605  0.0795  0.105   1.      0.      0.    ] 9
[ 0.      0.34    0.13    0.3715  0.1605  0.0795  0.105   1.      0.      0.    ] 7
 
 change number 1 

[ 0.45    0.345   0.12    0.4165  0.1655  0.095   0.135   0.      1.      0.    ] 9
[ 0.45    0.345   0.12    0.      0.1655  0.095   0.135   0.      1.      0.    ] 7
 
 change number 2 

[ 0.475   0.37    0.125   0.5095  0.2165  0.1125  0.165   1.      0.      0.    ] 9
[ 0.475   0.37    0.125   0.      0.2165  0.1125  0.165   1.      0.      0.    ] 7
 
 change number 3 

[ 0.44   0.335  0.11   0.394  0.157  0.096  0.122  1.     0.     0.   ] 9
[ 0.     0.335  0.11   0.394  0.157  0.096  0.122  1.     0.     0.   ] 7
 
 change number 4 

your request doesn't complete! please change your plan


In [36]:
y_pred_optimize=svm.predict(optimize_out)
print ("Accuracy on test data is {}".format(accuracy_score(y_test, y_pred_optimize)))

Accuracy on test data is 0.24784688995215312


In [17]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier

sfs1 = SFS(svm, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(X, y)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   20.5s finished

[2018-10-10 10:15:44] Features: 1/1 -- score: 0.256164711515

In [18]:
sfs1.k_feature_idx_

(3,)

In [20]:
def change_spetial_feature(x_train,y_train,percetage,mnb,feature):
    change_item=0
    number_change_spetial = int(percetage/100*x_train.shape[0])
    print(number_change_spetial)
    
    x_train_changed_spetial = np.copy(x_train)

    for count,ele in enumerate(x_train_changed_spetial):
        

        x_train_changed_spetial[count][feature] = 0

        if (y_train[count] != mnb.predict([x_train_changed_spetial[count]])):
            change_item=change_item+1
            if (change_item < number_change_spetial):

                print(x_train[count],y_train[count])
                print(x_train_changed_spetial[count],mnb.predict([x_train_changed_spetial[count]])[0])
                print(" \n change number {} \n".format(change_item))
            else:

                break
        else:
            x_train_changed_spetial[count]= np.copy(x_train[count])

    return np.copy(x_train_changed_spetial)

In [21]:
change_spetial=change_spetial_feature(X_test,y_test,10,svm,3)

104
[ 0.61    0.465   0.15    0.9605  0.4495  0.1725  0.286   0.      0.      1.    ] 9
[ 0.61    0.465   0.15    0.      0.4495  0.1725  0.286   0.      0.      1.    ] 7
 
 change number 1 

[ 0.16    0.12    0.035   0.021   0.0075  0.0045  0.005   0.      0.      1.    ] 5
[ 0.16    0.12    0.035   0.      0.0075  0.0045  0.005   0.      0.      1.    ] 7
 
 change number 2 

[ 0.45    0.34    0.13    0.3715  0.1605  0.0795  0.105   1.      0.      0.    ] 9
[ 0.45    0.34    0.13    0.      0.1605  0.0795  0.105   1.      0.      0.    ] 7
 
 change number 3 

[ 0.48    0.365   0.135   0.6395  0.2945  0.113   0.175   0.      1.      0.    ] 8
[ 0.48    0.365   0.135   0.      0.2945  0.113   0.175   0.      1.      0.    ] 7
 
 change number 4 

[ 0.53    0.42    0.135   0.675   0.294   0.156   0.1825  1.      0.      0.    ] 10
[ 0.53    0.42    0.135   0.      0.294   0.156   0.1825  1.      0.      0.    ] 9
 
 change number 5 

[ 0.41    0.31    0.09    0.3335  0.1635  0.061   

[ 0.525   0.39    0.135   0.      0.2265  0.131   0.21    0.      1.      0.    ] 9
 
 change number 86 

[ 0.46    0.355   0.13    0.517   0.2205  0.114   0.165   0.      1.      0.    ] 9
[ 0.46    0.355   0.13    0.      0.2205  0.114   0.165   0.      1.      0.    ] 7
 
 change number 87 

[ 0.65    0.505   0.175   1.2075  0.5105  0.262   0.39    0.      1.      0.    ] 10
[ 0.65    0.505   0.175   0.      0.5105  0.262   0.39    0.      1.      0.    ] 9
 
 change number 88 

[ 0.63    0.515   0.175   1.1955  0.492   0.247   0.37    1.      0.      0.    ] 11
[ 0.63   0.515  0.175  0.     0.492  0.247  0.37   1.     0.     0.   ] 9
 
 change number 89 

[ 0.215   0.15    0.055   0.041   0.015   0.009   0.0125  0.      0.      1.    ] 3
[ 0.215   0.15    0.055   0.      0.015   0.009   0.0125  0.      0.      1.    ] 7
 
 change number 90 

[ 0.445   0.34    0.12    0.4475  0.193   0.1035  0.13    1.      0.      0.    ] 9
[ 0.445   0.34    0.12    0.      0.193   0.1035  0.13    

In [22]:
y_change_spetial=svm.predict(change_spetial)
print ("Accuracy on test data is {}".format(accuracy_score(y_test, y_change_spetial)))

Accuracy on test data is 0.23062200956937798


In [31]:
print(len(y_test))

1045


In [7]:
def change_spetial_feature_new(x_train,y_train,percetage,mnb,change_plan,feature):
    number_change_requested = int(percetage/100*x_train.shape[0])
    print("{} percentage error is equal to {} change \n".format(percetage,number_change_requested))
    
    used_row=[]
    col_history=[]
    occurred_change = 0
    all_changed = 1
    x_train_changed = np.copy(x_train)
    
    for i in range(len(change_plan["number"])):
        occurred_change = 0
        
        indices = [t for t, x in enumerate(y_train) if x == change_plan["key"][i][0]]
        print(indices)
        for p in range (len(indices)):
            if y_train[indices[p]] == mnb.predict([x_train[indices[p]]]):
            
                x_train_changed[indices[p]][feature] = 0


                if (change_plan["key"][i][1] == mnb.predict([x_train_changed[indices[p]]])[0]):
                    
                    print(x_train[indices[p]],mnb.predict([x_train[indices[p]]])[0])
                    print(x_train_changed[indices[p]],mnb.predict([x_train_changed[indices[p]]])[0])
                    print(" \n change number {} \n".format(all_changed))

                    occurred_change=occurred_change + 1
                    all_changed=all_changed + 1
                    col_history=[]
                    break

                else:
                    x_train_changed[indices[p]]= np.copy(x_train[indices[p]])
                
    if (all_changed<number_change_requested-1):
        print("your request doesn't complete! please change your plan")
    return np.copy(x_train_changed)
    

In [10]:
change_plan={"key":[[9,7]],"number":[6]}
new_random_spetial=change_spetial_feature_new(X_test,y_test,10,svm,change_plan,3)

10 percentage error is equal to 104 change 

[1, 4, 15, 38, 47, 50, 73, 79, 83, 84, 87, 88, 93, 94, 98, 99, 104, 109, 114, 115, 117, 120, 138, 143, 150, 153, 165, 166, 173, 174, 179, 183, 232, 239, 242, 243, 246, 262, 274, 279, 286, 294, 313, 320, 321, 322, 326, 334, 352, 354, 362, 368, 382, 383, 389, 392, 396, 400, 404, 408, 414, 424, 427, 428, 429, 431, 446, 451, 452, 458, 464, 465, 466, 480, 488, 491, 492, 496, 500, 503, 532, 535, 544, 546, 557, 566, 569, 570, 572, 573, 574, 582, 586, 587, 598, 602, 609, 611, 617, 620, 627, 629, 643, 649, 653, 658, 659, 662, 666, 669, 673, 691, 703, 717, 725, 729, 733, 753, 758, 762, 768, 769, 776, 781, 789, 790, 793, 794, 795, 808, 809, 816, 817, 840, 841, 845, 869, 873, 874, 882, 884, 887, 889, 891, 892, 900, 902, 905, 906, 915, 921, 944, 946, 957, 975, 981, 983, 995, 1020, 1026, 1027, 1029, 1030, 1034, 1035, 1038]
[ 0.435   0.35    0.125   0.459   0.197   0.1145  0.145   0.      1.      0.    ] 9
[ 0.435   0.35    0.125   0.      0.197   0.1145  

# check the combination

In [7]:
def combinations_index(iterable, r):
    # combinations('ABCD', 2) --> AB AC AD BC BD CD
    # combinations(range(4), 3) --> 012 013 023 123
    pool = tuple(iterable)
    n = len(pool)
    if r > n:
        return
    indices = list(range(r))
    yield list(indices)
    
    while True:
        for i in reversed(range(r)):
            if indices[i] != i + n - r:
                break
        else:
            return
        indices[i] += 1

        for j in range(i+1, r):
            indices[j] = indices[j-1] + 1
        yield list(indices)


In [32]:
import itertools
def check_all_combination(x_train,y_train,percetage,mnb,change_plan):
    number_change_requested = int(percetage/100*x_train.shape[0])
    print("{} percentage error is equal to {} change \n".format(percetage,number_change_requested))
    
    used_row=[]
    occurred_change = 0
    all_changed = 1
    change_done=False
    x_train_changed = np.copy(x_train)
    
    for i in range(len(change_plan["key"])):
        occurred_change = 0
        
        indices = [t for t, x in enumerate(y_train) if x == change_plan["key"][i][0]]
        print("{} rows have target {} \n".format(len(indices),change_plan["key"][i][0]))
        
        
        for p in range (len(indices)):
            if (all_changed == number_change_requested +1):
                print("your requests have been done :)")
                break
            if y_train[indices[p]] == mnb.predict([x_train[indices[p]]])  and  indices[p] not in used_row :
                

                change_done=False
                for L in range(0, len(x_train_changed[indices[p]])+1):
                    if change_done:

                        break
                    else:
                        for subset in combinations_index(x_train_changed[indices[p]], L):
                            if not subset:
                                pass
                            else:

                                x_train_changed[indices[p]][subset] = 0

                                if (change_plan["key"][i][1] == mnb.predict([x_train_changed[indices[p]]])[0]):

                                    print(x_train[indices[p]],mnb.predict([x_train[indices[p]]])[0])
                                    print(x_train_changed[indices[p]],mnb.predict([x_train_changed[indices[p]]])[0])
                                    print(" \n change number {} \n".format(all_changed))
                                    
                                    used_row.append(indices[p])
                                    occurred_change=occurred_change + 1
                                    change_done=True
                                    all_changed=all_changed + 1
                                    break

                                else:
                                    x_train_changed[indices[p]]= np.copy(x_train[indices[p]])

    if (all_changed<=number_change_requested):
        print("your request doesn't complete! please change your plan")
    return np.copy(x_train_changed)

In [33]:
import time
start = time.time()

change_plan={"key":[[9,7],[8,7]]}
new_check_all_combination=check_all_combination(X_test,y_test,10,svm,change_plan)

end = time.time()
print("your execuation time is {} ".format(end - start))

10 percentage error is equal to 104 change 

169 rows have target 9 

[ 0.61    0.47    0.155   1.0325  0.497   0.2175  0.2785  1.      0.      0.    ] 9
[ 0.      0.      0.155   0.      0.497   0.2175  0.2785  1.      0.      0.    ] 7
 
 change number 1 

[ 0.55   0.47   0.15   0.897  0.377  0.184  0.29   0.     1.     0.   ] 9
[ 0.     0.     0.15   0.     0.377  0.184  0.29   0.     1.     0.   ] 7
 
 change number 2 

[ 0.62    0.49    0.16    1.056   0.493   0.244   0.2725  0.      1.      0.    ] 9
[ 0.      0.      0.16    0.      0.493   0.244   0.2725  0.      1.      0.    ] 7
 
 change number 3 

[ 0.43    0.35    0.09    0.397   0.1575  0.089   0.12    0.      1.      0.    ] 9
[ 0.      0.35    0.09    0.      0.1575  0.089   0.12    0.      1.      0.    ] 7
 
 change number 4 

[ 0.56    0.435   0.15    0.8715  0.4755  0.1835  0.1835  0.      1.      0.    ] 9
[ 0.      0.435   0.15    0.      0.4755  0.1835  0.1835  0.      1.      0.    ] 7
 
 change number 5 

[ 0.5

[ 0.45    0.36    0.105   0.4715  0.2035  0.0935  0.149   0.      1.      0.    ] 9
[ 0.      0.36    0.105   0.      0.2035  0.0935  0.149   0.      1.      0.    ] 7
 
 change number 46 

[ 0.575   0.45    0.13    0.785   0.318   0.193   0.2265  1.      0.      0.    ] 9
[ 0.575   0.45    0.13    0.      0.318   0.193   0.2265  0.      0.      0.    ] 7
 
 change number 47 

[ 0.595   0.455   0.16    1.04    0.452   0.2655  0.288   0.      1.      0.    ] 9
[ 0.      0.      0.16    0.      0.452   0.2655  0.288   0.      1.      0.    ] 7
 
 change number 48 

[ 0.575   0.445   0.145   0.847   0.415   0.1945  0.22    1.      0.      0.    ] 9
[ 0.575   0.445   0.145   0.      0.415   0.1945  0.22    0.      0.      0.    ] 7
 
 change number 49 

[ 0.53    0.42    0.135   0.677   0.2565  0.1415  0.21    0.      1.      0.    ] 9
[ 0.      0.42    0.135   0.      0.2565  0.1415  0.21    0.      1.      0.    ] 7
 
 change number 50 

[ 0.51    0.405   0.13    0.7175  0.3725  0.158   

[ 0.535   0.415   0.15    0.5765  0.3595  0.135   0.225   0.      0.      1.    ] 8
[ 0.535   0.415   0.15    0.      0.3595  0.135   0.225   0.      0.      1.    ] 7
 
 change number 91 

[ 0.555   0.425   0.13    0.648   0.2835  0.133   0.2105  0.      0.      1.    ] 8
[ 0.555   0.425   0.13    0.      0.2835  0.133   0.2105  0.      0.      1.    ] 7
 
 change number 92 

[ 0.51    0.38    0.115   0.5155  0.215   0.1135  0.166   0.      0.      1.    ] 8
[ 0.      0.38    0.115   0.5155  0.215   0.1135  0.166   0.      0.      1.    ] 7
 
 change number 93 

[ 0.575  0.44   0.15   0.983  0.486  0.215  0.239  0.     0.     1.   ] 8
[ 0.575  0.44   0.15   0.     0.486  0.215  0.239  0.     0.     1.   ] 7
 
 change number 94 

[ 0.495   0.375   0.12    0.614   0.2855  0.1365  0.161   0.      0.      1.    ] 8
[ 0.495   0.375   0.12    0.      0.2855  0.1365  0.161   0.      0.      1.    ] 7
 
 change number 95 

[ 0.53    0.43    0.14    0.677   0.298   0.0965  0.23    0.      0.  

In [34]:
y_change_all_combination=svm.predict(new_check_all_combination)
print ("Accuracy on test data is {}".format(accuracy_score(y_test, y_change_all_combination)))

Accuracy on test data is 0.12440191387559808
