In [17]:
import numpy as np
import scipy.stats
from time import time
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from scipy.stats import itemfreq
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from scipy.stats import itemfreq
from sklearn.metrics import make_scorer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.model_selection import StratifiedKFold, GroupKFold, LeaveOneOut
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from scipy.stats.mstats import zscore

In [4]:
#Load dataset

xin= np.load("./X_train.npy")
yin= np.load("./y_train.npy")
xout= np.load("./X_test.npy")

In [None]:
dims=[50, 70, 75, 80, 90, 100, 120, 150]

In [None]:
n_components= 40

pca = PCA(n_components=n_components, svd_solver='auto',
              whiten=True).fit(xin)

X_pca = pca.transform(xin)

#print(pca.explained_variance_.shape)
#print(pca.explained_variance_ratio_.shape)
y=pca.explained_variance_
#take the highest expl. variance
#print(pca.explained_variance_)

#print(pca.explained_variance_)
#print(pca.explained_variance_ratio_)
#print(pca.explained_variance_ratio_.cumsum())

x = np.arange(0, n_components) 
plt.xticks(x)
plt.scatter(x, y, alpha=0.5)
plt.show()

In [5]:
def dimReduction(n_components, X_train):

    h= 50
    w= 37
    t0 = time()
    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    pca = PCA(n_components=n_components, svd_solver='auto',
              whiten=True).fit(X_train)

    eigenfaces = pca.components_.reshape((n_components, h, w))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_pca = pca.transform(X_train)
    print("done in %0.3fs" % (time() - t0))
    return X_pca

In [7]:
#unbalanced dataset!
freq= itemfreq(yin)

#print (freq)
#print(freq[:,1])
#print(np.sum(freq[:,1]))
tot= np.sum(freq[:,1])
sample_weight=(1-(np.sinh(freq[:,1]/(tot)))) 
sample_weight = sample_weight / float(max(sample_weight))
#print(sample_weight)

w2= 1.5 - freq[:,1]/max(freq[:,1])
print (w2)
w3= w2/np.sum(w2)
print(w3)

dic1= {0: sample_weight[0],
      1: sample_weight[1],
      2: sample_weight[2],
      3: sample_weight[3],
      4: sample_weight[4],
      5: sample_weight[5],
      6: sample_weight[6],
     }

dic2= {0: w2[0],
      1: w2[1],
      2: w2[2],
      3: w2[3],
      4: w2[4],
      5: w2[5],
      6: w2[6],
     }

dic3= {0: w3[0],
      1: w3[1],
      2: w3[2],
      3: w3[3],
      4: w3[4],
      5: w3[5],
      6: w3[6],
     }


[ 1.37088608  1.03670886  1.28481013  0.5         1.28227848  1.34810127
  1.23164557]
[ 0.17020273  0.12871287  0.15951595  0.06207764  0.15920163  0.16737388
  0.15291529]


In [None]:
def export(filename, x):
    fo = open( filename , 'w' )
    fo.write(x)
    fo.close()

In [15]:
def uniformDataset(x, y):
    freq= itemfreq(y)
    minOcc= min(freq[:,1])
    #print(minOcc)
    xred= np.empty((0,1850))
    yred= np.array([])
    for i in range(y.shape[0]):
        freqnew= itemfreq(yred)
        if(freqnew.shape== (7,2)):
            occ= freqnew[y[i],1]
        else:
            occ= 0
        if(occ <= minOcc):
            xred= np.vstack((xred, [x[i, :]]))
            yred= np.append(yred, y[i])
    return (xred,yred)
    #print(xred.shape)
    #print(yred.shape)
    #print(itemfreq(yred))

In [18]:
def underSampling(x, y):
    cc = ClusterCentroids()
    x_resampled, y_resampled = cc.fit_sample(x, y)
    return (x_resampled, y_resampled)

In [12]:
def overSampling(x, y):
    sm = SMOTE(kind='svm')
    x_resampled, y_resampled = sm.fit_sample(x, y)
    return (x_resampled, y_resampled)

In [None]:
def weightedF1(y, predictions):
    f1= f1_score(y, predictions, average=None)
    occ= itemfreq(y)
    count= np.sum(occ[:,1])
    score= 0.0
    v=np.empty(occ[:,1].shape[0])
    for i in range(len(f1)):
        v[i]= (1-occ[i,1]/count)*f1[i]
        score = score+ (1-occ[i,1]/count)*f1[i]
    return score
#y=[1, 2, 3, 3, 3]
#pred=[1, 2, 2, 3, 1]
#r=weightedF1(y, pred)
#print (r)

In [27]:
def buildModel(param_grid, dim, x, y):
    t0 = time()
    x_n= dimReduction(dim, x)
    print("Fitting the classifier to the training set")
    #f1custom= make_scorer(weightedF1, greater_is_better=True)
    model = GridSearchCV(SVC(decision_function_shape='ovr'),
            param_grid, cv=StratifiedKFold(n_splits=10),
            n_jobs=-1)
    model = model.fit(x_n, y)
    
    print("done in %0.3fs" % (time() - t0))
    print("DIMENSIONS:")
    print(dim)
    #print("ker:")
    #print(ker)
    #print("Best estimator found by grid search:")
    #print(model.best_estimator_)
    print("Best score found by grid search:")
    print(model.best_score_)
    print("Best params found by grid search:")
    print(model.best_params_)
    print("\n")
    return model

In [None]:
def predict(model, x, y):
    t0 = time()
    print("Predicting people's names on the test set")
    y_pred = model.predict(x)
    print("done in %0.3fs" % (time() - t0))
    print(classification_report(y, y_pred))
    print(confusion_matrix(y, y_pred))
    score= f1_score(y, y_pred, average='weighted')
    return score

In [28]:
import warnings
warnings.filterwarnings('ignore')
a=2
expC= np.arange(-5.,17.)
c=np.power(a, expC)

expG= np.arange(-15.,4.)

gam=np.power(a, expG)


param_grid = {'C': c,
              'gamma': gam,
              'kernel':['rbf'],
              'class_weight': ['balanced']
             }

dims=[80, 100]

#(xnew,ynew)=uniformDataset(xin,yin)

result=0.0

x_ov, y_ov= underSampling(xin, yin)

for i in range(len(dims)):
    model= buildModel(param_grid, dims[i], x_ov, y_ov)
    tmp= model.best_score_
    if(tmp > result):
        result=  tmp
        params= model.best_params_
        d= dims[i]

print("\n")
print("Best: ")
print(result)
print("params:")
print(params)
print("dim")
print(d)

Extracting the top 80 eigenfaces from 357 faces
Projecting the input data on the eigenfaces orthonormal basis
done in 0.094s
Fitting the classifier to the training set
done in 48.042s
DIMENSIONS:
80
Best score found by grid search:
0.78431372549
Best params found by grid search:
{'C': 2.0, 'kernel': 'rbf', 'gamma': 0.0078125, 'class_weight': 'balanced'}


Extracting the top 100 eigenfaces from 357 faces
Projecting the input data on the eigenfaces orthonormal basis
done in 0.085s
Fitting the classifier to the training set
done in 59.973s
DIMENSIONS:
100
Best score found by grid search:
0.775910364146
Best params found by grid search:
{'C': 2.0, 'gamma': 0.00390625, 'kernel': 'rbf', 'class_weight': 'balanced'}




Best: 
0.78431372549
params:
{'C': 2.0, 'kernel': 'rbf', 'gamma': 0.0078125, 'class_weight': 'balanced'}
dim
80


In [1]:
import warnings
warnings.filterwarnings('ignore')
a=2
expC= np.arange(-5.,17.)
c= [2]

#expG= np.arange(-15.,4.)
#gam=np.power(a, expG)

gam= [0.0078125,0.008 ,0.006, 0.005, 0.002, 0.001 , 0.009, 0.008,
      0.0095, 0.0099, 0.01, 0.03, 0.04, 0.05, 0.0085, 0.0001 
      , 0.0009, 0.0005, 0.0007, 0.0001, 0.02, 0.1, 0.5, 0.75,
     0.9, 1, 0.00001, 0.00003, 0.0008]

param_grid = {'C': c,
              'gamma': gam,
              'kernel':['rbf'],
              'class_weight': [dic2]
             }

dims=[80]

#(xnew,ynew)=uniformDataset(xin,yin)

result=0.0

for i in range(len(dims)):
    model= buildModel(param_grid, dims[i], xin, yin)
    tmp= model.best_score_
    if(tmp > result):
        result=  tmp
        params= model.best_params_
        d= dims[i]

print("\n")
print("Best: ")
print(result)
print("params:")
print(params)
print("dim")
print(d)

NameError: name 'np' is not defined

In [9]:
#LINEAR
import warnings
warnings.filterwarnings('ignore')
a=2
expC= np.arange(-5.,17.)
c=np.power(a, expC)

param_grid = {'C': c,
              'kernel':['linear'],
              'class_weight': [dic2]
             }

dims=[80, 150, 100]

#(xnew,ynew)=uniformDataset(xin,yin)

result=0.0

for i in range(len(dims)):
    model= buildModel(param_grid, dims[i], xin, yin)
    tmp= model.best_score_
    if(tmp > result):
        result=  tmp
        params= model.best_params_
        d= dims[i]

print("\n")
print("Best: ")
print(result)
print("params:")
print(params)
print("dim")
print(d)

Extracting the top 80 eigenfaces from 966 faces
Projecting the input data on the eigenfaces orthonormal basis
done in 0.253s
Fitting the classifier to the training set
done in 13.933s
DIMENSIONS:
80
Best score found by grid search:
0.801436801442
Best params found by grid search:
{'C': 0.03125, 'kernel': 'linear', 'class_weight': {0: 1.3708860759493671, 1: 1.0367088607594936, 2: 1.2848101265822784, 3: 0.5, 4: 1.2822784810126582, 5: 1.3481012658227849, 6: 1.2316455696202531}}


Extracting the top 150 eigenfaces from 966 faces
Projecting the input data on the eigenfaces orthonormal basis
done in 0.347s
Fitting the classifier to the training set
done in 20.659s
DIMENSIONS:
150
Best score found by grid search:
0.79753815337
Best params found by grid search:
{'C': 0.03125, 'kernel': 'linear', 'class_weight': {0: 1.3708860759493671, 1: 1.0367088607594936, 2: 1.2848101265822784, 3: 0.5, 4: 1.2822784810126582, 5: 1.3481012658227849, 6: 1.2316455696202531}}


Extracting the top 100 eigenfaces f

In [None]:
#CHI square test
def chisq(xin, yin, k):
    
    x_new = SelectKBest(f_classif, k=k).fit_transform(xin,yin)
    
    return x_new

In [None]:
def modChi(ker, param_grid, k, x, y):
    t0 = time()
    xin= chisq(x,y,k)
    print("Fitting the classifier to the training set")
    model = GridSearchCV(SVC(kernel=ker, class_weight='balanced', decision_function_shape='ovr'), param_grid, cv=5, scoring='f1_micro')
    model = model.fit(xin, y)
    
    print("done in %0.3fs" % (time() - t0))
    print("DIMENSIONS:")
    print(k)
    print("ker:")
    print(ker)
    print("Best estimator found by grid search:")
    print(model.best_estimator_)
    print("Best score found by grid search:")
    print(model.best_score_)
    print("Best params found by grid search:")
    print(model.best_params_)
    print("\n")
    return model

In [None]:
param_grid = {'C': [1],
              'gamma': [0.008, 0.009, 0.0085, 0.01], }

k= [100]

result=0.0

for i in range(len(k)):
    model= modChi('rbf',param_grid, k[i], xin, yin)
    tmp= model.best_score_
    if(tmp > result):
        result=  tmp
        params= model.best_params_
        dim= k[i]

print("\n")
print("Best: ")
print(result)
print("params:")
print(params)
print("dim")
print(dim)

In [None]:
param_grid = {'C': [100, 50, 20, 10, 1, 0.1, 0.001, 0.0001],
              'gamma': [1, 0.1, 0.01, 0.001, 0.005, 0.0001, 0.0005, 0.000001], 
              'coef0': [0, 0.01, 0.001, 0.0001, 100, 1000]}

result= 0.0

for i in range(len(dims)):
    model= buildModel('sigmoid',param_grid, dims[i], xin, yin)
    tmp= model.best_score_
    if(tmp > result):
        result=  tmp
        params= model.best_params_
        d= dims[i]

print("\n")
print("Best: ")
print(result)
print("params:")
print(params)
print("dim")
print(d)

In [None]:
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt

param_range = np.logspace(-5, -1, 5)
x_in= dimReduction(90,xin)
train_scores, test_scores = validation_curve(
    SVC(C=1), x_in, yin, param_name="gamma", param_range=param_range,
    cv=2, scoring="f1_micro", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
plt.xlabel("$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

In [1]:
###### from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt

param_range = np.logspace(-2.3, -1, 5)
print(param_range)
x_in= dimReduction(75,xin)
train_scores, test_scores = validation_curve(
    SVC(C=1), x_in, yin, param_name="gamma", param_range=param_range,
    cv=StratifiedKFold(n_splits=3), scoring="f1_micro", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
plt.xlabel("$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

NameError: name 'np' is not defined

In [15]:
def scale(x):
    xi=zscore(x, axis=1)
    print (xi.shape)
    print (xi)
    return xi

In [18]:
scale(xin)

(966, 1850)
[[-1.82780147 -1.78638351 -1.33596349 ..., -1.06674683 -0.97873384
  -0.87001175]
 [ 2.10228515  2.23879075  1.87477553 ...,  1.85202491  2.65968299
   2.36392117]
 [-0.2421792   0.09333889 -0.03128191 ..., -1.79514778 -1.71845794
  -1.81432033]
 ..., 
 [-1.27244699 -0.47461364 -0.1672515  ...,  0.09433326 -0.01684034
  -0.09531575]
 [-1.86059082 -1.88377535 -1.75626004 ...,  0.53901821  0.6317569
   0.70131075]
 [-1.01638567 -0.85002297 -0.8604207  ...,  1.6766088   1.78058577
   1.8741647 ]]


array([[-1.82780147, -1.78638351, -1.33596349, ..., -1.06674683,
        -0.97873384, -0.87001175],
       [ 2.10228515,  2.23879075,  1.87477553, ...,  1.85202491,
         2.65968299,  2.36392117],
       [-0.2421792 ,  0.09333889, -0.03128191, ..., -1.79514778,
        -1.71845794, -1.81432033],
       ..., 
       [-1.27244699, -0.47461364, -0.1672515 , ...,  0.09433326,
        -0.01684034, -0.09531575],
       [-1.86059082, -1.88377535, -1.75626004, ...,  0.53901821,
         0.6317569 ,  0.70131075],
       [-1.01638567, -0.85002297, -0.8604207 , ...,  1.6766088 ,
         1.78058577,  1.8741647 ]], dtype=float32)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
t0=time()
n_neighbors= np.arange(1,201)
parameters = {'n_neighbors': n_neighbors}
mod= GridSearchCV(KNeighborsClassifier(),
            parameters, cv=StratifiedKFold(n_splits=10),
            n_jobs=-1, scoring= 'f1_weighted')
mod = mod.fit(xin, yin)
    
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(mod.best_estimator_)
print("Best score found by grid search:")
print(mod.best_score_)
print("Best params found by grid search:")
print(mod.best_params_)

done in 2028.775s
Best estimator found by grid search:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
Best score found by grid search:
0.542383771634
Best params found by grid search:
{'n_neighbors': 1}
