In [2]:
import pandas as pd
import sklearn as sl
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import numpy as np



In [3]:
def Extract(data):
    """Separate the target column from the feature columns.

    data - DataFrame whose LAST column is the target.

    Returns (X, Y): X holds every column except the last, Y is a one-column
    DataFrame holding only the last column.
    """
    # .iloc is strictly positional; the removed .ix indexer silently switched
    # to label-based slicing when the column labels were integers.
    X = data.iloc[:, :-1]
    Y = data.iloc[:, -1:]
    return X,Y

def Normalize(data):
    """Mean-center every column and scale it by its range (max - min).

    A constant column has a zero range, so the division yields NaN for it.
    """
    centered = data - data.mean()
    value_range = data.max() - data.min()
    return centered / value_range

def TrainCrossTest(X,Y,trainTest,trainValid,random_state=None):
    """Split X/Y into stratified training, validation and test sets.

    X - Data available (DataFrame, one row per sample)
    Y - Target data, row-aligned with X; also used to stratify the splits
    trainTest - Fraction of ALL rows held out as the test set
    trainValid - Fraction of the remaining (non-test) rows held out as the
                 validation set
    random_state - Optional seed forwarded to both StratifiedShuffleSplit
                   calls so the partition can be reproduced (default None
                   keeps the original unseeded behaviour)

    Returns X_train, y_train, X_valid, y_valid, X_test, y_test, each with a
    fresh 0..n-1 index.
    """
    def _holdout(features, targets, held_fraction):
        # One shuffle is enough: the original generated the default ten
        # splits and kept only the last one.
        sss = StratifiedShuffleSplit(targets, n_iter=1, test_size=held_fraction,
                                     random_state=random_state)
        keep_index, held_index = next(iter(sss))
        # The splitter yields positional row numbers, so index with .iloc
        # (label-based lookup would break on a non-default index).
        return (features.iloc[keep_index].reset_index(drop=True),
                targets.iloc[keep_index].reset_index(drop=True),
                features.iloc[held_index].reset_index(drop=True),
                targets.iloc[held_index].reset_index(drop=True))

    # Stage 1: carve the test set out of the full data.
    X_rest, y_rest, X_test, y_test = _holdout(X, Y, trainTest)
    # Stage 2: carve the validation set out of what is left.
    X_train, y_train, X_valid, y_valid = _holdout(X_rest, y_rest, trainValid)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [4]:
data = pd.read_csv('train.csv')
X,Y = Extract(data)
X = Normalize(X)
# Normalize divides by (max - min), so a constant column comes back entirely
# NaN; drop only those all-NaN columns.
X = X.dropna(axis = 1,how = 'all')
pca = PCA(n_components=0.95) #Changeable parameter: fraction of variance to keep
# fit_transform fits the model itself, so a separate pca.fit(X) beforehand
# only repeated the decomposition.
X = pd.DataFrame(pca.fit_transform(X)) #Applies PCA and converts it back to DataFrame
X_train, y_train, X_valid, y_valid, X_test, y_test = TrainCrossTest(X,Y,0.15,0.15) #Separates the training testing and validating set of data

In [5]:
# Show the (rows, columns) shape of every split to check the partition sizes.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((54924, 21), (54924, 1), (9693, 21), (9693, 1), (11403, 21), (11403, 1))

In [6]:
# Preview the first rows of the PCA-transformed training features.
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.051578,0.210242,1.060631,1.561047,1.732184,-1.002884,-0.198475,0.144375,0.361753,0.322258,...,0.034409,0.145811,-0.201626,0.268847,0.084531,0.225916,-0.015771,-0.161037,-0.056909,0.073295
1,-0.371847,0.110612,0.191154,-0.240645,0.168408,-0.044732,-0.647494,0.097951,-0.330047,0.910962,...,0.007158,0.087856,-0.029152,0.461218,0.011131,0.315231,-0.293371,0.214006,-0.06613,-0.104226
2,0.197348,-0.54531,-0.499154,1.333389,0.009384,0.130015,-0.050552,0.477777,-0.219247,0.032189,...,-0.043672,-0.019422,-0.021054,0.073151,0.028545,0.014604,-0.044031,-0.026756,0.025797,0.065056
3,-0.644431,1.939351,-0.281473,0.334376,-0.241563,-0.229921,0.242637,-0.018378,-0.129268,1.239899,...,0.062421,-0.64107,-0.028456,0.096308,-0.089359,0.2935,0.205408,-0.060011,0.555433,-0.789672
4,0.027942,-0.440151,-0.33712,-0.187748,-0.069578,0.081831,0.506455,-0.383015,0.460997,-0.007743,...,0.080412,-0.006858,-0.035948,-0.071032,-0.024838,0.024292,-0.148182,0.118288,-0.001648,-0.037792


In [7]:
# Preview the first rows of the PCA-transformed validation features.
X_valid.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-1.272353,0.379929,0.066901,-0.436878,-0.042552,-0.118482,-0.057761,-0.331276,0.003733,-0.002319,...,-0.040631,0.01937,0.007846,-0.001764,0.009639,-0.01721,0.029196,-0.017139,-0.004295,-0.008347
1,0.460286,-0.519345,-0.117152,-0.159916,0.044001,0.162093,-0.051052,-0.094183,-0.206886,0.011157,...,-0.019609,-0.029707,0.016287,0.036542,0.003558,0.056397,-0.012977,-0.025497,0.013843,0.047974
2,0.461223,-0.516879,-0.111939,-0.15912,0.047655,0.160487,-0.050969,0.466562,-0.269852,0.015129,...,-0.015471,-0.025389,0.015448,0.037759,0.00696,0.039378,0.001666,-0.024096,-0.002849,0.024673
3,0.336108,-0.420267,-0.057407,-0.159068,0.051702,0.141111,-0.121576,-0.394197,-0.205938,-0.026854,...,0.010184,-0.019909,0.019623,-0.156772,0.003391,-0.118405,0.143512,-0.109474,-0.017494,0.004975
4,0.048908,-0.431075,-0.294117,-0.319423,-0.060709,0.078692,0.4976,0.076419,0.383966,0.00847,...,0.082302,-0.008913,0.006384,0.080754,0.002861,0.068167,0.034978,-0.028881,-0.012814,0.011072


In [8]:
# Preview the first rows of the PCA-transformed test features.
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-1.27354,0.378635,0.062948,-0.438647,-0.037684,-0.116174,-0.054881,0.372832,-0.076208,0.001902,...,-0.042157,0.009193,0.015496,-0.005241,7e-06,0.009323,0.003884,-0.018951,0.018963,0.034795
1,-0.979666,0.203669,-0.017683,-0.424337,-0.057796,-0.084599,0.06298,-0.460323,0.091222,0.078746,...,-0.083349,0.039667,-0.019374,0.402873,0.047876,0.214421,-0.157909,0.175833,0.020721,-0.064155
2,-1.274546,0.376355,0.057499,-0.439945,-0.038343,-0.114058,-0.053938,0.237518,-0.06109,0.000875,...,-0.045872,0.001683,0.019104,-0.008058,-0.006443,0.032955,-0.017931,-0.020369,0.041398,0.07053
3,0.460474,-0.518116,-0.114484,-0.159622,0.046612,0.161409,-0.050473,0.321615,-0.254011,0.01363,...,-0.017243,-0.027887,0.016558,0.037067,0.004774,0.048211,-0.006068,-0.025272,0.004856,0.036663
4,-1.548694,0.356066,-0.322369,1.050868,-0.079642,-0.152362,-0.039384,0.371573,-0.004593,0.01927,...,-0.06233,0.02728,-0.027358,0.035839,0.031627,-0.049803,-0.007847,-0.019408,0.014502,0.020965


In [9]:
def AccuracyScores(y_real,y_pred,resultInterest):
    """Score predictions against the true labels.

    y_real - true labels
    y_pred - predicted labels
    resultInterest - the string 'All', or a list of class positions to keep

    Returns (accuracy, metrics). With 'All', metrics is the raw tuple from
    precision_recall_fscore_support (precision, recall, fscore, support).
    Otherwise metrics is a list of four lists, one per metric in that same
    order, each holding the entries at the requested positions.
    """
    overall = accuracy_score(y_real, y_pred)
    per_class = precision_recall_fscore_support(y_real, y_pred)

    if resultInterest == 'All':
        return overall, per_class

    # Pick the requested class positions out of each metric array.
    selected = [[metric[pos] for pos in resultInterest] for metric in per_class]
    return overall, selected

In [10]:
# Score the validation labels against an equally long slice of the test labels.
# NOTE(review): presumably a chance-level baseline, since the two label sets
# are unrelated samples; confirm the intent.
# The slice length follows y_valid instead of a hard-coded row number, so the
# cell keeps working when the split sizes change.
AccuracyScores(y_valid,y_test.iloc[:len(y_valid)],'All')

(0.92365624677602398,
 (array([ 0.95995706,  0.02910053]),
  array([ 0.96057579,  0.02864583]),
  array([ 0.96026632,  0.02887139]),
  array([9309,  384])))

In [1]:
from sklearn.neural_network import MLPClassifier

In [12]:
X = X_train
y = y_train
clf = MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
# y_train is a single-column frame; flatten it to 1-D so fit() does not emit
# the column_or_1d data-conversion warning.
clf.fit(X,np.ravel(y))

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
       batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [15]:
# Predict labels for the held-out test features with the fitted network.
xx = clf.predict(X_test)


In [16]:
# Sum of the predicted labels; assuming the labels are 0/1 (TODO confirm),
# this is the number of rows predicted as class 1.
xx.sum()

0

In [17]:
def SplitRatio(df,target,resultValue,num):
    """Keep every row labelled resultValue plus the first `num` other rows.

    df - feature rows
    target - frame with a 'TARGET' column, aligned with df row by row
    resultValue - label whose rows are always kept
    num - maximum number of rows carrying any other label to keep; they are
          taken in original order from the top

    Returns (retX, retY): the selected rows of df and of target, in their
    original order and with their original index labels.
    """
    matches = target['TARGET'] == resultValue
    # Running count of non-matching rows seen so far: a non-matching row is
    # kept only while that count is still within `num`.
    others_seen = (~matches).cumsum()
    # .values drops the index so the mask is applied by position to both frames.
    keep = (matches | (others_seen <= num)).values
    retX = df[keep].copy()
    retY = target[keep].copy()

    return retX, retY

In [18]:
# Column-wise sum of the training targets; assuming 0/1 labels (TODO confirm),
# this is the number of class-1 rows in the training split.
y_train.sum()


TARGET    2173
dtype: int64

In [44]:
# Sweep how many rows with TARGET != 1 are kept next to all TARGET == 1 rows,
# refit the classifier on each subset, and remember which count gave the best
# precision / recall / F-score on the test set for class position 1.
# The start value 2173 equals the y_train.sum() shown earlier in the notebook.
max_prec = 0.0
prec_ind = 0
max_recall = 0.0
recall_ind = 0
max_fscore = 0.0
fscore_ind = 0
for i in range(2173,9173,5):
    xx,yy = SplitRatio(X_train,y_train,1,i)
    # Flatten the single-column target frame to 1-D for fit().
    clf.fit(xx,np.ravel(yy))
    xxx = clf.predict(X_test)
    LL = AccuracyScores(y_test,xxx,[1])
    # LL[1] is [[precision], [recall], [fscore], [support]]; unwrap the
    # scalars so the comparisons are float-vs-float. Comparing a float with
    # the one-element list is always True on Python 2 and raises on Python 3.
    precision, recall, fscore = LL[1][0][0], LL[1][1][0], LL[1][2][0]
    if max_prec < precision:
        max_prec = precision
        prec_ind = i
    if max_recall < recall:
        max_recall = recall
        recall_ind = i
    if max_fscore < fscore:
        max_fscore = fscore
        fscore_ind = i
# %-formatting prints the same text under both Python 2 and Python 3.
print("%s %s %s" % (max_prec, max_recall, max_fscore))
print("%s %s %s" % (prec_ind, recall_ind, fscore_ind))

[0.17915904936014626] [0.74057649667405767] [0.25513905683192262]
9128 2173 9148


In [26]:
# Refit on the current (xx, yy) subset; the single-column target frame is
# flattened to 1-D so fit() does not emit the column_or_1d warning.
clf.fit(xx,np.ravel(yy))

MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
       batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [27]:
# Display the test-set predictions of the refitted classifier (not stored).
clf.predict(X_test)

array([1, 0, 0, ..., 0, 0, 0])

In [28]:
# Store the test-set predictions for the scoring cells that follow.
xxx = clf.predict(X_test)

In [29]:
# Sum of the predicted labels; assuming 0/1 labels (TODO confirm), this is
# the number of test rows predicted as class 1.
xxx.sum()

3243

In [31]:
# Overall accuracy plus precision / recall / F-score / support restricted to
# class position 1.
AccuracyScores(y_test,xxx,[1])

(0.73463123739366831,
 [[0.10299105766265804], [0.74057649667405767], [0.18083378451543045], [451]])

In [57]:
import csv
# Repeat the subset-size sweep in steps of 50 and record accuracy, precision,
# recall and F-score (class position 1) for each size in Output.csv.
# The with-block guarantees the file is flushed and closed even on error.
with open("Output.csv",'w') as f:
    writer = csv.writer(f)
    writer.writerow(['num' ,'Accuracy' ,'Precision' , 'ReCall' , 'F-Score'])
    for i in range(2173,9173+1,50):
        xx,yy = SplitRatio(X_train,y_train,1,i)
        # Flatten the single-column target frame to 1-D for fit().
        clf.fit(xx,np.ravel(yy))
        xxx = clf.predict(X_test)
        LL = AccuracyScores(y_test,xxx,[1])
        try:
            writer.writerow([i,LL[0] , LL[1][0][0] , LL[1][1][0] ,LL[1][2][0]])
        except Exception as err:
            # Best effort: report the failing size and carry on. The original
            # handler concatenated str + int and so raised itself.
            print(" Unsuccessful %s: %s" % (i, err))