# This is an implementation of Quantile-Sketch Normalization on given dataset. 

### The aim of this normalization technique is to run machine learning algorithms on a smaller part of the dataset after normalizing the dataset values to a certain scale

Set the global constants

In [1]:
MAX_SIZE = 22810
N = MAX_SIZE//10

Choose N random indices from a range of MAX_SIZE numbers

In [2]:
import random
probes = sorted(random.sample(range(MAX_SIZE), N))
# print(probes)

Initializing the lists of dataframes that must be concatenated for normalization

In [3]:
list_val, list_det = [], []
list_abs = []

Defining some helper functions for further usage in the code

In [4]:
def preview_table(tbl):
    print("-----------------------------------------------------------------------------------")
    print(tbl.head())
    print("-----------------------------------------------------------------------------------")
    
def table_to_csv(tbl, csv_filename):
    tbl.to_csv(csv_filename)
    print("Created the file:",csv_filename)

Opening the file that contains the path to the dataset files

In [5]:
files = open('listfile_quantile','r')

Reading each file and making separate dataframes for each feature for the sake of applying normalization calculus on them 

In [6]:
import pandas as pd

for file in files:
    file = file.strip('\n')
    print("_-_-_-_-_- Reading the file '{}'_-_-_-_-_".format(file))

    ## read CSV from text file
    df = pd.read_csv(file, sep='\t')

    ## choosing only the N randomly chosen row values of the 'VALUE' feature
    df_val = df[['ID_REF','VALUE']].iloc[probes]
    df_val.columns = ['ID_REF', file[-11:-4]+'_A']
    # preview_table(df_val)
    
    ## appending the 'VALUE' column of each file to the list of dataframes 
    list_val.append(df_val)

    
    
    ## choosing only the N randomly chosen row values of the 'Detection p-value' feature
    df_det = df[['ID_REF','Detection p-value']].iloc[probes]
    df_det.columns = ['ID_REF', file[-11:-4]+'_B']
    # preview_table(df_det)

    ## appending the 'Detection p-value' column of each file to the list of dataframes 
    list_det.append(df_det)

    
    
    ## choosing only the N randomly chosen row values of the 'ABS_CALL' feature
    df_abs = df[['ID_REF','ABS_CALL']].iloc[probes]
    df_abs.columns = ['ID_REF', file[-11:-4]+'_C']
    # preview_table(df_abs)
    
    ## appending the 'ABS_CALL' column of each file to the list of dataframes 
    list_abs.append(df_abs)

_-_-_-_-_- Reading the file 'Dataset_new/GSM6577.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6578.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6579.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6580.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6581.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6582.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6583.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6584.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6227.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6544.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6571.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6572.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6573.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6574.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6575.txt'_-_-_-_-_
_-_-_-_-_- Reading the file 'Dataset_new/GSM6576.txt'_-

Defining a function to convert the list into a table and making the necessary feature-tables

In [7]:
def make_table(l):
    ## concatenating the list of dataframes into a single table
    tbl = pd.concat(l, axis=1)

    ## eliminating any duplicate columns in the table
    tbl = tbl.loc[:, ~tbl.columns.duplicated()]
    
    # preview_table(tbl)
    return tbl


table_val = make_table(list_val)
table_to_csv(table_val, 'table_val.csv')
table_det = make_table(list_det)
table_to_csv(table_det, 'table_det.csv')
table_abs = make_table(list_abs)

Created the file: table_val.csv
Created the file: table_det.csv


Defining a function to implement quantile normalization and calling them on the necessary feature-tables

In [8]:
def normalize(table):
    sorted_tbl = table['ID_REF'].to_frame()
    
    ## sorting all the values column-wise
    for c in table.columns[1:]:
        sorted_tbl[c] = sorted(list(table[c]))
    # preview_table(sorted_tbl)
    
    ## finding the row-wise means of the sorted table
    ## and assigning them to their respective ranks
    rank_vals = sorted_tbl.mean(axis=1).reset_index().drop(['index'], axis=1)
    # table_to_csv(rank_vals, 'rank_vals.csv')
    
    ## converting the ranks table into a dictionary
    ranks_dict = rank_vals.to_dict()[0]
    # print(ranks_dict)

    ## making the column-wise ranks table of the original dataset
    normalized_tbl = table['ID_REF'].to_frame()
    normalized_tbl[table.columns[1:]] = table[table.columns[1:]].rank(method='min')
    # preview_table(normalized_tbl)    

    ## replacing the rank in each cell with its 
    ## corresponding calculated mean value
    for i in range(len(normalized_tbl)):
        for j in range(1,len(normalized_tbl.columns)):
            temp = normalized_tbl.iat[i,j]
            temp = ranks_dict[int(temp)-1]
            normalized_tbl.iat[i,j] = temp
            
    # preview_table(normalized_tbl)    
    return normalized_tbl

normalized_val = normalize(table_val)
table_to_csv(normalized_val, 'normalized_val.csv')

normalized_det = normalize(table_det)
table_to_csv(normalized_det, 'normalized_det.csv')

Created the file: normalized_val.csv
Created the file: normalized_det.csv


Now that we have the preprocessed and normalized dataset, we now move on to classifying the model to predict the 'ABS_CALL' label when the features 'VALUE' and 'Detection p-value' are provided

Preparing the training dataset:

In [9]:
list_X_train, list_y_train = [], []
for i in range(1, len(table_abs.columns)-1):
    ## appending the normalized feature vectors to the list of dataframes
    dfX = pd.concat([normalized_val.iloc[:,[0,i]],normalized_det.iloc[:,i]], axis=1)
    dfX.columns = ['ID_REF', 'VALUE', 'Detection p-value']
    list_X_train.append(dfX)

    ## appending the target vectors to the list of dataframes
    dfy = table_abs.iloc[:,i]
    dfy.columns = ['ABS_CALL']
    list_y_train.append(dfy)
# print(len(list_X_train), len(list_y_train))

## concatenating the list of dataframes of 
## the features into a single table
X_train = pd.concat(list_X_train)
table_to_csv(X_train, 'X_train.csv')
# preview_table(X_train)

## concatenating the list of dataframes of 
## target vectors into a single table
y_train = pd.concat(list_y_train)
table_to_csv(y_train, 'y_train.csv')
# preview_table(y_train)

Created the file: X_train.csv
Created the file: y_train.csv


Preparing the testing dataset:

In [10]:
df_test = pd.read_csv('Dataset_new/GSM6576.txt', sep='\t')
X_test, y_test = df_test.iloc[:,[0,1,3]],df_test.iloc[:,2]
table_to_csv(X_test, 'X_test.csv')
table_to_csv(y_test, 'y_test.csv')

Created the file: X_test.csv
Created the file: y_test.csv


Label encoding the feature vectors in string format into float format:

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X_train['ID_REF'] = le.fit_transform(X_train['ID_REF'])
y_train = le.fit_transform(y_train)

X_test['ID_REF'] = le.fit_transform(X_test['ID_REF'])
y_test = le.fit_transform(y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Training our model with the Gaussian Naive Bayes Classifier algorithm:

In [12]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None)

Predicting the class labels on the test data:

In [13]:
y_pred = clf.predict(X_test)

Evaluating our classifier model by making a confusion matrix of the predicted and actual target values:

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 7910,    21,  1691],
       [   72,     4,   243],
       [  378,     1, 12490]])

Calculating the accuracy of the classifier model:

In [15]:
from sklearn.metrics import accuracy_score 

print ("Accuracy : ", accuracy_score(y_test,y_pred)*100,'%','\n') 

Accuracy :  89.4519947391495 % 

