# Data-Centric Approach
In this notebook we follow a data-centric approach and try to improve the predictions for relKa.

In [None]:
#Mount  Google Drive
from google.colab import drive
drive.mount('/content/drive')
%cd drive/My\ Drive/Colab\ Notebooks/ML\ Project2

In [None]:
#Useful libraries

# pandas / numpy
import pandas as pd
import numpy as np

#import xgboost regressor model
from xgboost import XGBRegressor

#pearson correlation coefficient
from scipy.stats import pearsonr

#import cross validation function from sklearn
from sklearn.model_selection import cross_val_score
#import one hot encoder function from sklearn
from sklearn.preprocessing import OneHotEncoder
#import the train test split function from sklearn to split the dataset randomly for training and testing
from sklearn.model_selection import train_test_split
#import mse loss
from sklearn.metrics import mean_squared_error

In [None]:
# Load the measurements straight from the zipped CSV.
# on_bad_lines='warn' skips malformed rows and reports them. It stands in for
# error_bad_lines=False, which was deprecated in pandas 1.3 and removed in
# pandas 2.0.
pddata = pd.read_csv(
    'GSM1586785_ScrH-12A_Exd_14mer_cg.csv.zip',
    compression='zip',
    on_bad_lines='warn',
)
# Show the first rows of the dataframe to get a look at the data
pddata.head(4)

In [None]:
# Work on a plain NumPy matrix from here on
data = pddata.to_numpy()
# Labels: everything from column 320 onwards
relKa = data[:, 320:]
# Features: columns 2..319.
# Column 0 is the index of the data samples, which is not a feature; column 1
# (the 2nd feature-side column) is left out temporarily because one hot
# encoding it causes a RAM overflow.
features = data[:, 2:320]

### Preprocessing of data

One-hot encoding the data is problematic both on my local setup and on Google Colab: the former cannot finish the task in a reasonable time, while the latter runs out of RAM. I tried both the scikit-learn encoder and a custom encoder of my own, which you can find below. The main problem is the size of the resulting encoding.

In [None]:
def one_hot_encoder(pos,length):
  """
  Build a one hot vector: zeros everywhere except for a 1 at index `pos`.

  @param pos: int64, index of the entry that is set to 1
  @param length: int64, size of the returned vector
  @return: np.ndarray of `length` floats
  """
  one_hot = np.zeros(length)
  one_hot[pos] = 1
  return one_hot

In [None]:
#remove duplicate sequences in case they exist
#Kmer = [one_hot_encoder(i,len(relKa[:,0])) for i in range(len(relKa[:,0]))]
#create the encoding for all sequences
#Kmer_encode = [ for i in range(len(Kmer))]
#Kmer

In [None]:

# initialize a one hot encoder that ignores the unseen sequences
# enc = OneHotEncoder(handle_unknown='ignore')
# use the sequences of the data for the initialization
# enc.fit(Kmer_encode)
# encode the sequences
# enc.transform(Kmer_encode).toarray()


In [None]:
def standardization(x):
  """
  Standardize x: subtract its mean and divide by its standard deviation.

  The mean and the (population) standard deviation are computed over all
  elements of x. When the standard deviation is exactly 0 (e.g. a constant
  input) the result is all zeros instead of the NaN/inf values a division
  by zero would produce.

  @param x: np.ndarray (or array-like) of numbers
  @return: np.ndarray with the same shape as x
  """
  x = np.asanyarray(x)
  mean = np.mean(x)
  std = np.std(x)
  # plain broadcasting does the element-wise work, no per-slice callback needed
  centered = x - mean
  if std == 0:
    # no spread at all: dividing would only yield NaN/inf
    return np.zeros_like(centered, dtype=float)
  return centered / std

In [None]:
# Standardize every feature, i.e. every column of the (samples x features) matrix.
# axis=0 hands one column at a time to `standardization`; axis=1 would instead
# rescale each sample across its own features.
# NOTE(review): the split and cross-validation cells use `features`, not
# `std_features` - confirm which one is meant to be trained on.
std_features=np.apply_along_axis(standardization, 0, features)

In [None]:
# Random 70/30 train/test partition of the samples.
# Change `train_size` for partitions of a different size and re-run the cell;
# the fixed `random_state` keeps a given ratio reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    relKa,
    train_size=0.7,
    random_state=33,
    shuffle=True,
)

## Initial Training of the regression method 

We choose XGBoost as the regression model for this approach.

In [None]:
# Baseline: cross-validate XGBoost on the initial data.
# With this scoring, scikit-learn returns the negated MSE of every fold.
scores = cross_val_score(
    XGBRegressor(objective='reg:squarederror'),
    features,
    relKa,
    scoring='neg_mean_squared_error',
)

In [None]:
# Mean of the per-fold RMSE: flip the sign of the negated MSE scores, then take the root
print("Basic loss for XGBoost:", np.mean((-scores) ** 0.5))

In [None]:
# Parameters for the XGBRegressor.
# Training runs on the GPU ('gpu_hist'); otherwise it does not train.
# NOTE(review): recent XGBoost releases deprecate 'gpu_hist' in favour of
# tree_method='hist' together with device='cuda' - confirm against the
# installed version.
param_dict = dict(
    max_depth=10,
    n_estimators=1000,
    objective='reg:squarederror',
    tree_method='gpu_hist',
)

In [None]:
# XGBRegressor built from `param_dict`, without extensive parameter tuning
model = XGBRegressor(**param_dict)

In [None]:
# Fit the model on the training partition.
# The metric is configured on the estimator: passing `eval_metric` to fit()
# was deprecated in XGBoost 1.6 and later removed, so newer releases reject it.
model.set_params(eval_metric="rmse")
model.fit(X_train, y_train)

In [None]:
# Predict the labels of the held-out test samples with the fitted model
y_pred = model.predict(X_test)

In [None]:
# Mean squared error of the predictions on the test partition
mean_squared_error(y_true=y_test, y_pred=y_pred)
#np.corrcoef(y_test,y_pred)