In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test tables from the bioresponse input folder.
train_csv = '../input/bioresponse/train.csv'
test_csv = '../input/bioresponse/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

### Let's start by inspecting the columns of the training data (and how many there are)

In [None]:
# Show the column labels of the training frame (bare last expression, so the
# notebook renders it as the cell output).
train.columns

In [None]:
# Separate the 'Activity' target column from the remaining predictor columns.
y_full = train['Activity']
x_full = train.drop(columns='Activity')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    x_full,
    y_full,
    test_size=0.33,
    random_state=0,
)
for split_features in (X_train, X_val):
    print(split_features.shape)

### Given the large number of features, we need to reduce the dimensionality. We will use principal component analysis (PCA) to project the data onto the components that explain the most variance.

In [None]:
from sklearn.decomposition import PCA

# Exploratory PCA: keep every component the data allows so the complete
# cumulative explained-variance curve can be inspected.
# PCA accepts at most min(n_samples, n_features) components, so the count is
# derived from the training split instead of hard-coding the feature count
# (a hard-coded value raises ValueError as soon as the split is smaller).
pca_test = PCA(n_components=min(X_train.shape))
pca_test.fit(X_train)

# Per-component ratio and its running total (computed once, reused below).
evr = pca_test.explained_variance_ratio_
cvr = np.cumsum(evr)

plt.plot(cvr)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
# Reference marker at component 324. NOTE(review): hand-picked value,
# presumably near the 95% cumulative-variance point -- confirm against pca_df.
plt.axvline(linewidth=4, color='r', linestyle='--', x=324, ymin=0, ymax=1)
plt.show()

# Tabulate both ratios, one row per component (row label = 0-based component index).
pca_df = pd.DataFrame({
    'Cumulative Variance Ratio': cvr,
    'Explained Variance Ratio': evr,
})

In [None]:
# Inspect the rows labelled 320 through 350 of the variance table
# (.loc slices by label and includes both endpoints).
pca_df.loc[320:350]

In [None]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)
# Fit on the training split only, then reuse the fitted projection for validation.
# NOTE(review): X_train / X_val are rebound to their projected versions, so this
# cell is not idempotent -- re-running it fits PCA on already-projected data.
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)

In [None]:
# Fraction of the total variance explained by each retained component.
pca.explained_variance_ratio_

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Baseline classifier: standardise the inputs, then fit an SVC with default settings.
baseline_steps = (StandardScaler(), SVC())
clf = make_pipeline(*baseline_steps)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the baseline pipeline on the held-out validation split.
y_pred = clf.predict(X_val)
baseline_report = classification_report(y_val, y_pred)
print(baseline_report)

### We are clearly not going far with this type of model. Let's tune it...

In [None]:
from sklearn.model_selection import GridSearchCV
 
# Candidate hyper-parameters for the RBF-kernel SVC
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# Exhaustive cross-validated search over the grid; refit=True retrains the
# best candidate on all of X_train once the search finishes.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination, then the refitted estimator built from it.
for search_result in (grid.best_params_, grid.best_estimator_):
    print(search_result)

In [None]:
# Evaluate the tuned (refitted best) model on the validation split.
grid_predictions = grid.predict(X_val)
tuned_report = classification_report(y_val, grid_predictions)
print(tuned_report)

### Clearly the model is not fit for this task...