# Projet: Biological response prediction

In [1]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import random
import pandas as pd
import numpy as np

## 1. Lecture du Data set

In [63]:
path_train = "../dataset/train.csv"
path_test = "../dataset/test.csv"
data_pd = pd.read_csv(path_train, sep=",") #The first column is the target
# We split the data for train set and a validation set
# Actually the validation set will act like a test set here
data = (data_pd.as_matrix()[:,:]).astype(float)
random.shuffle(data)
limit_train = int(0.8 * data.shape[0])
train_set = data[:limit_train]
valid_set = data[limit_train + 1:]

#X_complet, y_complet = data[:, 1:], data[:, 0]

X_train, y_train = train_set[:, 1:], train_set[:,0]
X_valid, y_valid = valid_set[:, 1:], valid_set[:, 0]

test_set = pd.read_csv(path_test, sep=",").as_matrix()
test_set_x = test_set[:, 1:]

# Exploration des données

In [64]:
data_pd.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [65]:
data_pd.describe()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
count,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,...,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0
mean,0.542255,0.076948,0.592436,0.068142,0.03899,0.212112,0.686653,0.274713,0.455133,0.749517,...,0.026926,0.014663,0.013863,0.021861,0.015196,0.016796,0.012263,0.01173,0.020261,0.011197
std,0.498278,0.079989,0.10586,0.078414,0.115885,0.102592,0.078702,0.090017,0.162731,0.071702,...,0.161889,0.120215,0.116938,0.146249,0.122348,0.128522,0.110074,0.107683,0.140911,0.105236
min,0.0,0.0,0.282128,0.0,0.0,0.00263,0.137873,0.00613,0.0,0.27559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0333,0.517811,0.0,0.0,0.138118,0.625627,0.207374,0.378062,0.707339,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0667,0.585989,0.05,0.0,0.190926,0.674037,0.277845,0.499942,0.738961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1,0.668395,0.1,0.0,0.261726,0.740663,0.335816,0.569962,0.788177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.964381,0.95,1.0,1.0,0.994735,0.790831,0.98987,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 2. Mesure de l'importance des features

### 2.1 Embedded approach : Random forest

In [6]:
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X_train, y_train)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
indices = indices[:50] # Afficher les 50 premiers seulement ! 

# Diagramme en barre permettant de connaitre l'importance des features
%matplotlib auto
df = pd.DataFrame( importances[indices], index = indices)
df.plot(kind='bar', legend = False, width = .8, figsize = (15,5))
plt.title("Features importance")
plt.show()

Using matplotlib backend: Qt5Agg


### 2.2 Wrapper aproach : RFE

In [59]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import random
import matplotlib.pyplot as plt

In [66]:
names = data_pd.columns.values

# 1 - Baseline selection
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=100)
rfe.fit(X_train, y_train)
 
print "Features sorted by their rank:"
score = sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))

Features sorted by their rank:


In [67]:
ranking_1 = rfe.ranking_
selected_var_1 = [i for i in score if i[0] == 1.0 ]
selected_var_names_1 = ', '.join([i[1] for i in selected_var])
selected_var_names_1

'D1453, D1460, D1461, D1467, D1470, D1471, D1472, D1475, D1477, D1479, D1480, D1483, D1487, D1494, D1499, D1500, D1506, D1507, D1508, D1512, D1513, D1514, D1520, D1523, D1526, D1528, D1529, D1532, D1533, D1538, D1540, D1551, D1552, D1560, D1562, D1567, D1571, D1574, D1576, D1583, D1586, D1587, D1588, D1590, D1591, D1594, D1597, D1599, D1603, D1604, D1616, D1622, D1623, D1628, D1630, D1634, D1639, D1645, D1647, D1655, D1658, D1659, D1661, D1664, D1668, D1671, D1674, D1675, D1679, D1682, D1684, D1685, D1689, D1691, D1693, D1701, D1702, D1703, D1704, D1705, D1710, D1712, D1724, D1728, D1729, D1731, D1733, D1736, D1738, D1741, D1746, D1749, D1751, D1754, D1756, D1758, D1765, D1767, D1768, D1772'

In [68]:
# 2 - Shuffle data and re-select variable
c = zip(X_train, y_train)
random.shuffle(c)
X_train, y_train = zip(*c)
rfe2 = RFE(lr, n_features_to_select=100)

rfe2.fit(X_train, y_train) 
print "Features sorted by their rank:"
score = sorted(zip(map(lambda x: round(x, 4), rfe2.ranking_), names))

Features sorted by their rank:


In [69]:
ranking_2 = rfe2.ranking_
selected_var_2 = [i for i in score if i[0] == 1.0 ]
selected_var_names_2 = ', '.join([i[1] for i in selected_var_2])
selected_var_names_2

'D1451, D1453, D1454, D1456, D1457, D1462, D1465, D1468, D1471, D1476, D1484, D1485, D1494, D1498, D1505, D1511, D1513, D1517, D1519, D1520, D1524, D1531, D1533, D1534, D1538, D1539, D1548, D1553, D1555, D1556, D1564, D1569, D1570, D1573, D1575, D1582, D1584, D1587, D1588, D1589, D1590, D1594, D1597, D1599, D1600, D1607, D1613, D1614, D1615, D1626, D1630, D1631, D1635, D1637, D1640, D1651, D1653, D1658, D1663, D1670, D1673, D1677, D1679, D1684, D1690, D1691, D1692, D1694, D1696, D1697, D1698, D1699, D1706, D1709, D1716, D1717, D1721, D1722, D1723, D1727, D1728, D1731, D1733, D1737, D1738, D1741, D1743, D1744, D1745, D1751, D1752, D1756, D1760, D1762, D1763, D1764, D1766, D1769, D1771, D1773'

In [74]:
len(list(set(selected_var_names_2.split(",")) & set(selected_var_names.split(","))))

24

### 2.3 Wrapper approach : RFECV (cross validation) 

In [77]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

In [None]:
svc = SVC(kernel="linear")
rfecv = RFECV(estimator=svc, step=1, cv=4)
rfecv.fit(X_train, y_train)