In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
from sklearn.metrics import f1_score
import graphviz
from sklearn import tree
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Read the drift-corrected CSV files and keep only the columns used in this notebook.
train = pd.read_csv('../input/data-without-drift/train_clean.csv')
train = train[['time', 'signal', 'open_channels']]
test = pd.read_csv('../input/data-without-drift/test_clean.csv')
test = test[['time', 'signal']]
(train.shape, test.shape)

In [None]:
from tqdm.notebook import tqdm

# Group the open_channels values by the exact signal value they were recorded with:
# train_dict[signal] is the list of every open_channels value seen for that signal.
train1 = np.asarray(train[['signal','open_channels']])
train_dict = {}
for signal_value, channels in tqdm(train1):
    # setdefault creates the list on first sight of a signal value, then we append to it
    train_dict.setdefault(signal_value, []).append(channels)

**len(train_dict)** = 1967506 i.e. ~2Million unique signal values  

In [None]:
# Number of distinct signal values, followed by the distribution of those distinct values.
unique_signals = list(train_dict)
print('len(train_dict)', len(unique_signals))
sns.distplot(unique_signals)

In [None]:
from collections import Counter

# For every distinct signal value, tally how many times each open_channels value occurred with it.
train_info_dict = {
    signal_value: Counter(channels)
    for signal_value, channels in train_dict.items()
}

In [None]:
# Keep only the signal values that were recorded with more than one distinct open_channels value.
special_signals = {
    signal_value: tally
    for signal_value, tally in train_info_dict.items()
    if len(tally) > 1
}
# Keys are unique, so the number of ambiguous signal values is simply the dict size.
counter = len(special_signals)
print("No of signal values from Batch-3 until Batch-10 that overlap:",counter)

In [None]:
# One (signal, open_channels, probability) triple per ambiguous signal/channel pair,
# where probability is the share of that signal's occurrences carrying that channel count.
channel_probs = []
for sig, chan_dict in special_signals.items():
    occurrences = sum(chan_dict.values())
    channel_probs.extend(
        (sig, channel, np.round(count / occurrences, 4))
        for channel, count in chan_dict.items()
    )

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Tabulate the (signal, open_channels, prob) triples so they can be merged onto the training rows.
df_channel_probs = pd.DataFrame(channel_probs, columns=['signal','open_channels','prob'])
df_channel_probs.shape

In [None]:
# Preview the per-signal channel probabilities.
df_channel_probs.head()

In [None]:
# Left-join the probabilities onto every training row. Rows without a match
# (their signal value was never ambiguous) come back as NaN and are filled with 1.
result = train.merge(df_channel_probs, how='left', on=['signal','open_channels']).fillna(1)

In [None]:
# Preview the training rows with their attached probability.
result.head()

### If a particular signal value doesn't facilitate opening the same number of channels every time, then "just" the signal is not sufficient to determine the number of channels opening.  
## Can I treat the 42704 signals that overlap as outliers??? Or perhaps assign them probability

In [None]:
# mean = []
# var = []
# for i in range(11):
#     sns.distplot(train_dict[i],axlabel='electrical signal values')
#     mean.append(np.mean(train_dict[i]))
#     var.append(np.std(train_dict[i])*np.std(train_dict[i]))

In [None]:
# for no_of_channels in range(11):
#     print('no_of_channels:',no_of_channels,'| percent of overlapped signal',100*np.round(len(np.unique(train_dict[no_of_channels]))/len(train_dict[no_of_channels]),4),'%')

no_of_channels: 0 | percent of overlapped signal 18.4%    
no_of_channels: 1 | percent of overlapped signal 46.43%  
no_of_channels: 2 | percent of overlapped signal 44.11%  
no_of_channels: 3 | percent of overlapped signal 45.91%  
no_of_channels: 4 | percent of overlapped signal 52.98%  
no_of_channels: 5 | percent of overlapped signal 57%  
no_of_channels: 6 | percent of overlapped signal 56.46%  
no_of_channels: 7 | percent of overlapped signal 53.6%    
no_of_channels: 8 | percent of overlapped signal 53.78%  
no_of_channels: 9 | percent of overlapped signal 56.61%  
no_of_channels:10 | percent of overlapped signal 72.58%  

**Inference:** taking 100% minus the overlap percentage in the table above as the achievable certainty, for 0 channels we can only be about 82% sure during prediction, for 1 channel only about 54% sure, and so on.

In [None]:
# Distribution of the raw signal values in the test set.
sns.distplot(test.signal.values)

In [None]:
# Preview the first ten test rows.
test.head(10)

In [None]:
from tqdm.notebook import tqdm

# Group the training signal values by the open_channels value recorded with them:
# test_dict[channels] is the list of every signal value seen with that channel count.
#
# The pairs live in their own array (`train_pairs`) and the result in its own dict:
#   * rebinding `train` to an ndarray would break every later cell that uses it as a
#     DataFrame (`train.signal`, `train['prev']`, ...);
#   * iterating the `test` DataFrame directly yields its column names, not rows;
#   * writing into `train_dict` would mix channel keys into the signal-keyed mapping
#     built earlier and leave `test_dict` empty.
# NOTE(review): the name `test_dict` is kept from the original cell, but `test` holds
# only time and signal, so this grouping is necessarily built from training data.
train_pairs = np.asarray(train[['signal','open_channels']])
test_dict = {}
for sig, chan in tqdm(train_pairs):
    test_dict.setdefault(chan, []).append(sig)

## What data was recorded?  
The electrical signal and maximum number of channels open constitute a data instance.
## How was the data collected? 
One data instance is recorded every 0.1 milli-second. So, in one second there are 10,000 data instances recorded.  
The data was recorded in batches of 50 seconds. Therefore, one batch contains 500,000 rows.  
Training data contains 10 batches: 5 million rows  
Test data contains 4 batches: 2 million rows  

In [None]:
# Down-sampled training signal (every `res`-th row) with the ten 500,000-row batches marked.
res = 1000
plt.figure(figsize=(20,5))
plt.xticks(np.arange(0, 5500000, step=500000))
plt.plot(range(0,train.shape[0],res),train.signal[0::res])
for edge in range(11):
    # red vertical line at each batch boundary
    plt.plot([edge*500000,edge*500000],[-5,12.5],'r')
for batch in range(10):
    # batch number printed inside each batch
    plt.text(batch*500000+200000,10,str(batch+1),size=20)
plt.xlabel('Row',size=16)
plt.ylabel('Signal',size=16)
plt.title('Training Data Signal - 10 batches',size=20)
plt.show()

## Can I rearrange the batches? Was batch 1 collected before batch 2, and so on? In other words, does the Markov property apply at the batch level (obviously, within each 50-second batch) or across the whole training set? Since low-probability channels appear only in the first 2 batches, does that mean a channel must first be a low-probability channel before it can become a high-probability one?

In [None]:
# Add a 'prev' column initialised to 0; the next cell fills it with the previous row's signal.
train['prev'] = 0
train.head()

In [None]:
# 'prev' = the signal of the previous row inside the same 500,000-row batch.
# The first row of every batch has no predecessor and gets 0.
#
# A single whole-column assignment replaces the ten chained writes of the form
# `train['prev'][a:b] = ...`: chained assignment triggers SettingWithCopyWarning
# and, with pandas copy-on-write enabled, modifies a temporary and leaves
# `train` unchanged.
BATCH_ROWS = 500000
batch_id = np.arange(len(train)) // BATCH_ROWS   # 0 for rows 0..499999, 1 for the next batch, ...
train['prev'] = train['signal'].groupby(batch_id).shift(1).fillna(0)

In [None]:
# Signal against time for rows 500000-999999 (the second batch).
plt.plot(train['time'][500000:1000000],train['signal'][500000:1000000])

In [None]:
# Work on a copy so the outlier filtering below does not touch `train`.
df=train.copy()

In [None]:
# REMOVING OUTLIERS: per batch, keep only rows whose signal lies within
# N_SIGMA standard deviations of that batch's mean signal.
# The filter has always used 2 SD; the previous header comment said 3 SD.
N_SIGMA = 2
BATCH_ROWS = 500000
temp = []   # one filtered DataFrame per batch
for i in range(10):
    print('Processing Batch-{}'.format(i+1))
    temp_df = df.iloc[i*BATCH_ROWS:(i+1)*BATCH_ROWS]
    plt.plot(temp_df['open_channels'],temp_df['signal'])
    plt.show()
    # mean and std are taken from the unfiltered batch
    deviation = (temp_df.signal - temp_df.signal.mean()).abs()
    temp_df = temp_df[deviation <= N_SIGMA*temp_df.signal.std()]
    print('After removing outliers in Batch-{}'.format(i+1))
    plt.plot(temp_df['open_channels'],temp_df['signal'])
    plt.show()
    temp.append(temp_df)

In [None]:
# Stack the filtered batches back into one frame with a fresh 0..n-1 index.
# pd.concat keeps the column names and per-column dtypes. np.vstack returned a
# bare array, and the hard-coded three-name column list stopped matching once
# `df` carried the additional 'prev' column.
new_df = pd.concat(temp, ignore_index=True)
new_df.shape

In [None]:
# Preview the outlier-filtered frame.
new_df.head()

In [None]:
# Down-sampled open_channels target (every `res`-th row) with the ten batches marked.
res = 1000
plt.figure(figsize=(20,5))
plt.ylim(bottom=-1, top=12)
plt.yticks(np.arange(-1, 12, step=1))
plt.xticks(np.arange(0, 5500000, step=500000))
plt.plot(range(0,train.shape[0],res),train.open_channels[0::res])
for edge in range(11):
    # red vertical line at each batch boundary
    plt.plot([edge*500000,edge*500000],[-5,12.5],'r')
for batch in range(10):
    plt.text(batch*500000+200000,10,str(batch+1),size=20)
plt.xlabel('Row',size=16)
plt.ylabel('Channels Open',size=16)
plt.title('Training Data Open Channels - 10 batches',size=20)
plt.show()

The channels are broadly classified into 2 categories, depending on whether they have a low probability of opening (low conductance) or a high probability of opening (high conductance):  
1. Batch 1 and Batch 2 represent low-probability channels: binary classification. >>>> Model1  
2. The other batches represent high-probability channels: multi-class classification. >>>> Model2

## Analysis from above EDA:-  
From the plots above, it looks like they used 5 different synthetic models.   
One model produced maximum 1 open channel with low probability (batches 1 and 2).   
One model produced maximum 1 open channel with high probability (batches 3 and 7).   
One model produced maximum 3 open channels (batches 4 and 8).   
One model produced maximum 5 open channels (batches 6 and 9) and  
One model produced maximum 10 open channels (batches 5 and 10).   

According to the paper [here][1], the data is synthesized. Also "electrophysiological" noise and drift were added.  
Drift is a signal bias causing the signal to no longer be a horizontal line like batches 2, 7, 8, 9, 10.

> Data description and dataset construction. Ion channel dwell-times were
simulated using the method of Gillespie 43 from published single channel models.
Channels are assumed to follow a stochastic Markovian process and transition
from one state to the next simulated by randomly sampling from a lifetime
probability distribution calculated for each state. Authentic “electrophysiological”
noise was added to these events by passing the signal through a patch-clamp
amplifier and recording it back to file with CED’s Signal software via an Axon
electronic “model cell”. In some datasets additional drift was applied to the final
data with Matlab. Two different stochastic gating models, (termed M1 and M2)
were used to generate semi-synthetic ion channel data. M1 is a low open probability model from ref. 41 (Fig. 3a, b), typically no more than one ion channel opens
simultaneously. Model M2 is from refs. 42,44 and has a much higher open probability (Fig. 3c, d), consequently up to five channels opened simultaneously and there are few instances of zero channels open.


[1]: https://www.nature.com/articles/s42003-019-0729-3


In [None]:
# Class balance of the target. The series is passed as `x=` explicitly: seaborn's
# first positional parameter is `data`, so a bare positional series is not
# reliably interpreted as the variable to count.
sns.countplot(x=train['open_channels'])

# Correlation Between Signal and Open Channels
Let's look closely at random intervals of signal and open channels to observe how they relate. We notice that they are highly correlated and move up and down together. Therefore we can probably predict open channels from the single feature, signal. The only complication is the synthetic drift that was added.

In [None]:
# Overlay signal and open_channels on ten randomly placed 5,000-row windows,
# drawing every 10th row. No random seed is set, so the windows differ per run.
res = 10
for k in range(10):
    a = int(np.random.uniform(0, train.shape[0]-50000))
    b = a + 5000
    banner = '#'*25
    print(banner)
    print('### Random %i to %i'%(a,b))
    print(banner)
    rows = range(a,b,res)
    plt.figure(figsize=(20,5))
    plt.plot(rows, train.signal[a:b][0::res])
    plt.plot(rows, train.open_channels[a:b][0::res])
    plt.show()

# Test Data
Let's display the test data signal

In [None]:
# Down-sampled test signal with the four 500,000-row batches (red) and the
# 100,000-row subsamples (yellow dotted) marked. `let` labels the first ten
# subsamples with the signal type read off the plot.
res = 1000
let = ['1s', '3', '5', '1s','1f','10','5','10','1s','3']
plt.figure(figsize=(20,5))
plt.xticks(np.arange(0, 2500000, step=100000))
plt.plot(range(0,test.shape[0],res),test.signal[0::res])
for edge in range(5):
    plt.plot([edge*500000,edge*500000],[-5,12.5],'r')
for edge in range(21):
    plt.plot([edge*100000,edge*100000],[-5,12.5],'y:')
for batch in range(4):
    plt.text(batch*500000+200000,10,str(batch+1),size=20)
for k, label in enumerate(let):
    plt.text(k*100000+40000,7,label,size=16)
plt.xlabel('Row',size=16)
plt.ylabel('Signal',size=16)
plt.title('Test Data Signal - 4 batches - 10 subsamples',size=20)
plt.show()

## Analysis of EDA above:-  
From this plot we can locate the 5 models in action. 

## Nizamuddin approach: Make 2 models:-  
1.for low-probability channels : batch1,batch2 in training data  
2.for high-probability channels: batch3 to batch10 in training data

In [None]:
# Independent copy of `train` (including the 'prev' column) used as model input below.
train2 = train.copy()

### 1) low probability model

In [None]:
# Features (signal, prev) and target for the first 1,000,000 rows, i.e. batches 1 and 2.
X_train = np.asarray(train2[['signal','prev']][0:1000000]).reshape((-1,2))
# The target is reshaped to a single column, shape (n, 1).
y_train = np.asarray(train2.open_channels.values[0:1000000]).reshape((-1,1))
print('X_train.shape,y_train.shape:',X_train.shape,y_train.shape)
# Histograms of both feature columns, then of the target, drawn on the same axes.
plt.hist(X_train)
plt.hist(y_train)

In [None]:
# Single-split (depth 1) entropy tree on the two features (signal, prev).
# The printed "validation" F1 is computed on the same rows the tree was fitted on.
clf1s = tree.DecisionTreeClassifier(max_depth=1,criterion='entropy').fit(X_train,y_train)
print('Training model low-probability channel')
preds = clf1s.predict(X_train)
macro_f1 = f1_score(y_train,preds,average='macro')
print('f1 validation score =',macro_f1)
# Render the fitted tree; the last expression displays it in the notebook.
tree_graph = tree.export_graphviz(
    clf1s,
    out_file=None,
    max_depth=10,
    impurity=False,
    feature_names=['signal','prev'],
    class_names=['0', '1'],
    rounded=True,
    filled=True,
)
graphviz.Source(tree_graph)

### 2)high probability model

In [None]:
# %%time
# import pandas
# import xgboost
# from sklearn import model_selection
# from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import LabelEncoder

# X = np.asarray(train2[['signal','prob']][1000000:]).reshape((-1,2))
# Y = np.asarray(train2.open_channels.values[1000000:])
# print('X_train.shape,y_train.shape:',X_train.shape,y_train.shape)
# # encode string class values as integers
# label_encoder = LabelEncoder()
# label_encoder = label_encoder.fit(Y)
# label_encoded_y = label_encoder.transform(Y)
# seed = 7
# test_size = 0.33
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X, label_encoded_y, test_size=test_size, random_state=seed)
# # fit model no training data
# model = xgboost.XGBClassifier(objective='multi:softmax',num_classes=11)
# model.fit(X_train, y_train)
# print(model)
# # make predictions for test data
# y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# # evaluate predictions
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
from numpy import argmax
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Features (signal, prev) and target from row 1,200,000 onwards.
# NOTE(review): batch 3 starts at row 1,000,000, so this skips its first 200,000 rows — confirm that is intended.
X_train = np.asarray(train2[['signal','prev']][1200000:]).reshape((-1,2))
y_train = np.asarray(train2.open_channels.values[1200000:])
print('X_train.shape,y_train.shape:',X_train.shape,y_train.shape,type(X_train))

X = X_train
y = y_train
seed = 1
#y = LabelEncoder().fit_transform(y)
# First split: 70% / 30% of all selected rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=seed)
# Second split re-splits only the 30% hold-out (80% / 20%); the 70% from the first split is discarded.
# NOTE(review): presumably a deliberate down-sampling to speed up training — confirm.
X_train, X_test, y_train, y_test = train_test_split(X_test, y_test, test_size=0.20,random_state=seed)
# X_train, X_test, y_train, y_test = train_test_split(X_test, y_test, test_size=0.25,random_state=seed)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Class balance of the final training labels.
sns.countplot(y_train)


In [None]:
# `x` is read by the model cell below as n_timesteps; the reshapes that would
# group rows into windows of `x` are all disabled, so it currently only affects a printout.
x=4
# X_train = X_train.reshape((-1,x,2))
# X_test = X_test.reshape((-1,x,2))
# y_train = y_train.reshape((-1,x,1))
# y_test = y_test.reshape((-1,x,1))
# y_train=np.asarray([*map(np.squeeze,y_train)])
# y_test=np.asarray([*map(np.squeeze,y_test)])
# y_train=y_train.reshape(-1,1,4)
# y_test=y_test.reshape(-1,1,4)
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

For seed=1, a single dense layer of 10 units reaches about 80% accuracy.

In [None]:
%%time
import tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPool1D
# determine the number of input features
n_timesteps = x
n_features = 2
print('n_timesteps: ',n_timesteps,'n_features:',n_features)
n_classes = 11   #0,1,2.....10
model = Sequential()
# model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(n_timesteps,n_features)))
# #model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
# model.add(Dropout(0.25))
# model.add(MaxPool1D(pool_size=2))
# model.add(Flatten())
model.add(Dense(10, activation='relu',input_shape=(n_features,)))
model.add(Dropout(0.25))
model.add(Dense(n_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for 10 epochs, reporting loss/accuracy on (X_test, y_test) after each epoch.
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))
# The evaluation itself happens in the next cell.


In [None]:
# Loss and accuracy on the hold-out rows (the same rows used as validation data during fit).
loss, acc = model.evaluate(X_test, y_test, verbose=1)
print('Test Accuracy: %.4f' % acc)

In [None]:
# confusion_matrix is imported but not used in this cell.
from sklearn.metrics import confusion_matrix
# Model output for every hold-out row (the final layer is a softmax over n_classes).
y_pred = model.predict(X_test)


In [None]:
# make a prediction
# `yhat` was never assigned, so this cell raised NameError on a fresh run.
# Predict the first hold-out row and report the most probable class.
yhat = model.predict(X_test[:1])
print('Predicted: %s (class=%d)' % (yhat, np.argmax(yhat)))

In [None]:
# Conv1D + bidirectional LSTM classifier over the same inputs as `model`.
# Changes from the previous version of this cell:
#   * imports come from tensorflow.keras, like the dense model above
#     (the `keras.layers.convolutional` module path no longer exists in current Keras);
#   * the Embedding layer is replaced by a Reshape: the inputs are continuous
#     floats, not integer token ids, so they cannot index an embedding table.
#     The n_features values of a row are presented as a length-n_features sequence;
#   * the loss is the sparse variant because the labels are integer class ids,
#     matching how `model` is compiled.
# NOTE(review): with two features the pooled sequence has length 1, so the LSTM
# sees a single step — revisit the architecture if windowed inputs are added.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, Dropout, LSTM, MaxPooling1D, Reshape

lstm_out = 128

model8 = Sequential()
model8.add(Reshape((n_features, 1), input_shape=(n_features,)))
model8.add(Dropout(0.2))
model8.add(Conv1D(filters=100, kernel_size=3, padding='same',  activation='relu'))
model8.add(MaxPooling1D(pool_size=2))
model8.add(Bidirectional(LSTM(lstm_out)))
model8.add(Dense(n_classes,activation='softmax'))
model8.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model8.summary()

In [None]:
# Train model8 for 10 epochs; this rebinds `history`, replacing the dense model's training history.
history = model8.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


# Make Five Simple Models
We will make one model for each different type of signal we observed above.

In [None]:
# Fresh copy of `train` for the per-signal-type tree models below.
train2 = train.copy()

## 1 Slow Open Channel--batch1 and batch2

In [None]:
# Rows of batches 1 and 2 (500,000 rows per batch).
rows = np.r_[500000*(1-1):500000*1, 500000*(2-1):500000*2]
X_train = train2.signal.values[rows].reshape((-1,1))
y_train = train2.open_channels.values[rows].reshape((-1,1))

# Single-split tree on the signal alone. This rebinds `clf1s`, replacing the
# two-feature tree fitted in the low-probability section above.
# The printed "validation" F1 is computed on the training rows themselves.
clf1s = tree.DecisionTreeClassifier(max_depth=1).fit(X_train,y_train)
print('Training model 1s channel')
preds = clf1s.predict(X_train)
macro_f1 = f1_score(y_train,preds,average='macro')
print('has f1 validation score =',macro_f1)
print('X_train.shape,y_train.shape: ',X_train.shape,y_train.shape)
tree_graph = tree.export_graphviz(
    clf1s,
    out_file=None,
    max_depth=10,
    impurity=False,
    feature_names=['signal'],
    class_names=['0', '1'],
    rounded=True,
    filled=True,
)
graphviz.Source(tree_graph)

## 1 Fast Open Channel

In [None]:
# Rows of batches 3 and 7 (500,000 rows per batch).
rows = np.r_[500000*(3-1):500000*3, 500000*(7-1):500000*7]
X_train = train2.signal.values[rows].reshape((-1,1))
y_train = train2.open_channels.values[rows].reshape((-1,1))

# Single-split tree on the signal alone; the F1 below is measured on the training rows.
clf1f = tree.DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)
print('Training model 1f channel')
preds = clf1f.predict(X_train)
macro_f1 = f1_score(y_train,preds,average='macro')
print('has f1 validation score =',macro_f1)

tree_graph = tree.export_graphviz(
    clf1f,
    out_file=None,
    max_depth=10,
    impurity=False,
    feature_names=['signal'],
    class_names=['0', '1'],
    rounded=True,
    filled=True,
)
graphviz.Source(tree_graph)

## 3 Open Channels

In [None]:
# Rows of batches 4 and 8 (500,000 rows per batch).
rows = np.r_[500000*(4-1):500000*4, 500000*(8-1):500000*8]
X_train = train2.signal.values[rows].reshape((-1,1))
y_train = train2.open_channels.values[rows].reshape((-1,1))

# Tree limited to four leaves on the signal alone; the F1 below is measured on the training rows.
clf3 = tree.DecisionTreeClassifier(max_leaf_nodes=4).fit(X_train,y_train)
print('Training model 3 channel')
preds = clf3.predict(X_train)
macro_f1 = f1_score(y_train,preds,average='macro')
print('has f1 validation score =',macro_f1)

tree_graph = tree.export_graphviz(
    clf3,
    out_file=None,
    max_depth=10,
    impurity=False,
    feature_names=['signal'],
    class_names=['0', '1','2','3'],
    rounded=True,
    filled=True,
)
graphviz.Source(tree_graph)

## 5 Open Channels

In [None]:
# Rows of batches 6 and 9 (500,000 rows per batch).
rows = np.r_[500000*(6-1):500000*6, 500000*(9-1):500000*9]
X_train = train2.signal.values[rows].reshape((-1,1))
y_train = train2.open_channels.values[rows].reshape((-1,1))

# Tree limited to six leaves on the signal alone; the F1 below is measured on the training rows.
clf5 = tree.DecisionTreeClassifier(max_leaf_nodes=6).fit(X_train, y_train)
print('Trained model 5 channel')
preds = clf5.predict(X_train)
macro_f1 = f1_score(y_train,preds,average='macro')
print('has f1 validation score =',macro_f1)

tree_graph = tree.export_graphviz(
    clf5,
    out_file=None,
    max_depth=10,
    impurity=False,
    feature_names=['signal'],
    class_names=['0', '1','2','3','4','5'],
    rounded=True,
    filled=True,
)
graphviz.Source(tree_graph)

## 10 Open Channels

In [None]:
# Rows of batches 5 and 10 (500,000 rows per batch).
batch = 5; a = 500000*(batch-1); b = 500000*batch
batch = 10; c = 500000*(batch-1); d = 500000*batch
X_train = np.concatenate([train2.signal.values[a:b],train2.signal.values[c:d]]).reshape((-1,1))
y_train = np.concatenate([train2.open_channels.values[a:b],train2.open_channels.values[c:d]]).reshape((-1,1))

# Tree limited to eight leaves on the signal alone; the F1 below is measured on the training rows.
clf10 = tree.DecisionTreeClassifier(max_leaf_nodes=8)
clf10 = clf10.fit(X_train, y_train)
print('Trained model 10 channel')
preds = clf10.predict(X_train)
print('has f1 validation score =',f1_score(y_train,preds,average='macro'))

# export_graphviz pairs class_names positionally with the fitted classes in
# ascending order. Taking the labels from clf10.classes_ keeps every node label
# correct even if some channel count never occurs in these two batches; a fixed
# '0'..'10' list would shift the labels in that case.
tree_graph = tree.export_graphviz(clf10, out_file=None, max_depth = 10,
    impurity = False, feature_names = ['signal'],
    class_names = [str(label) for label in clf10.classes_],
    rounded = True, filled= True )
graphviz.Source(tree_graph)

# Analyze Test Data Drift
Let's plot the drift in the training and test data

## Training Data Drift
We observe drift wherever the following plot is not a horizontal line. We see drift in batches 2, 7, 8, 9, 10.

In [None]:
# 30,000-row rolling mean of the signal, drawn once for `train` and once for `train2`.
# `train2` is a copy of `train`, and both come from the drift-corrected file, so the
# two figures show the same data unless one of the frames was changed in between.
drift_views = (
    (train, 'Training Signal Rolling Mean. Has Drift wherever plot is not horizontal line'),  # ORIGINAL TRAIN DATA
    (train2, 'Training Signal Rolling Mean without Drift'),                                   # TRAIN DATA WITHOUT DRIFT
)
for frame, heading in drift_views:
    plt.figure(figsize=(20,5))
    r = frame.signal.rolling(30000).mean()
    plt.plot(frame.time.values,r)
    for edge in range(11):
        # dotted red line every 50 time units, i.e. at each batch boundary
        plt.plot([edge*50,edge*50],[-3,8],'r:')
    for batch in range(10):
        plt.text(batch*50+20,6,str(batch+1),size=20)
    plt.title(heading,size=16)
    plt.show()

## Test Data Drift
We observe drift in test subsamples A, B, E, G, H, I and test batch 3.


In [None]:
# 30,000-row rolling mean of the test signal, with the 10-time-unit subsamples
# (dotted) and the 50-time-unit batches (solid) marked, starting at time 500.
# The mean is taken from `test.signal`: `test2` is only created further down the
# notebook, so referencing it here failed on a fresh run, and its 'prev' column is
# the shifted signal rather than the signal named in the title.
plt.figure(figsize=(20,5))
let = ['A','B','C','D','E','F','G','H','I','J']
r = test.signal.rolling(30000).mean()
plt.plot(test.time.values,r)
for i in range(21): plt.plot([500+i*10,500+i*10],[-3,6],'r:')
for i in range(5): plt.plot([500+i*50,500+i*50],[-3,6],'r')
for k in range(4): plt.text(525+k*50,5.5,str(k+1),size=20)
for k in range(10): plt.text(505+k*10,4,let[k],size=16)
plt.title('Test Signal Rolling Mean. Has Drift wherever plot is not horizontal line',size=16)
plt.show()

# Predict Test


1s ---> at most 1 open channel with low probability  
1f ---> at most 1 open channel with high probability  
 3 ---> at most 3 open channels with high probability  
 5 ---> at most 5 open channels with high probability  
10 ---> at most 10 open channels with high probability  

In [None]:
# Inspect the last test rows. `test2` is not created until the next cell, so the
# preview uses `test` to keep the notebook runnable from top to bottom.
test.tail()

In [None]:
# Working copy of the test frame with a 'prev' column initialised to 0 (filled in the next cell).
test2=test.copy()
test2['prev']=0

In [None]:
# 'prev' = the signal of the previous row inside the same 500,000-row batch.
# The first row of every batch has no predecessor and gets 0.
# One whole-column assignment replaces the chained `test2['prev'][a:b] = ...`
# writes, which trigger SettingWithCopyWarning and, with pandas copy-on-write
# enabled, leave `test2` unchanged.
batch_id = np.arange(len(test2)) // 500000
test2['prev'] = test2['signal'].groupby(batch_id).shift(1).fillna(0)

In [None]:
# Constant placeholder 'prob' column; none of the prediction code below reads it.
test2['prob']=0.5

In [None]:
# Subsample index; the prediction cell below assigns `a` again before using it.
a = 0


In [None]:
sub = pd.read_csv('../input/liverpool-ion-switching/sample_submission.csv')

SUBSAMPLE_ROWS = 100000
FEATURES = ['signal', 'prev']

# Signal type assigned to each 100,000-row subsample A..J of test batches 1 and 2.
# Only the '1s' subsamples are predicted by the tree `clf1s`; every other type is
# predicted by the neural network `model` (the per-type trees are not used here).
subsample_types = ['1s', '3', '5', '1s', '1f', '10', '5', '10', '1s', '3']


def _tree_predict(rows):
    """Predict open channels for the given test2 row slice with `clf1s`."""
    # `clf1s` is fitted on (signal, prev) in one section and refitted on signal
    # alone in another; feed exactly as many feature columns as the tree that is
    # currently bound was fitted on, otherwise predict() rejects the input.
    columns = FEATURES[:clf1s.tree_.n_features]
    return clf1s.predict(test2[columns].iloc[rows].values)


def _net_predict(rows):
    """Predict open channels for the given test2 row slice with `model`."""
    class_probs = model.predict(test2[FEATURES].iloc[rows].values)
    # most probable class per row
    return np.argmax(class_probs, axis=1)


for a, signal_type in enumerate(subsample_types):
    rows = slice(SUBSAMPLE_ROWS*a, SUBSAMPLE_ROWS*(a+1))
    predictor = _tree_predict if signal_type == '1s' else _net_predict
    sub.iloc[rows, 1] = predictor(rows)

# BATCHES 3 AND 4 seem to be generated from Model 1s
sub.iloc[1000000:2000000,1] = _tree_predict(slice(1000000, 2000000))

# Display Test Predictions

In [None]:
# Down-sampled predicted open_channels with test batches (solid red) and
# subsamples (dotted red) marked; `let` supplies the subsample labels.
res = 1000
plt.figure(figsize=(20,5))
plt.plot(range(0,test.shape[0],res),sub.open_channels[0::res])
for edge in range(5):
    plt.plot([edge*500000,edge*500000],[-5,12.5],'r')
for edge in range(21):
    plt.plot([edge*100000,edge*100000],[-5,12.5],'r:')
for batch in range(4):
    plt.text(batch*500000+250000,10,str(batch+1),size=20)
for k in range(10):
    plt.text(k*100000+40000,7.5,let[k],size=16)
plt.title('Test Data Predictions',size=16)
plt.show()

In [None]:
# Write the submission without the index; floats are written with four decimals.
sub.to_csv('submission.csv',index=False,float_format='%.4f')

In [None]:
# Distribution of the predicted open_channels values.
sub['open_channels'].hist()

In [None]:
import pickle
# save model to file
# The context manager closes the file handle even if pickling fails; the bare
# open(...) call left it open.
# NOTE(review): `model` is a Keras model; Keras' own `model.save(...)` is the
# format-stable way to persist it — consider switching.
with open("pima.pickle.dat", "wb") as model_file:
    pickle.dump(model, model_file)
 


In [None]:
# some time later...

# load model from file
# SECURITY: pickle.load executes code embedded in the file — only load files
# written by this notebook, never files from an untrusted source.
with open("pima.pickle.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Sanity check: the reloaded model predicts on the first ten hold-out rows.
loaded_model.predict(X_test[:10])