In [None]:
import numpy as np # numpy library used mainly for linear algebra
import pandas as pd # pandas library used to read and manipulate tabular data

# Fix the global NumPy seed once; `seed` is meant to be reused later in the notebook.
seed = 17
np.random.seed(seed)

# Read the competition CSV files from the dataset folder.
print("loading our dataset please wait..")
root_dir = "../datasets/reducing-commercial-aviation-fatalities/"  # dataset root directory
train_csv_path = root_dir + "train.csv"
test_csv_path = root_dir + "test.csv"
df_train = pd.read_csv(train_csv_path)  # training data
df_test = pd.read_csv(test_csv_path)  # testing data
print("Our dataset loaded.. Successfully")

# EDA

In [None]:
# Peek at the first rows of the training frame, then of the testing frame.
for frame in (df_train, df_test):
    print(frame.head())

In [None]:
# Collect the distinct values of the grouping columns in the training data.
crew_ids, experiment_ids, seat_ids = (
    df_train[column].unique() for column in ("crew", "experiment", "seat")
)

print(f"crew ids: {crew_ids}")
print(f"experiment ids: {experiment_ids}")
print(f"seat ids: {seat_ids}")




*   crew - a unique id for a pair of pilots. There are 9 crews in the data.
*   experiment - One of CA, DA, SS or LOFT. The first three comprise the training set; the last one (LOFT) makes up the test set.
* seat - is the pilot in the left (0) or right (1) seat
*  time - seconds into the experiment


---


As mentioned in the competition's data description, each of the 18 pilots was recorded over time and subjected to the CA, DA, or SS cognitive states. The training set contains three experiments (one for each state), in each of which the pilots experienced just one of the states.

Thus, we have to hierarchically group the training data by :

crew
seat (pilot)
experiment


---



let's visualize the data by categorizing it using those attributes






In [None]:
# Encode the experiment and event labels as small integers.

dic_exp = {'CA': 2, 'DA': 3, 'SS': 1, 'LOFT': 4}
# A = baseline, B = SS, C = CA, D = DA
dic_event = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

# Reverse lookups: integer code -> original label.
labels_exp = dict(zip(dic_exp.values(), dic_exp.keys()))
labels_event = dict(zip(dic_event.values(), dic_event.keys()))

# Only the training frame has its 'event' column encoded here.
# A label missing from the dictionary raises KeyError.
df_train["event"] = df_train["event"].apply(dic_event.__getitem__).astype('int8')

# Both frames get their 'experiment' column encoded with the same mapping.
df_train['experiment'] = df_train['experiment'].apply(dic_exp.__getitem__)
df_test['experiment'] = df_test['experiment'].apply(dic_exp.__getitem__)
for frame in (df_train, df_test):
    frame['experiment'] = frame['experiment'].astype('int8')

In [None]:
# Index the training rows by (crew, seat, experiment) so that one pilot or one
# experiment can be selected with .loc. Rows are ordered by time first, then
# the index is sorted.
# NOTE(review): this relies on sort_index keeping the time order inside each
# index key — confirm.
time_ordered = df_train.set_index(['crew', 'seat', 'experiment']).sort_values(by=['time'])
DF_train = time_ordered.sort_index()
DF_train.head()

In [None]:
# retrieve one pilot: the rows whose first two index levels (crew, seat) are
# 1 and 0; the remaining 'experiment' level is what is left to select on
pilot = DF_train.loc[1, 0]
pilot.head()

In [None]:
# Set-up for the time-series visualization helper defined below.
import matplotlib.pyplot as plt

# Background colors keyed by the integer event code (see dic_event):
# 0 baseline -> gray, 1 SS -> red, 2 CA -> green, 3 DA -> blue
event_colors = {0: 'gray', 1: 'red', 2: 'green', 3: 'blue'}

def plot_ts(pilot_data, features, exp=0):
    """Plot feature(s) against time for one experiment, shading the event periods.

    Parameters
    ----------
    pilot_data : DataFrame
        Rows of a single pilot; the rows of one experiment are selected with
        ``pilot_data.loc[exp]``. Must contain 'time' and 'event' columns.
    features : str or list of str
        Column name(s) drawn on the y axis.
    exp : int
        Experiment code passed to ``.loc`` (callers in this notebook pass
        ``dic_exp[...]`` values).

    The background between consecutive event changes is colored with
    ``event_colors``; the first span starts at time 0 and the last one ends at
    the largest time value.
    """
    exp_data = pilot_data.loc[exp]

    ax = exp_data.plot(kind='line', x='time', y=features, figsize=(15, 5), linewidth=2.)

    # Rows whose event code differs from the previous row mark a span boundary.
    event_changed = exp_data.event.diff().abs() > 0
    changes = exp_data[event_changed][['time', 'event']].values
    change_times = list(changes[:, 0])
    change_events = list(changes[:, 1])

    boundaries = [0] + change_times + [exp_data.time.max()]
    span_events = [exp_data.event.iloc[0]] + change_events

    for start, stop, event in zip(boundaries[:-1], boundaries[1:], span_events):
        ax.axvspan(start, stop, facecolor=event_colors[event], alpha=0.1)

    plt.show()

Below is the evolution of the `event` feature over time. The same events are also shown as background colors, so we can check that the plotting function is correct.

In [None]:
# Draw the 'event' column itself for each of the three training experiments.
for exp_name in ("CA", "SS", "DA"):
    plot_ts(pilot_data=pilot, features='event', exp=dic_exp[exp_name])

##### Visualize the gsr feature to the 3 experiments for the same pilot using plot_ts function

In [None]:
## write your visualization code here

##### Visualize the ecg feature to the 3 experiments for the same pilot using plot_ts function

In [None]:
## write your visualization code here

You can visualize anything using the plot_ts function 


Here is a very good example to see the data


In [None]:
# Three EEG channels plotted together, one figure per training experiment.
f = ["eeg_" + site for site in ("fp1", "f7", "f8")]

for exp_name in ("CA", "SS", "DA"):
    plot_ts(pilot_data=pilot, features=f, exp=dic_exp[exp_name])

Please visualize all the features and understand the nature of the dataset using plot_ts function

Let's take some of features and try to understand

like ecg which was so noisy

In [None]:
# Raw 'ecg' column of the selected pilot for the CA experiment.
plot_ts(pilot_data=pilot, features='ecg', exp=dic_exp["CA"])

We can use biosppy library which can help us in this



In [None]:
from biosppy import ecg

# Run biosppy's ECG processing on the pilot's 'ecg' column and plot the
# heart-rate series it returns against the matching time axis.
# NOTE(review): `pilot['ecg']` holds every experiment of this pilot, not a
# single one — confirm that processing them as one signal is intended.
# NOTE(review): sampling_rate=256. is assumed to match the recording rate — confirm.
ecg_out = ecg.ecg(signal=pilot['ecg'].values, sampling_rate=256., show=False)
plt.plot(ecg_out['heart_rate_ts'], ecg_out['heart_rate'])

##### Please use biosppy for gsr , r & eeg signals

In [None]:
#### use biosppy here

Let's get to know our data more and relation between everything


In [None]:
import seaborn as sns

#### Visualizing the count of events in the train dataset

In [None]:
# Bar chart of the number of training rows per event code.
plt.figure(figsize=(15,10))
# The column is passed by keyword: recent seaborn releases take `data` as the
# first positional parameter, so a bare positional Series is not read as `x`.
sns.countplot(x=df_train['event'])
plt.xlabel("State of the pilot", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.title("Target repartition", fontsize=15)
plt.show()

In [None]:
# Event counts split by experiment, on a log scale so rare events stay visible.
plt.figure(figsize=(15,10))
# `x` must be given by keyword: passing the column name positionally collides
# with the `data` parameter in recent seaborn releases.
sns.countplot(x='experiment', hue='event', data=df_train)
plt.xlabel("Experiment and state of the pilot", fontsize=12)
plt.ylabel("Count (log)", fontsize=12)
plt.yscale('log')
plt.title("Target repartition for different experiments", fontsize=15)
plt.show()


The experiment of the test set is LOFT (Line Oriented Flight Training), which is a full flight (take off, flight, and landing) in a flight simulator.

Seat
Which seat the pilot is sitting in.

0 : left seat
1 : right seat
This probably has nothing to do with the outcome of the experiment though.

##### Visualize between the event and the seat here like the previous example

In [None]:
### write your visualization code here

Time of the experiment


In [None]:
# Distribution of the time stamp per event, on a random sample of 50000 training rows.
fig, ax = plt.subplots(figsize=(15,10))
sampled_rows = df_train.sample(50000)
sns.violinplot(x='event', y='time', data=sampled_rows, ax=ax)
ax.set_ylabel("Time (s)", fontsize=12)
ax.set_xlabel("Event", fontsize=12)
ax.set_title("Which time do events occur at ?", fontsize=15)
plt.show()

Data set of test time
Data set of train time

In [None]:
# Overlay the distribution of 'time' in the test and train sets on one axis.
fig, ax = plt.subplots(figsize=(15,10))
for frame, set_label in ((df_test, 'Test set'), (df_train, 'Train set')):
    sns.distplot(frame['time'], label=set_label, ax=ax)
ax.legend()
ax.set_xlabel("Time (s)", fontsize=12)
ax.set_title("Reparition of the time feature", fontsize=15)
plt.show()

### Electroencephalogram recordings

In [None]:
# Names of the 20 EEG columns, built from the electrode site names.
eeg_features = [
    f"eeg_{site}"
    for site in (
        "fp1", "f7", "f8", "t4", "t6",
        "t5", "t3", "fp2", "o1", "p3",
        "pz", "f3", "fz", "f4", "c4",
        "p4", "poz", "c3", "cz", "o2",
    )
]

##### visualize all eeg signals using sns.boxplot and 5k samples from the dataset

In [None]:
#### Write your visualization code here

We also check whether the features have the same distribution in the train and test sets.

In [None]:
# Density of 'ecg' on 10000 random rows of each set, restricted to [-500, 500].
fig, ax = plt.subplots(figsize=(20,25))
ax.set_title('ECG features distributions')
i = 0  # not used by this cell

# The test rows are sampled before the train rows.
test_ecg = df_test.sample(10000)["ecg"]
sns.distplot(test_ecg, label='Test set', hist=False, ax=ax)
train_ecg = df_train.sample(10000)["ecg"]
sns.distplot(train_ecg, label='Train set', hist=False, ax=ax)
ax.set_xlim((-500, 500))
ax.legend()
ax.set_xlabel("ecg", fontsize=12)

plt.show()

##### visualize all eeg features as the previous example

In [None]:
#### Write your visualization code here

Write a note: what do you observe?

### Electrocardiogram
- 3-point Electrocardiogram signal. The sensor had a resolution/bit of .012215 µV and a range of -100mV to +100mV. The data are provided in microvolts.

In [None]:
# Distribution of the ECG signal per event, on a random sample of 50000 training rows.
fig, ax = plt.subplots(figsize=(15,10))
sampled_rows = df_train.sample(50000)
sns.violinplot(x='event', y='ecg', data=sampled_rows, ax=ax)
ax.set_ylabel("Electrocardiogram Signal (µV)", fontsize=12)
ax.set_xlabel("Event", fontsize=12)
ax.set_title("Electrocardiogram signal influence", fontsize=15)
plt.show()

In [None]:
# Overlay the distribution of 'ecg' in the test and train sets on one axis.
fig, ax = plt.subplots(figsize=(15,10))
for frame, set_label in ((df_test, 'Test set'), (df_train, 'Train set')):
    sns.distplot(frame['ecg'], label=set_label, ax=ax)
ax.legend()
ax.set_xlabel("Electrocardiogram Signal (µV)", fontsize=12)
ax.set_title("Electrocardiogram Signal Distribution", fontsize=15)
plt.show()

Except for the >20000-ish samples, the train/test distributions are similar.

### Respiration 
- A measure of the rise and fall of the chest. The sensor had a resolution/bit of .2384186 µV and a range of -2.0V to +2.0V. The data are provided in microvolts.

In [None]:
### write visualization here as previous for r signals

In [None]:
### write visualization here as previous for r signals

Nothing much to say here, the test set has a bunch of sample with lower values though.

### Galvanic Skin Response
 - A measure of electrodermal activity. The sensor had a resolution/bit of .2384186 µV and a range of -2.0V to +2.0V. The data are provided in microvolts.
 > "The galvanic skin response (GSR, which falls under the umbrella term of electrodermal activity, or EDA) refers to changes in sweat gland activity that are reflective of the intensity of our emotional state, otherwise known as emotional arousal."

In [None]:
### write visualization here as previous for gsr signals

In [None]:
### write visualization here as previous for gsr signals

# Preprocessing the dataset for the training and model benchmarking




In [None]:
# let's see the dataframes again to revise them
print(df_train.head())
print(df_test.head())

Get the numpys ready

Just another check 
That the same number of pilots

In [None]:
# Combine seat and crew into one integer per row, then count the distinct
# values in each set.
x_pilots = df_train['crew'] + 20 * df_train['seat']
xx_pilots = df_test['crew'] + 20 * df_test['seat']

print("Number of train pilots : ", x_pilots.unique().size)
print("Number of test pilots : ", xx_pilots.unique().size)

In [None]:
### reduce the memory usage for the dataframes to avoid ram explosion
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and report the saving.

    The frame is modified in place and also returned.

    * integer columns (signed or unsigned) become the smallest signed integer
      type whose range strictly contains the column's min and max;
    * float columns become float16 or float32 when the value range fits,
      otherwise float64. Note that float16 keeps only about three significant
      decimal digits, so values are rounded;
    * object columns become ``category``;
    * every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched instead of being pushed through the float branch.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, ...) are not plain NumPy
        # dtypes; min/max or the casts below may not be defined for them.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # 'i'/'u' = signed/unsigned integer, 'f' = float. Anything else
        # (bool, datetime, timedelta, complex, ...) is not downcast.
        if col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # If no signed type fits (e.g. very large uint64 values) the
            # column keeps its current dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Also reached when min/max are NaN (all comparisons False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero starting size (e.g. an empty frame).
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

##### Reducing the dataframes

In [None]:
# Downcast both frames, training first, then testing.
df_train, df_test = (reduce_mem_usage(frame) for frame in (df_train, df_test))

#### The Columns which we will train on

In [None]:
# Columns used as model inputs: the 20 EEG channels followed by the ECG,
# respiration ('r') and galvanic skin response ('gsr') signals.
features_n = [
    f"eeg_{site}"
    for site in (
        "fp1", "f7", "f8", "t4", "t6",
        "t5", "t3", "fp2", "o1", "p3",
        "pz", "f3", "fz", "f4", "c4",
        "p4", "poz", "c3", "cz", "o2",
    )
] + ["ecg", "r", "gsr"]

##### What do you think ? Do you see that we forgot something ? or that's right ?!

In [None]:
# Add a 'pilot' identifier to both frames, combining seat and crew into one integer.
# NOTE(review): if 'seat'/'crew' were downcast to int8 earlier, the result is
# computed in that narrow type — confirm the ids stay below its limit.
for frame in (df_train, df_test):
    frame['pilot'] = frame['crew'] + 100 * frame['seat']
print("Number of pilots : ", df_train['pilot'].unique().size)

This dataset has a different nature and should be normalized by pilot.
So we will create a function to normalize the dataset by pilot.
We should normalize the test dataset as the train dataset

In [None]:
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
def normalize_by_pilots(train_df, test_df):
    """Min-max scale the ``features_n`` columns separately for every pilot.

    For each pilot id found in ``train_df['pilot']`` a fresh ``MinMaxScaler``
    is fitted on that pilot's training rows; the same scaler is then applied
    to that pilot's rows in ``test_df``. Both frames are modified in place.

    Test rows are selected through ``test_df['pilot']``, not through the
    training row labels, because the two frames do not share row labels.
    Pilots that only occur in ``test_df`` are left unscaled.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain a 'pilot' column and every column listed in ``features_n``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, the same objects that were passed in.
    """
    for pilot in tqdm(train_df["pilot"].unique()):
        train_ids = train_df.index[train_df["pilot"] == pilot]
        test_ids = test_df.index[test_df["pilot"] == pilot]

        scaler = MinMaxScaler()
        train_df.loc[train_ids, features_n] = scaler.fit_transform(train_df.loc[train_ids, features_n])

        # Skip pilots without test rows: there is nothing to transform.
        if len(test_ids) > 0:
            test_df.loc[test_ids, features_n] = scaler.transform(test_df.loc[test_ids, features_n])

    return train_df, test_df

In [None]:
# Scale the feature columns per pilot. normalize_by_pilots writes into the
# frames it receives and returns them, so train_df / test_df are the same
# objects as df_train / df_test.
train_df, test_df = normalize_by_pilots(df_train, df_test)

Train\Val splitting

As we have seen, the number of observations differs between states, so the dataset is imbalanced. In such a case, it is better to use the StratifiedShuffleSplit method for the train/validation split, so that every class is split proportionally.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 split on the 'event' column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, val_index in split.split(train_df,train_df["event"]):
    # split() yields positional row numbers, so rows are taken with .iloc;
    # .loc would only be equivalent while the index is exactly 0..n-1.
    strat_train = train_df.iloc[train_index]
    strat_val = train_df.iloc[val_index]

In [None]:
# Separate the model inputs (features_n) from the target ('event') for each part.
x_train, y_train = strat_train[features_n], strat_train['event']
x_val, y_val = strat_val[features_n], strat_val['event']

# Show the first two rows of every part.
for part in (x_train, y_train, x_val, y_val):
    print(part.head(2))

#### We can address the class imbalance (caused by the unequal number of events) using SMOTE
##### Did you hear of it ? 

In [None]:
#### try SMOTE here

## Feature Engineering

##### Did you try to do some processing and aggregations on the signals ??

##### What about some polynomial interactions between 2 features ?!

##### Should we design a single model for every pilot ?!

#### Did you know that there is some missing data (sensors) in pilots data

# Models


## Sklearn Example

##### Sklearn MLPClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import metrics
from time import time
import lightgbm as lgb

In [None]:
# Definition: one hidden layer of 100 units. The size is written as a
# one-element tuple; `(100)` without the comma is just the integer 100.
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=10, alpha=0.0001,
                      solver='sgd', verbose=10,  random_state=21,tol=0.000000001)
# training
mlp_clf.fit(x_train, y_train)
# metrics on the validation part
pred = mlp_clf.predict(x_val)
prob = mlp_clf.predict_proba(x_val)
loss = log_loss(y_val, prob, labels=mlp_clf.classes_)
score = metrics.accuracy_score(y_val, pred)
print("Log loss: {}".format(loss))
print("accuracy:   %0.3f" % score)
# target_names lists four labels, so all four event codes are expected here
print(metrics.classification_report(y_val, pred, target_names=["A","B","C","D"]))
print(metrics.confusion_matrix(y_val, pred))
# submission: one probability column per event label
prob_test = mlp_clf.predict_proba(test_df[features_n])

# NOTE(review): the 'id' written to the file is the row position of prob_test;
# confirm it matches the ids expected for the test rows.
sub = pd.DataFrame(prob_test, columns=['A', 'B', 'C', 'D'])
sub.to_csv('submission_x.csv', index_label='id')

Lets consider following models:

- Boosted Decision Trees (using lightgbm)
- Decision Tree Classifier
- Random Forest
- Multilayer Perceptron Classifier

In [None]:
classifiers = [
    ### Fill that code with your models
]

# One row per classifier, indexed by the classifier's full text representation.
df_metrics = pd.DataFrame(columns=['model', 'log_loss', 'accuracy', 'precision', 'recall'])

# iterate over classifiers
for clf in classifiers:
    # Short name (text before the first '(' of the representation). It is
    # stored in the 'model' column instead of the estimator object, so the
    # table and the CSV hold plain text.
    clf_descr = str(clf).split('(')[0]

    print('_' * 80)
    print("Training: {}".format(clf))
    t0 = time()
    clf.fit(x_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    # The reported test time covers both predict and predict_proba.
    t0 = time()
    pred = clf.predict(x_val)
    prob = clf.predict_proba(x_val)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    loss = log_loss(y_val, prob, labels=clf.classes_)
    print("Log loss: {}".format(loss))

    score = metrics.accuracy_score(y_val, pred)
    print("accuracy:   %0.3f" % score)

    pr_score = precision_score(y_val, pred, average='weighted')
    rc_score = recall_score(y_val, pred, average='weighted')

    df_metrics.loc[str(clf)] = [clf_descr, loss, score, pr_score, rc_score]

    print("classification report:")
    print(metrics.classification_report(y_val, pred,
                                        target_names=["A","B","C","D"]))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_val, pred))

    print()

print(df_metrics)
df_metrics.to_csv("df_metrics.csv")

## Keras

In [None]:
# import numpys from the dataframes
# NumPy views of the train/validation inputs and targets for Keras.
x_train_np, y_train_np = x_train.to_numpy(), y_train.to_numpy()
x_val_np, y_val_np = x_val.to_numpy(), y_val.to_numpy()

In [None]:
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense, Dropout, ReLU
from keras.optimizers import Adam
from keras.regularizers import l2

###

# build Here your own keras model

###

# `history` is expected to be the object returned by the fit call of the
# model built above.

# Plot the loss curves for training and validation
plt.plot(history.history['loss'], color='r', label="training loss")
plt.plot(history.history['val_loss'], color='b', label="validation loss")
plt.xlabel("Number of Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# The accuracy metric is recorded as 'acc' or 'accuracy' depending on the
# Keras version and on the name passed when compiling; use whichever exists.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Plot the accuracy curves for training and validation
plt.plot(history.history[acc_key], color='r', label="training accuracy")
plt.plot(history.history['val_' + acc_key], color='b', label="validation accuracy")
plt.xlabel("Number of Epochs")
plt.ylabel("Accuracy")
plt.ylim(0,1)
plt.legend()
plt.show()

### What about Random Forest Classifier ? Get the best fit of it on our data

In [None]:
### implement the best random forest classifier with the best parameters here

### What about Decision Tree Classifier ? Get the best fit of it on our data

In [None]:
### implement the best decision tree classifier with the best parameters here

#### Do you think using some RNNs & LSTMs can work on this dataset ? And if yes how you will construct it.