# Project M2 - Cell Type Classification with Morphology features
### The objective of this project is to classify two cell types (spiny/aspiny) according to their Morphology features using both logistic regression and neural network.
#### The data set is downloaded from the __[Allen Institute data base](http://alleninstitute.github.io/AllenSDK/_static/examples/nb/cell_types.html#Computing-Electrophysiology-Features)__ and is already saved in the file "MorphFeatures.csv".
#### Train the classifiers similarly to Project M1 using only morphology featrues. Then try to combine them with electrophysiology features to see how that would change the accuracy.

## Getting started with the Allen's data set
Use python library Pandas to read the csv file. The data set is now stored in Pandas dataframe.

In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv("MorphFeatures.csv",index_col=0)
print(df.shape)
df.head(5) #show some lines on the data from the first record.

(670, 31)


Unnamed: 0,average_bifurcation_angle_local,average_bifurcation_angle_remote,average_contraction,average_diameter,average_fragmentation,average_parent_daughter_ratio,hausdorff_dimension,id,max_branch_order,max_euclidean_distance,...,scale_factor_y,scale_factor_z,soma_surface,specimen_id,superseded,tags,total_length,total_surface,total_volume,dendrite_type
0,82.727781,,0.864267,0.345092,20.723077,0.96451,,491119743,6.0,99.779724,...,0.1144,0.28,435.74027,478107198,False,3D Neuron Reconstruction morphology,1666.082926,1803.875644,167.343086,aspiny
1,82.50668,,0.90389,0.634047,105.277778,0.862183,,546781359,3.0,432.38311,...,0.1144,0.28,1446.587725,502367941,False,3D Neuron Reconstruction morphology,2277.259374,4543.139073,921.571895,spiny
2,77.536678,,0.863104,0.417929,73.666667,0.926633,,537042261,6.0,373.630444,...,0.1144,0.28,287.118123,515771244,False,3D Neuron Reconstruction morphology,3589.339062,4704.910407,582.285423,spiny
3,76.583222,,0.900537,0.400396,95.979167,0.942049,,689123605,11.0,943.382549,...,0.1144,0.28,180.994813,561435279,False,3D Neuron Reconstruction morphology,5416.228778,6814.93329,740.722806,spiny
4,72.01925,,0.873518,0.227626,47.535714,1.0,,657879305,5.0,186.218009,...,0.1144,0.28,55.055236,591268268,False,3D Neuron Reconstruction morphology,1659.465869,1185.773462,69.144146,aspiny


The cell type is determined by the dendrite type in the last column of the data set. Ignore the samples of minority type called "sparsely spiny".

In [3]:
df.dropna(axis=1,inplace=True) # Drop columns with Nan values
df = df.drop_duplicates(subset=['specimen_id']) # drop duplicated of specimen_id

# Get rid of sparsely spiny cells
df = df[df.dendrite_type!='sparsely spiny'] #keep all the data that 'dendrite_type' is not 'sparsely spiny'
print(df.shape)
print(df.columns)
df.head(5)

(619, 29)
Index(['average_bifurcation_angle_local', 'average_contraction',
       'average_diameter', 'average_fragmentation',
       'average_parent_daughter_ratio', 'id', 'max_branch_order',
       'max_euclidean_distance', 'max_path_distance',
       'neuron_reconstruction_type', 'number_bifurcations', 'number_branches',
       'number_nodes', 'number_stems', 'number_tips', 'overall_depth',
       'overall_height', 'overall_width', 'scale_factor_x', 'scale_factor_y',
       'scale_factor_z', 'soma_surface', 'specimen_id', 'superseded', 'tags',
       'total_length', 'total_surface', 'total_volume', 'dendrite_type'],
      dtype='object')


Unnamed: 0,average_bifurcation_angle_local,average_contraction,average_diameter,average_fragmentation,average_parent_daughter_ratio,id,max_branch_order,max_euclidean_distance,max_path_distance,neuron_reconstruction_type,...,scale_factor_y,scale_factor_z,soma_surface,specimen_id,superseded,tags,total_length,total_surface,total_volume,dendrite_type
0,82.727781,0.864267,0.345092,20.723077,0.96451,491119743,6.0,99.779724,126.59379,dendrite-only,...,0.1144,0.28,435.74027,478107198,False,3D Neuron Reconstruction morphology,1666.082926,1803.875644,167.343086,aspiny
1,82.50668,0.90389,0.634047,105.277778,0.862183,546781359,3.0,432.38311,496.831994,dendrite-only,...,0.1144,0.28,1446.587725,502367941,False,3D Neuron Reconstruction morphology,2277.259374,4543.139073,921.571895,spiny
2,77.536678,0.863104,0.417929,73.666667,0.926633,537042261,6.0,373.630444,436.958952,dendrite-only,...,0.1144,0.28,287.118123,515771244,False,3D Neuron Reconstruction morphology,3589.339062,4704.910407,582.285423,spiny
3,76.583222,0.900537,0.400396,95.979167,0.942049,689123605,11.0,943.382549,989.448317,full,...,0.1144,0.28,180.994813,561435279,False,3D Neuron Reconstruction morphology,5416.228778,6814.93329,740.722806,spiny
4,72.01925,0.873518,0.227626,47.535714,1.0,657879305,5.0,186.218009,221.639502,full,...,0.1144,0.28,55.055236,591268268,False,3D Neuron Reconstruction morphology,1659.465869,1185.773462,69.144146,aspiny


## Feature Engineering

In [4]:
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

df_full = df.drop(columns=['id','neuron_reconstruction_type','scale_factor_x', 'scale_factor_y','scale_factor_z','specimen_id','superseded','tags'])
print(df_full.shape)
df_full.head()

(619, 21)


Unnamed: 0,average_bifurcation_angle_local,average_contraction,average_diameter,average_fragmentation,average_parent_daughter_ratio,max_branch_order,max_euclidean_distance,max_path_distance,number_bifurcations,number_branches,...,number_stems,number_tips,overall_depth,overall_height,overall_width,soma_surface,total_length,total_surface,total_volume,dendrite_type
0,82.727781,0.864267,0.345092,20.723077,0.96451,6.0,99.779724,126.59379,33,73,...,7,40,51.4886,140.506829,136.267522,435.74027,1666.082926,1803.875644,167.343086,aspiny
1,82.50668,0.90389,0.634047,105.277778,0.862183,3.0,432.38311,496.831994,9,23,...,5,14,92.6671,566.70122,370.170045,1446.587725,2277.259374,4543.139073,921.571895,spiny
2,77.536678,0.863104,0.417929,73.666667,0.926633,6.0,373.630444,436.958952,21,46,...,4,25,65.4696,425.897625,381.015114,287.118123,3589.339062,4704.910407,582.285423,spiny
3,76.583222,0.900537,0.400396,95.979167,0.942049,11.0,943.382549,989.448317,24,52,...,4,28,99.9139,1217.694976,524.550156,180.994813,5416.228778,6814.93329,740.722806,spiny
4,72.01925,0.873518,0.227626,47.535714,1.0,5.0,186.218009,221.639502,14,32,...,4,18,54.3718,172.075941,261.459057,55.055236,1659.465869,1185.773462,69.144146,aspiny


In [5]:
X = abs(df_full.iloc[:,:-1]) # Need to take absolute value for SelectKBest to work
y = df_full.iloc[:,-1]       # iloc Purely integer-location based indexing for selection by position.

In [6]:
bestfeatures = SelectKBest(score_func=chi2)
fit = bestfeatures.fit(X,y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(df_full.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

print(featureScores.nlargest(20,'Score'))  #print 20 best features

                              Specs          Score
18                    total_surface  539939.690284
17                     total_length  264251.289979
10                     number_nodes  220597.718099
19                     total_volume  115805.409932
7                 max_path_distance   30418.233712
6            max_euclidean_distance   27774.577274
14                   overall_height   23066.856685
9                   number_branches    2410.349432
15                    overall_width    2214.133645
13                    overall_depth    1994.393610
12                      number_tips    1213.162125
8               number_bifurcations    1201.211153
16                     soma_surface     905.144986
5                  max_branch_order     556.265637
11                     number_stems      64.412472
3             average_fragmentation      55.236331
0   average_bifurcation_angle_local      16.120053
2                  average_diameter       0.834203
4     average_parent_daughter_r

In [7]:
pick_feats = list(featureScores.nlargest(5,'Score').Specs) # make a list of the ten best features
pick_feats.append('dendrite_type') # add dendrite_type to the list

df_small = df[pick_feats] # Make a new DataFrame with our selected features
df_small.head(5)

Unnamed: 0,total_surface,total_length,number_nodes,total_volume,max_path_distance,dendrite_type
0,1803.875644,1666.082926,1470,167.343086,126.59379,aspiny
1,4543.139073,2277.259374,2011,921.571895,496.831994,spiny
2,4704.910407,3589.339062,3137,582.285423,436.958952,spiny
3,6814.93329,5416.228778,4652,740.722806,989.448317,spiny
4,1185.773462,1659.465869,1406,69.144146,221.639502,aspiny


### Now you have defined the training data set and the class labels. Next train the logistic regression classifier and the neural network like in the two examples and compare the performance of these two methods.
#### Example for logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, scale
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.neural_network import MLPClassifier 

Using TensorFlow backend.


Split the dataset into two groups for training and testing.

In [9]:
train = df_small.sample(frac=0.8,random_state=111) # random pick 80% for training
test = df_small.drop(train.index)
print(train.shape)
print(test.shape)

(495, 6)
(124, 6)


Train and test in different groups.

In [10]:
X = train.values[:,:-1] # put all columns from first to last but not including the last one in X
Y = train.values[:,-1]  # put last columns in y
x = test.values[:,:-1]  # put all columns from first to last but not including the last one in X
y = test.values[:,-1]   # put last columns in y

lb = LabelBinarizer()  # Binarize labels in a one-vs-all fashion
Y_b = lb.fit_transform(Y) # convert values in y into binary labels
y_b = lb.fit_transform(y)

log_reg = LogisticRegression(penalty="l2")
n_n = MLPClassifier()

log_reg.fit(X,Y_b)
n_n.fit(X,Y_b)

y_pred = log_reg.predict(x)  #Predict class labels for samples in X.
y_pred2 = n_n.predict(x)

print("Model accuracy:", accuracy_score(y_b,y_pred)) 
print("Model accuracy:", accuracy_score(y_b,y_pred2))
# Calculate the accuracy comparing the predicted labels with the ground truth


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model accuracy: 0.7983870967741935
Model accuracy: 0.5564516129032258


Model accuracy before combination: 
Linear regression 79.8% 
Neural Network 55.6%

### After you finish both logistic regression and neural network training on the morphology featrues, try to combine with the electrophysiology features and see how it will affect the result. You can find the corresponding cell in the electrophysiology features dataset according to the specimen_id.

In [1]:
df_ef = pd.read_csv("ElecPhyFeatures.csv",index_col=0)       # Read electrophysiology data
print(df.shape)
print(df_ef.shape)

df_cb = pd.merge(df,df_ef,how='inner',left_on='specimen_id', right_on='specimen_id',suffixes=('_mp','')) # Combine two data frames
print(df_cb.shape)

df_cb = df_cb.drop(columns='dendrite_type_mp') # dendrite_type was duplicated while merging two dataframes, drop out one of them.
print(df_cb.shape)
print(df_cb.columns)
df_cb.head(5)

df_cb.drop(columns=['id','neuron','reconstruction_type','scale_factor_x',\
                   'scale_factor_y','scale_factor_z','specimen_id','superseded''tags',\
                    'thumbnail_sweep_id','rheobas_sweep_id','id_map'],inplace=True)

df_cb.dropna(axis=1,inplace=True) # Drop 

NameError: name 'pd' is not defined

Now that you have combined the data, you must re-split it, train, and test it

In [39]:
train = df_cb.sample(frac=0.8,random_state=111) # random pick 80% for training
test = df_cb.drop(train.index)
print(train.shape)
print(test.shape)

(495, 84)
(124, 84)


Train and test in different groups now that data is combined

In [40]:
X2 = train.values[:,:-1] #put all columns from first to last but not including the last one in X
Y2 = train.values[:,-1]  # put last columns in y
x2 = test.values[:,:-1]  # put all columns from first to last but not including the last one in X
y2 = test.values[:,-1]   # put last columns in y

lb2 = LabelBinarizer()  # Binarize labels in a one-vs-all fashion
Y_b2 = lb.fit_transform(Y2) # convert values in y into binary labels
y_b2 = lb.fit_transform(y2)

log_regression = LogisticRegression(penalty="l2")
n_net = MLPClassifier()

log_regression.fit(X2,Y_b2)
n_net.fit(X2,Y_b2)

y_predict = log_reg.predict(x2)  #Predict class labels for samples in X.
y_predict2 = n_n.predict(x2)

print("Model accuracy:", accuracy_score(y_b,y_pred)) 
print("Model accuracy:", accuracy_score(y_b,y_pred2))
# Calculate the accuracy comparing the predicted labels with the ground truth


ValueError: could not convert string to float: '3D Neuron Reconstruction morphology'