In [1]:
 #import pandas and numpy libraries
import pandas as pd
import numpy as np

In [3]:
#Mount Google Drive inside the Colab runtime so that files stored there can be read
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point)

Mounted at /content/gdrive


In [4]:
#Load the data from the mounted Drive and replace nulls with 0.
#The path is absolute (rooted at the '/content/gdrive' mount point) so that the read
#does not depend on the kernel's current working directory.
df = pd.read_csv('/content/gdrive/MyDrive/full_table_4_SOMs.csv')
#NOTE(review): every NaN in every column becomes 0 here - confirm that 0 is a sensible
#stand-in for missing measurements before trusting the scores computed from this frame.
df = df.replace(np.nan,0)
print(df)

       Unnamed: 0  Unnamed: 0.1  ...  SOM_Cluster  Distribution
0               0         98607  ...            2             1
1               1         67595  ...            3             1
2               2        121125  ...            3             1
3               3         99994  ...            0             0
4               4        117589  ...            2             1
...           ...           ...  ...          ...           ...
19995       19995         95340  ...            2             1
19996       19996         21925  ...            0             0
19997       19997         63677  ...            2             1
19998       19998         68229  ...            2             1
19999       19999         24723  ...            2             1

[20000 rows x 36 columns]


In [5]:
#Eleven predictor columns used by the feature-selection steps
feature_columns = [
    'length', 'abundance', 'latitude', 'longitude', 'depth',
    'nitrate', 'temperature', 'salinity', 'oxygen',
    'darwin_alkalinity', 'SOM_Cluster',
]
X = df[feature_columns]
#Target kept as a single-column DataFrame (double brackets), not a Series
y = df[['c_to_n']]
X.shape

(20000, 11)

In [6]:
#Univariate feature selection: F-statistic and p-value of each feature against the target
#y is a single-column DataFrame, so ".values.ravel()" flattens it to a 1D array
from sklearn.feature_selection import f_regression
X_new = f_regression(X,y.values.ravel())
#f_regression returns a pair of arrays: (F-statistics, p-values), one entry per column of X
f_statistics, p_values = X_new
#Display labels, listed in the same order as the columns of X
feature_labels = ['Length', 'Abundance', 'Latitude', 'Longitude', 'Depth', 'Nitrate',
                  'Temperature', 'Salinity', 'Oxygen', 'Alkalinity', 'SOM Cluster']
print('Results (F-statistic | p-value):')
for label, f_statistic, p_value in zip(feature_labels, f_statistics, p_values):
    print(label + ': ' + str(f_statistic) + ' | ' + str(p_value))

Results (F-statistic | p-value):
Length: 51.646945106615824 | 6.877053389419566e-13
Abundance: 0.08918457170523508 | 0.7652194773838653
Latitude: 1.2196470157592918 | 0.26944322400967785
Longitude: 1.9619159031749582 | 0.16132408010447752
Depth: 99.26301944618356 | 2.5059600185508335e-23
Nitrate: 75.74869921061523 | 3.4670952770832803e-18
Temperature: 22.085328313801433 | 2.625317773963414e-06
Salinity: 0.18943916603549654 | 0.6633885022720774
Oxygen: 2.494904253137154 | 0.11423113791878442
Alkalinity: 46.177614490443716 | 1.1104403869733972e-11
SOM Cluster: 21211.82051780836 | 0.0


In [7]:
#Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=22,
)

In [27]:
#Mutual Information
#Mutual information (MI) between two random variables is a non-negative value that measures the dependency between
#the variables. It is equal to zero if and only if the two variables are independent, and higher values mean higher dependency.
from functools import partial
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
#mutual_info_regression is randomized (it accepts a random_state), so an unseeded run gives
#slightly different scores - and possibly a different top-6 - on every execution.
#Pin the seed so the ranking is reproducible; 22 matches the seed of the train / test split.
seeded_mutual_info = partial(mutual_info_regression, random_state=22)
selector = SelectKBest(seeded_mutual_info, k=6)
#Scores are estimated on the training rows only; `train` holds the 6 highest-scoring columns
train = selector.fit_transform(X_train, y_train.values.ravel())
#Display labels, listed in the same order as the columns of X_train
feature_labels = ['Length', 'Abundance', 'Latitude', 'Longitude', 'Depth', 'Nitrate',
                  'Temperature', 'Salinity', 'Oxygen', 'Alkalinity', 'SOM Cluster']
print('Results:')
for label, score in zip(feature_labels, selector.scores_):
    print(label + ': ' + str(score))

Results:
Length: 0.13828431114277606
Abundance: 0.0
Latitude: 0.011483774380223188
Longitude: 0.006389781306366693
Depth: 0.012097781077716085
Nitrate: 0.013288188413607571
Temperature: 0.009891251582668126
Salinity: 0.025286985638579118
Oxygen: 0.02010898101479075
Alkalinity: 0.019199197845435556
SOM Cluster: 0.7590060953324422


In [9]:
#ANOVA-style F-test: keep the 6 features with the largest f_regression F-statistic,
#scored on the training split only
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
fvalue_selector = SelectKBest(f_regression, k=6)
X_kbest = fvalue_selector.fit_transform(X_train, y_train.values.ravel())

#Display labels, listed in the same order as the columns of X_train
f_test_labels = ('Length', 'Abundance', 'Latitude', 'Longitude', 'Depth', 'Nitrate',
                 'Temperature', 'Salinity', 'Oxygen', 'Alkalinity', 'SOM Cluster')
print('Results:')
for position, label in enumerate(f_test_labels):
    print(label + ': ' + str(fvalue_selector.scores_[position]))

Results:
Length: 34.644423844942686
Abundance: 0.07826366602442014
Latitude: 3.3148990430766165
Longitude: 1.935223471796427
Depth: 69.13381702030533
Nitrate: 51.51013720193876
Temperature: 17.259580905127635
Salinity: 0.6145144624837379
Oxygen: 1.3169084227647034
Alkalinity: 33.16747802758853
SOM Cluster: 15114.738905641088


In [10]:
#Compatibility shim: register the standalone joblib package in sys.modules under the
#dotted name 'sklearn.externals.joblib', so that later imports of that name resolve to it.
#NOTE(review): presumably needed because the installed mlxtend (imported in the next cell)
#still imports joblib through sklearn.externals - confirm; this cell must run before that import.
import sys
import joblib
sys.modules['sklearn.externals.joblib'] = joblib

In [25]:
#Sequential Feature Selection - Forward
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

#Forward (non-floating) selection of 9 features around a linear regression,
#with candidates scored by R^2 using cv=4
sfs = SequentialFeatureSelector(
    LinearRegression(),
    k_features=9,
    forward=True,
    floating=False,
    scoring='r2',
    cv=4,
)
sfs = sfs.fit(X_train, y_train.values.ravel())

# print the results
#k_feature_idx_ holds column positions; map them back to column names
chosen_positions = list(sfs.k_feature_idx_)
selected_features = X_train.columns[chosen_positions]
print('Selected Features:')
print(selected_features)
print('Final Prediction Score:')
print(sfs.k_score_)

Selected Features:
Index(['length', 'abundance', 'latitude', 'longitude', 'depth', 'nitrate',
       'salinity', 'darwin_alkalinity', 'SOM_Cluster'],
      dtype='object')
Final Prediction Score:
0.5269346727519564
