In [1]:
import numpy as np
import pandas as pd
import os, sys

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas chained-assignment warnings and library deprecation notices.
warnings.filterwarnings('ignore')

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Keep the legacy import for old environments and fall back to the
# standalone `joblib` package everywhere else.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

from scipy.stats import linregress

# Project root; every data path in this notebook is joined onto it.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Make the project's `src` directory importable from the notebook.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random generator.
np.random.seed(2)

In [2]:
# Read the raw training and test CSVs from <basepath>/data/raw.
train, test = (
    pd.read_csv(os.path.join(basepath, relative_path))
    for relative_path in ('data/raw/training.csv', 'data/raw/sorted_test.csv')
)

**Absorbance levels at different wavelengths.**

- Include all of the features involving absorbance levels at different wavelengths.
- Optionally drop the CO2 band: passing `CO2_band=True` removes the columns from `m2379.76` through `m2352.76`.

In [4]:
class MIR:
    """Builds train/test frames holding only the mid-infrared absorbance columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw frames; the absorbance columns are located by position in
        ``train.columns`` and then selected by name from both frames.
    CO2_band : bool, default False
        Despite the name, ``True`` *removes* the CO2 band (columns from
        'm2379.76' through 'm2352.76', inclusive); ``False`` keeps every
        absorbance column.
    """

    def __init__(self, train, test, CO2_band=False):
        self.train = train
        self.test = test
        self.CO2_band = CO2_band

    def prepare(self):
        """Run the column selection; must be called before ``get_dataset``."""
        self.extract_absorbance_levels()

    def extract_absorbance_levels(self):
        """Populate ``self.train_`` and ``self.test_`` with the selected columns."""
        # Column 0 is PIDN; the 3578 absorbance columns follow it.
        spectral_cols = self.train.columns[1:3579]

        if self.CO2_band:
            # Cut the contiguous CO2 band out of the selection.
            names = list(spectral_cols)
            first = names.index('m2379.76')
            last = names.index('m2352.76')
            spectral_cols = spectral_cols.drop(spectral_cols[first:last + 1])

        self.train_ = self.train[spectral_cols]
        self.test_ = self.test[spectral_cols]

    def get_dataset(self):
        """Return the ``(train, test)`` frames produced by ``prepare``."""
        return self.train_, self.test_

In [6]:
# Dataset 1: MIR with its default flag, i.e. every absorbance column is kept.
d1 = MIR(train, test)
d1.prepare()
train_1, test_1 = d1.get_dataset()

# dump the dataset onto the disk
train_1_path = os.path.join(basepath, 'data/processed/dataset_1/train/train')
test_1_path = os.path.join(basepath, 'data/processed/dataset_1/test/test')
joblib.dump(train_1, train_1_path)
joblib.dump(test_1, test_1_path)

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_1/test/test',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_1/test/test_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_1/test/test_02.npy']

In [7]:
# Dataset 2: MIR with CO2_band=True, which drops the CO2 band columns.
d2 = MIR(train, test, CO2_band=True)
d2.prepare()
train_2, test_2 = d2.get_dataset()

# dump the dataset onto the disk
train_2_path = os.path.join(basepath, 'data/processed/dataset_2/train/train')
test_2_path = os.path.join(basepath, 'data/processed/dataset_2/test/test')
joblib.dump(train_2, train_2_path)
joblib.dump(test_2, test_2_path)

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_2/test/test',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_2/test/test_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_2/test/test_02.npy']

**Spatial data.**

* Spatial predictors from remote sensing data sources.

In [9]:
class Spatial:
    """Selects the block of columns this notebook treats as spatial predictors.

    The block is located by position, counted from the end of
    ``train.columns``, and then selected by name from both frames.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def prepare(self):
        """Run the column selection; must be called before ``get_dataset``."""
        self.extract_spatial_features()

    def extract_spatial_features(self):
        """Populate ``self.train_`` and ``self.test_`` with the spatial columns."""
        # 15 columns: from 21st-from-last up to (not including) 6th-from-last.
        spatial_cols = self.train.columns[-21:-6]

        self.train_ = self.train[spatial_cols]
        self.test_ = self.test[spatial_cols]

    def get_dataset(self):
        """Return the ``(train, test)`` frames produced by ``prepare``."""
        return self.train_, self.test_

In [10]:
# Dataset 3: spatial predictor columns only.
d3 = Spatial(train, test)
d3.prepare()
train_3, test_3 = d3.get_dataset()

# dump the dataset onto the disk
train_3_path = os.path.join(basepath, 'data/processed/dataset_3/train/train')
test_3_path = os.path.join(basepath, 'data/processed/dataset_3/test/test')
joblib.dump(train_3, train_3_path)
joblib.dump(test_3, test_3_path)

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_3/test/test',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_3/test/test_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_3/test/test_02.npy']

**Take the first-order derivative of the absorbance features.**

In [11]:
class Derivative:
    """Builds first-difference features from the absorbance columns.

    For every row, each absorbance column is replaced by its value minus the
    value in the next column to the right; columns containing any NaN
    afterwards (at least the last one) are dropped.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def prepare(self):
        """Compute the differences; must be called before ``get_dataset``."""
        self.take_derivative()

    def get_absorbance_features(self):
        """Return the absorbance column labels (positions 1..3578 of train)."""
        return self.train.columns[1:3579]

    def take_derivative(self):
        """Populate ``self.train_`` and ``self.test_`` with differenced columns."""
        spectral_cols = self.get_absorbance_features()

        # periods=-1 along axis=1 subtracts the right-hand neighbour column
        train_delta = self.train[spectral_cols].diff(periods=-1, axis=1)
        self.train_ = train_delta.dropna(axis=1)

        test_delta = self.test[spectral_cols].diff(periods=-1, axis=1)
        self.test_ = test_delta.dropna(axis=1)

    def get_dataset(self):
        """Return the ``(train, test)`` frames produced by ``prepare``."""
        return self.train_, self.test_

In [12]:
# Dataset 4: first differences of the absorbance columns.
d4 = Derivative(train, test)
d4.prepare()
train_4, test_4 = d4.get_dataset()

# dump the dataset onto the disk
train_4_path = os.path.join(basepath, 'data/processed/dataset_4/train/train')
test_4_path = os.path.join(basepath, 'data/processed/dataset_4/test/test')
joblib.dump(train_4, train_4_path)
joblib.dump(test_4, test_4_path)

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_4/test/test',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_4/test/test_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/dataset_4/test/test_02.npy']

**All of the above datasets assume that every feature is required for each of the target variables.**

**Prepare a dataset after feature selection.**

* Group the training samples into 8 equal-width bins (the bins are cut on the first feature column).
* Average each feature and the target variable within every bin.
* Calculate the squared Pearson correlation ($r^2$) between the binned feature means and the binned target means.
* Store the feature relevance dataframe instead of the data.

In [57]:
class FeatureRelevance:
    """Scores every predictor column against each of the five soil targets.

    Samples are grouped into bins, the predictor and the target are averaged
    per bin, and the r-squared of a linear fit between the two series of bin
    means is reported as the relevance of that predictor.
    """

    def __init__(self, train):
        self.train = train

    def get_absorbance_features(self):
        """Return all column labels except the first one and the last five.

        Despite the name, ``prepare`` expects this selection to contain a
        'Depth' column as well, so it is not limited to absorbance columns.
        """
        features = self.train.columns

        return features[1:-5]

    def prepare(self):
        """Build the feature-relevance frame.

        Returns
        -------
        pandas.DataFrame
            One row per predictor with a 'variables' column and one
            '<target>_r_squared' column for each of Ca, P, Sand, SOC and pH.
        """
        predictor_columns = self.get_absorbance_features()

        # Work on an explicit copy so that adding the encoded Depth column
        # never writes into a slice of the caller's frame.
        self.train_ = self.train[predictor_columns].copy()

        # Encode Depth as 1 for top soil, 0 otherwise. The match ignores case
        # so that 'TopSoil' and 'Topsoil' are both recognised.
        # `np.int` no longer exists in NumPy >= 1.24; the builtin is equivalent.
        depth_labels = self.train_['Depth'].astype(str).str.lower()
        self.train_['Depth'] = (depth_labels == 'topsoil').astype(int)

        feature_rel = {'variables': self.train_.columns}
        for target in ('Ca', 'P', 'Sand', 'SOC', 'pH'):
            feature_rel['%s_r_squared' % target] = self.feature_relevance(
                self.train_, self.train[target]
            )

        feature_rel_df = pd.DataFrame(feature_rel)

        return feature_rel_df

    @staticmethod
    def calculate_r_squared(X, y):
        """Return the r-squared of a simple linear regression of ``y`` on ``X``.

        Returns 0.0 when fewer than two points are available or when every
        ``X`` value is identical, because no linear relationship can be
        measured there (recent SciPy raises ``ValueError`` in the latter case).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if X.size < 2 or np.ptp(X) == 0:
            return 0.0

        slope, intercept, r_value, p_value, std_err = linregress(X, y)
        return r_value ** 2

    @staticmethod
    def calculate_score(X, y, bins):
        """Average ``X`` and ``y`` within each bin and score the bin means.

        Parameters
        ----------
        X, y : pandas.Series
            Predictor and target values sharing the index of ``bins``.
        bins : pandas.Series
            Bin label per sample; NaN labels are ignored.
        """
        # Iterate over the labels that actually occur. Using
        # range(bins.nunique()) breaks when a bin is empty: the labels then
        # have a gap, an empty mask yields a NaN mean, and a populated bin at
        # the high end is skipped.
        occupied = np.sort(bins.dropna().unique())

        X_agg = np.array([X.loc[bins == label].mean() for label in occupied])
        y_agg = np.array([y.loc[bins == label].mean() for label in occupied])

        return FeatureRelevance.calculate_r_squared(X_agg, y_agg)

    @staticmethod
    def feature_r_squared(X, y, features, bins):
        """Return the ``calculate_score`` value for every column in ``features``."""
        return [FeatureRelevance.calculate_score(X[feature], y, bins) for feature in features]

    @staticmethod
    def feature_relevance(X, y, nbins=8):
        """Score every column of ``X`` against ``y`` using ``nbins`` bins.

        NOTE(review): the bins are cut once, on the first column of ``X``, and
        reused for every other column — confirm that this is intended rather
        than binning each feature on its own values.
        """
        features = X.columns
        bins = pd.cut(X[features[0]], nbins, labels=False)

        return FeatureRelevance.feature_r_squared(X, y, features, bins)
    

In [58]:
# Score every predictor column of the training frame against the five targets;
# `feature_rel_df` is the frame returned by FeatureRelevance.prepare().
f = FeatureRelevance(train)
feature_rel_df = f.prepare()

In [59]:
# store the feature relevance dataframe onto disk
feat_rel_path = os.path.join(basepath, 'data/interim/feature_relevance/feat_rel_df')
joblib.dump(feature_rel_df, feat_rel_path)

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/feature_relevance/feat_rel_df',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/feature_relevance/feat_rel_df_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/feature_relevance/feat_rel_df_02.npy']

**Prepare a dataset from the feature relevance dataframe.**

In [3]:
# load feature relevance dataframe from the disk
feat_rel_path = os.path.join(basepath, 'data/interim/feature_relevance/feat_rel_df')
feature_rel_df = joblib.load(feat_rel_path)

In [16]:
class Relevant_Features:
    """Writes one train/test feature set per target, keeping the top-ranked columns.

    The ranking comes from a feature-relevance frame that has a 'variables'
    column and one '<label>_r_squared' column per target label.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test
        self.labels = ['Ca', 'P', 'Sand', 'SOC', 'pH']

    def prepare(self, feature_rel_df, n_features=1000):
        """Rank the features for every label, then dump the per-label datasets."""
        self.get_relevant_features(feature_rel_df, n_features)
        self.save_featureset_per_target()

    def get_relevant_features(self, rel_df, n_features):
        """Return, per label, the first ``n_features`` variable names by descending score.

        The list (one Series per entry of ``self.labels``, in the same order)
        is also stored on ``self.relevant_features``.
        """
        ranked = []

        for label in self.labels:
            by_score = rel_df.sort_values(by='%s_r_squared'%(label), ascending=False)
            ranked.append(by_score['variables'][:n_features])

        self.relevant_features = ranked
        return ranked

    def save_featureset_per_target(self):
        """Select each label's columns from train/test and dump them under dataset_5."""
        for position, label in enumerate(self.labels):
            columns = self.relevant_features[position]
            train_, test_ = self.train[columns], self.test[columns]

            joblib.dump(train_, os.path.join(basepath, 'data/processed/dataset_5/%s/train/train'%label))
            joblib.dump(test_, os.path.join(basepath, 'data/processed/dataset_5/%s/test/test'%label))

            print('Dumped datasets for label: %s'%(label))

In [17]:
# Dataset 5: for each target label, keep the top-ranked columns from
# `feature_rel_df` (n_features defaults to 1000) and dump train/test to disk.
d5 = Relevant_Features(train, test)
d5.prepare(feature_rel_df)

Dumped datasets for label: Ca
Dumped datasets for label: P
Dumped datasets for label: Sand
Dumped datasets for label: SOC
Dumped datasets for label: pH
