# Drug to Drug Interaction (DDI) - Model Training

- Load the ./data/ssp_interaction_type.csv.gz
- Process the features
  - Set the categorical features names
  - Set the numeric features names  
  - Set the target variable
- Split the data
  - train/validation/test split with 60%/20%/20% distribution.
  - Random_state 42
  - Use strategy = y to deal with the class imbalanced problem
- Train the model
  - LogisticRegression
  - RandomForestClassifier
  - XGBClassifier
  - DecisionTreeClassifier
- Evaluate the models and compare them
  - accuracy_score
  - precision_score
  - recall_score
  - f1_score
- Confusion Matrix

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


In [2]:
# open the csv file and read it into a pandas dataframe 
df = pd.read_csv('./data/ssp_interaction_type.csv.gz', compression='gzip')
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191808 entries, 0 to 191807
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ssp               191808 non-null  float64
 1   interaction_type  191808 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 2.9 MB
None
        ssp  interaction_type
0  0.091837                 1
1  0.093023                 1
2  0.012346                 1
3  0.069307                 1
4  0.043103                 1


In [25]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

class DDITrainData():
    """
    Class to hold the training data for the DDI project
    """

    def __init__(self, df, target_variable='interaction_type'):
        self.df = df
        self.X = None
        self.y = None
        self.target_variable = target_variable
        self.categorical_features = None
        self.numerical_features = None
        # list of all features
        self.all_features = None
        
    def process_features(self):
        """
        Process the features for the model
        """        
        # get the features
        self.categorical_features = list(self.df.select_dtypes(include=['object']).columns)
        self.numerical_features = list(self.df.select_dtypes(include=[np.number]).columns)

        # remove the target feature from the list of numeric features
        if self.target_variable in self.numerical_features:
            self.numerical_features.remove(self.target_variable)

        print('Categorical features',self.categorical_features)
        print('Numerical features',self.numerical_features)
        print('Target feature',self.target_variable)

        # create a list of all features
        self.all_features = self.categorical_features + self.numerical_features
                
        return self.categorical_features, self.numerical_features
    
    def split_data(self, test_size=0.2, random_state=42):
        """
        Split the data into training and validation sets
        """
        # split the data in train/val/test sets, with 60%/20%/20% distribution with seed 1
        X = self.df[self.all_features]
        y = self.df[self.target_variable]
        X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

        # .25 splits the 80% train into 60% train and 20% val
        X_train, X_val, y_train, y_val  = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=random_state)

        X_train = X_train.reset_index(drop=True)
        X_val = X_val.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        y_val = y_val.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        # print the shape of all the data splits
        print('X_train shape', X_train.shape)
        print('X_val shape', X_val.shape)
        print('X_test shape', X_test.shape)
        print('y_train shape', y_train.shape)
        print('y_val shape', y_val.shape)
        print('y_test shape', y_test.shape)
        
        return X_train, X_val, y_train, y_val, X_test, y_test

In [26]:
# Process the features
target_variable = 'interaction_type'

# create an instance of the DDITrainData class to process the data
train_data = DDITrainData(df, target_variable=target_variable)

# get the features and target series
cat_features, num_features = train_data.process_features()


Categorical features []
Numerical features ['ssp']
Target feature interaction_type


In [27]:
# split the data in train/val/test sets
# use 60%/20%/20% distribution with seed 1
# use stratified sampling to ensure the distribution of the target feature is the same in all sets
X_train, X_val, y_train, y_val, X_test, y_test = train_data.split_data(test_size=0.2, random_state=42)

print(X_train.head())

X_train shape (115084, 1)
X_val shape (38362, 1)
X_test shape (38362, 1)
y_train shape (115084,)
y_val shape (38362,)
y_test shape (38362,)
        ssp
0  0.121622
1  0.116279
2  0.082353
3  0.091954
4  0.117647


In [93]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

class DDIModelFactory():
    """
    Factory class for DDI prediction model    
    """    

    def __init__(self, categorical_features, numeric_features):
        # Initialize the preprocessing transformers
        self.scaler = StandardScaler()        
        self.encoder = DictVectorizer(sparse=False)

        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        
        self.models = None
        self.model = None

    def preprocess_data(self, X, is_training=True):      
        """
        Preprocess the data for training or validation
        """  
        X_dict = X.to_dict(orient='records')
        
        if is_training:
            X_std = self.encoder.fit_transform(X_dict)        
        else:
            X_std = self.encoder.transform(X_dict)
            
        print(f'Preprocess X shape {X.shape} training {is_training}')   
        # Return the standardized features and target variable
        return X_std
    
    def preprocess_target(self, y):
        """
        Preprocess the target variable to make sure the data starts from 0 and is continuous
        The target range starts at 1, so we need to subtract 1 from the target variable
        """
        # encode the target variable
        min = y.min()
        max = y.max()
        y_encoded = y
        
        if min != 0:
            print('Min target value is not 0, encoding  y - 1')
            y_encoded = y - 1

        return y_encoded
        
    def train(self, X_train, y_train, reset=False, random_state=42, reg=10, estimators=100, iter=1000, depth=5):
        """
         Train the models
        """        
        if self.models is None or reset:
            self.models = {
                'logistic_regression': LogisticRegression(C=10, max_iter=iter, random_state=random_state, n_jobs=-1),
                'random_forest': RandomForestClassifier(n_estimators=estimators, max_depth=5, random_state=random_state, n_jobs=-1),
                'xgboost': XGBClassifier(n_estimators=estimators, max_depth=depth, random_state=random_state, n_jobs=-1),                
                'decision_tree': DecisionTreeClassifier(max_depth=depth, random_state=random_state)
            }
        
        for model in self.models.keys():
            print('Training model', model)
            self.models[model].fit(X_train, y_train)            

    def evaluate(self, X_val, y_val, average='macro'):
        """
        Evaluate the model on the validation data set and return the metrics
        """

        # create a dataframe to store the metrics
        df_metrics = pd.DataFrame(columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'y_pred'])

        # define the metrics to be calculated
        fn_metrics = { 'accuracy': accuracy_score,'precision': precision_score,'recall': recall_score,'f1': f1_score}

        # loop through the models and get its metrics
        for model_name in self.models.keys():
            
            model = self.models[model_name]
            y_pred = model.predict(X_val)
                        
            # add a new row to the dataframe for each model            
            df_metrics.loc[len(df_metrics)] = [model_name, 0, 0, 0, 0, y_pred]

            # get the row index
            row_index = len(df_metrics)-1

            # Evaluate the model metrics
            for metric in fn_metrics.keys():

                # determine which metrics call and use the corresponding average and zero_division parameters
                score = 0
                                
                if metric == 'accuracy':
                    score = fn_metrics[metric](y_val, y_pred)
                elif metric == 'precision':
                    score = fn_metrics[metric](y_val, y_pred, average=average, zero_division=0)                                
                else:
                    score = fn_metrics[metric](y_val, y_pred, average=average)
                                
                df_metrics.at[row_index,metric] = score
           
        return df_metrics

    def save(model_name, path):
        """
        Save the model
        """
        # get the model from the models dictionary
        model = self.models[model_name]

        if model is None:
            print('Model not found')
            return
            
        # save the model
        model.save(path)

            
    def predict(self, X_val):
        """
        Predict the target variable on the validation data set and return the predictions
        """        
        probs = self.model.predict_proba(X_val)
        return probs


In [68]:
# hot encode the categorical features for the train data
model_factory = DDIModelFactory(cat_features, num_features)
X_train_std = model_factory.preprocess_data(X_train[cat_features + num_features], True)

# hot encode the categorical features for the validation data
X_val_std = model_factory.preprocess_data(X_val[cat_features + num_features], False)

# preprocess the target variable
y_train_encoded = model_factory.preprocess_target(y_train)


Preprocess X shape (115084, 1) training True
Preprocess X shape (38362, 1) training False
Min target value is not 0, encoding  y - 1


In [69]:
# train the models
model_factory.train(X_train_std, y_train_encoded)


Training model logistic_regression


Training model random_forest
Training model xgboost
Training model decision_tree


In [71]:
# Evaluate the model
y_val_encoded = model_factory.preprocess_target(y_val)
df_metrics = model_factory.evaluate(X_val_std, y_val_encoded, average='macro')

# print the metrics
df_metrics[['model','accuracy', 'precision', 'recall', 'f1']].head()


Min target value is not 0, encoding  y - 1


Unnamed: 0,model,accuracy,precision,recall,f1
0,logistic_regression,0.314556,0.003701,0.011763,0.00563
1,random_forest,0.314608,0.003701,0.011765,0.005631
2,xgboost,0.31539,0.008824,0.012044,0.006608
3,decision_tree,0.315234,0.011737,0.011852,0.005911


In [92]:
# fine tune the model hyperparameters
model_factory.models = None
model_factory.train(X_train_std, y_train_encoded, estimators=100, iter=1000, depth=5)



{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 5,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_parallel_tree': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}