# pistachio


In [1]:
# convert arff to parquet

from scipy.io import arff 
import pandas as pd
import os

arff_filepath = './data/Pistachio_Dataset/Pistachio_16_Features_Dataset/Pistachio_16_Features_Dataset.arff'
parquet_path = './data/pistachio_16.snappy.pqt'

def arff_to_parquet(input_arff: str, output_parquet: str):
    """convert arff file to parquet"""
    if not os.path.exists(input_arff):
        raise ValueError(f"input file '{input_arff}' does not exist")
    data, meta = arff.loadarff(input_arff)
    print("arff metadata")
    print(meta)
    df = pd.DataFrame(data)
    df['Class'] = df['Class'].astype(str)
    df.to_parquet(output_parquet)
##################

if not os.path.exists(parquet_path):
    print("converting arff to parquet")
    arff_to_parquet(arff_filepath, parquet_path)


In [2]:
# Dataset: Pistachio_16_Features_Dataset
# 	AREA's type is numeric
# 	PERIMETER's type is numeric
# 	MAJOR_AXIS's type is numeric
# 	MINOR_AXIS's type is numeric
# 	ECCENTRICITY's type is numeric
# 	EQDIASQ's type is numeric
# 	SOLIDITY's type is numeric
# 	CONVEX_AREA's type is numeric
# 	EXTENT's type is numeric
# 	ASPECT_RATIO's type is numeric
# 	ROUNDNESS's type is numeric
# 	COMPACTNESS's type is numeric
# 	SHAPEFACTOR_1's type is numeric
# 	SHAPEFACTOR_2's type is numeric
# 	SHAPEFACTOR_3's type is numeric
# 	SHAPEFACTOR_4's type is numeric
# 	Class's type is nominal, range is ('Kirmizi_Pistachio', 'Siit_Pistachio')

## Load Data
load data from parquet, stratify split to train and test

In [3]:
from typing import List
import numpy as np
from sklearn.model_selection import train_test_split

def split_data(
        input_parquet: str, 
        train_filename: str,
        test_filename: str,
        label_column: str,
        test_fraction: float=0.2,
        seed: int=42):
    """stratify sample the data"""
    # set seed
    # np.random.seed(seed)
    in_df = pd.read_parquet(input_parquet)
    y = in_df.pop(label_column)
    x_train, x_test, y_train, y_test = train_test_split(
        in_df, 
        y, 
        random_state=seed, 
        stratify=y, 
        test_size=test_fraction)
    # reattach labels
    x_train[label_column] = y_train
    x_test[label_column] = y_test
    print(f'x_train shape = {x_train.shape}')
    print(f'y_train shape = {y_train.shape}')
    print(f'x_test shape = {x_test.shape}')
    print(f'y_test shape = {y_test.shape}')
    # write data
    x_train.to_parquet(train_filename)
    x_test.to_parquet(test_filename)
##############################

train_path = './data/pistachio_train.pqt'
test_path = './data/pistachio_test.pqt'
split_seed = 41
label_column = 'Class'
test_fraction = 0.2
input_data_schema_path = "./data/pistachio_schema.json"

if not (os.path.exists(train_path) and os.path.exists(test_path)):
    split_data(
        parquet_path,
        train_path,
        test_path,
        label_column=label_column,
        test_fraction=test_fraction,
        seed=split_seed)
    print('split train and test data')


## Validate Data

In [4]:
#! pip install pandera[io]

In [5]:
# very quick EDA/summary on the train data
train = pd.read_parquet(train_path)
summary = train.describe(include='all')
summary

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,ROUNDNESS,COMPACTNESS,SHAPEFACTOR_1,SHAPEFACTOR_2,SHAPEFACTOR_3,SHAPEFACTOR_4,Class
count,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718.0,1718
unique,,,,,,,,,,,,,,,,,2
top,,,,,,,,,,,,,,,,,Kirmizi_Pistachio
freq,,,,,,,,,,,,,,,,,985
mean,79990.176368,1423.800903,446.184575,238.404956,0.840033,318.003064,0.940279,85048.962165,0.715327,1.897053,0.571019,0.713397,0.005698,0.003016,0.510931,0.955492,
std,13075.73813,375.345641,32.531725,30.37299,0.048895,26.853806,0.050205,13180.036678,0.053204,0.239194,0.212615,0.044681,0.000825,0.000333,0.064074,0.050951,
min,29808.0,858.363,320.3445,133.5096,0.5049,194.8146,0.588,37935.0,0.4272,1.1585,0.0628,0.476,0.004,0.0024,0.2266,0.6204,
25%,71859.5,1169.58875,426.33705,217.92545,0.817225,302.4803,0.920025,76334.0,0.685225,1.735225,0.375375,0.682225,0.0052,0.0028,0.465425,0.944,
50%,79997.5,1261.599,448.89775,236.29945,0.8492,319.14885,0.9545,85097.0,0.7258,1.8935,0.6441,0.71085,0.0056,0.003,0.50535,0.9733,
75%,89018.0,1603.212,468.6313,257.292375,0.8751,336.661875,0.976775,93919.0,0.753575,2.06665,0.7445,0.7423,0.0061,0.0032,0.551,0.9872,


In [6]:

import pandera as pa
# schema = pa.infer_schema(train)
# print(schema)
from pandera import Check, Column, DataFrameSchema
# define schema based on inspecting the training data above
if not os.path.exists(input_data_schema_path):
# if True:
    print("creating schema")
    schema = DataFrameSchema(
        columns={
            "AREA": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=100.0),
                    Check.less_than_or_equal_to(max_value=1e6)
                ],
                nullable=False
            ),
            "PERIMETER": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=100.0),
                    Check.less_than_or_equal_to(max_value=1e6)
                ],
                nullable=False
            ),
            "MAJOR_AXIS": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=10.0),
                    Check.less_than_or_equal_to(max_value=1e6)
                ],
                nullable=False
            ),
            "MINOR_AXIS": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=10.0),
                    Check.less_than_or_equal_to(max_value=1e6)
                ],
                nullable=False
            ),
            "ECCENTRICITY": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "EQDIASQ": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=100.0),
                    Check.less_than_or_equal_to(max_value=1e6)
                ],
                nullable=False
            ),
            "SOLIDITY": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "CONVEX_AREA": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=1000.0),
                    Check.less_than_or_equal_to(max_value=1e6)
                ],
                nullable=False
            ),
            "EXTENT": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "ASPECT_RATIO": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=100.0)
                ],
                nullable=False
            ),
            "ROUNDNESS": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "COMPACTNESS": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "SHAPEFACTOR_1": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "SHAPEFACTOR_2": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "SHAPEFACTOR_3": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "SHAPEFACTOR_4": Column(
                dtype="float64",
                checks=[
                    Check.greater_than_or_equal_to(min_value=0.0),
                    Check.less_than_or_equal_to(max_value=1.0)
                ],
                nullable=False
            ),
            "Class": Column(
                dtype="object",
                checks=[
                    Check.isin(['Siit_Pistachio', 'Kirmizi_Pistachio'])
                ],
                nullable=False
            )
        }
    )
    print(schema)
    schema.to_json(input_data_schema_path)
    print(f"wrote schema to {input_data_schema_path}")


In [7]:

# Validate Data (dtypes, count nulls)
# preprocess - feature engineering, cast class to 1/0
# tune - hyperopt or bayes_opt
# stash metadata/experiment results
# train with best parameters
# evaluate
# run inference on dummy "new" data
from pandera import DataFrameSchema
def validate_data(in_df: pd.DataFrame, schema_file: str) -> pd.DataFrame:
    """check input data, count nulls, basic stats"""
    # load schema
    the_schema = DataFrameSchema.from_json(schema_file)
    the_schema.validate(in_df)

    
    
    # summary = in_df.describe(include='all')
    # # check for entirely missing columns
    # entirely_missing = [x for x in in_df.columns if summary.loc['count', x] == 0]
    # if entirely_missing:
    #     raise ValueError(f'following columns in supplied data are missing: {entirely_missing}')
    # # check that columns have more than one unique value
    # single_value_columns = [x for x in in_df.columns if summary.loc['unique', column] == 1]
    # if entirely_missing:
    #     raise ValueError(f'following columns in supplied data are missing: {entirely_missing}')

validate_data(train, input_data_schema_path)

print("no errors, data looks good")



# train['Class'] = train.Class.astype('category')
# train.Class.cat.codes

# https://pandas.pydata.org/docs/user_guide/categorical.html#controlling-behavior


no errors, data looks good


In [8]:
from pandas.api.types import CategoricalDtype
def preprocess(in_raw_df: pd.DataFrame) -> pd.DataFrame:
    """preprocess the data, do any cleaning, feature engineering, etc"""
    out_df = in_raw_df.copy()

    #cross some features
    out_df['SOLIDITY_MAJOR'] = out_df.SOLIDITY*out_df.MAJOR_AXIS

    # reorder
    cols = [x for x in out_df.columns if x != 'Class']
    out_df = out_df[cols + ['Class']]

    # convert Class to categorical
    class_type = CategoricalDtype(categories=['Siit_Pistachio', 'Kirmizi_Pistachio'])
    out_df.Class = out_df.Class.astype(class_type)
    # create a binary column
    out_df['Target'] = out_df.Class.cat.codes

    return out_df

train_proc = preprocess(train)
train_proc.head()


    
    

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,ROUNDNESS,COMPACTNESS,SHAPEFACTOR_1,SHAPEFACTOR_2,SHAPEFACTOR_3,SHAPEFACTOR_4,SOLIDITY_MAJOR,Class,Target
1903,101005.0,1382.8,476.3697,279.0725,0.8104,358.6134,0.9478,106565.0,0.7732,1.707,0.6638,0.7528,0.0047,0.0028,0.5667,0.9674,451.503202,Siit_Pistachio,0
1168,96240.0,1427.5699,476.6801,264.1641,0.8324,350.0522,0.9703,99186.0,0.7946,1.8045,0.5934,0.7344,0.005,0.0027,0.5393,0.9731,462.522701,Kirmizi_Pistachio,1
716,48509.0,1020.717,439.9729,143.3477,0.9454,248.5228,0.9792,49537.0,0.7156,3.0693,0.5851,0.5649,0.0091,0.003,0.3191,0.9793,430.821464,Kirmizi_Pistachio,1
1570,100634.0,1260.0811,481.9576,266.369,0.8334,357.9542,0.9925,101396.0,0.7059,1.8094,0.7964,0.7427,0.0048,0.0026,0.5516,0.9981,478.342918,Siit_Pistachio,0
889,81407.0,1244.441,497.262,220.6327,0.8962,321.9482,0.9254,87965.0,0.5527,2.2538,0.6606,0.6474,0.0061,0.0027,0.4192,0.9447,460.166255,Kirmizi_Pistachio,1


## cross validation/tuning

In [9]:
import sklearn
from sklearn.model_selection import StratifiedKFold, cross_validate
from typing import Dict

def cross_validate_estimator(train_X: pd.DataFrame, train_Y: pd.DataFrame, clf: sklearn.base.BaseEstimator, metrics: Dict, n_folds: int=5, cv_seed:int=23, n_jobs: int=2):
    """for a given set of model parameters, use cross validation to evaluate model performance"""
    
    # generate cv_folds
    cv_folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)

    results = cross_validate(clf, train_X, train_Y, cv=cv_folds, scoring=metrics, n_jobs=n_jobs)

    return results
    


In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, make_scorer

params = {
    'learning_rate': 0.1,
    'booster': 'gbtree',
    'n_jobs': 1,
    'gamma': 0.01,
    'min_child_weight': 0.05,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.3,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

cv_seed = 37
metrics = {
    "roc_auc": make_scorer(roc_auc_score),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "f1_score": make_scorer(f1_score)
}

cols = train_proc.columns
features = [x for x in cols if x not in ['Class','Target']]
Xx = train_proc[features]
Yy = train_proc.Target.values

clf = XGBClassifier(objective='binary:logistic', eval_metric='auc', **params)

results = cross_validate_estimator(Xx, Yy, clf, metrics, cv_seed=cv_seed, n_jobs=2)
agged_results = {k:(np.mean(v), np.std(v)) for k,v in results.items()}

agged_results


{'fit_time': (0.12723264694213868, 0.003507726389684401),
 'score_time': (0.006651782989501953, 0.0006431507328528009),
 'test_roc_auc': (0.8691124721914356, 0.010111355494033258),
 'test_precision': (0.8864885217567421, 0.011389116123450178),
 'test_recall': (0.8923857868020304, 0.031684236617923335),
 'test_f1_score': (0.8889775847285855, 0.013208642189853053)}

In [13]:
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours

fixed_parameters = {
    "booster": "gbtree",
    "n_jobs": 1,
    'colsample_bytree': 0.3
}
pbounds = {
    'learning_rate': (0.01, 0.3),
    'gamma': (0.0, 0.3),
    'min_child_weight': (0.01, 0.07),
    'max_depth': (3, 5),
    'subsample': (0.7, 0.9),
    'reg_alpha': (0.01, 0.1),
    'reg_lambda': (0.01, 0.1)
}
integer_parameters = ['max_depth']

def cast_integer_params(params: Dict, integer_params: List[str]):
    """cast floats in param values to integers"""
    for x in integer_parameters:
            if x in params:
                params[x] = int(params[x])
    return params
    

def optimise_tune(
    train_x: pd.DataFrame,
    train_y: pd.DataFrame,
    pbounds: Dict, 
    fixed_parameters: Dict,
    integer_parameters: List[str],
    metrics: Dict, 
    cv_seed: int, 
    n_folds: int=5,
    opt_n_init: int=10,
    opt_n_iter: int=20,
    opt_random_seed: int=42
):
    """use Bayesian optimisation to search for optimal model hyperparameters"""

    # initialise list to hold (detailed) experiment results
    trials = []
    # discrete parameters need to be handled specially in bayesopt (explicitly cast to int)
    

    # function to run a trial - evaluate a given set of searchable parameters
    def run_trial(**probe_params):
        params = {**probe_params, **fixed_parameters}
        params = cast_integer_params(params, integer_parameters)
        

        # set up the XGBclassifier
        clf = XGBClassifier(objective='binary:logistic', eval_metric='auc', **params)

        # train/evaluate model through cross validation
        results = cross_validate_estimator(train_x, train_y, clf, metrics, cv_seed=cv_seed, n_jobs=2)

        # aggregate metrics over cv folds - gather mean and std deviation of each metric
        agged_results = {k:(np.mean(v), np.std(v)) for k,v in results.items()}

        # take the final score - the objective to be used for bayes_opt, as the lower bound of mean roc_auc (mean roc_auc minus error in mean roc_auc)
        # This penalises cases where the mean might be high, but where there is more variation across folds (more uncertainty in how the model may generalise).
        final_score = agged_results['test_roc_auc'][0] - agged_results['test_roc_auc'][1]/np.sqrt(n_folds)

        # append all the metrics to the trial result.
        trials.append( {"final_score": final_score, 'params': params, "results": agged_results})
        return final_score

    optimizer = BayesianOptimization(
        f = run_trial,
        random_state=opt_random_seed,
        pbounds=pbounds,
        verbose=2)

    optimizer.maximize(init_points=10,n_iter=40)

    print(f"best_result: {optimizer.max}")
    return optimizer.max, trials

# run the optimisation
best_trial, cv_experiments = optimise_tune(Xx, Yy, pbounds, fixed_parameters, integer_parameters, metrics, cv_seed, n_folds=5)

    
        

    



|   iter    |  target   |   gamma   | learni... | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.8613   [0m | [0m0.1124   [0m | [0m0.2857   [0m | [0m4.464    [0m | [0m0.04592  [0m | [0m0.02404  [0m | [0m0.02404  [0m | [0m0.7116   [0m |
| [95m2        [0m | [95m0.8707   [0m | [95m0.2599   [0m | [95m0.1843   [0m | [95m4.416    [0m | [95m0.01124  [0m | [95m0.09729  [0m | [95m0.08492  [0m | [95m0.7425   [0m |
| [0m3        [0m | [0m0.8703   [0m | [0m0.05455  [0m | [0m0.06319  [0m | [0m3.608    [0m | [0m0.04149  [0m | [0m0.04888  [0m | [0m0.03621  [0m | [0m0.8224   [0m |
| [95m4        [0m | [95m0.8731   [0m | [95m0.04185  [0m | [95m0.09472  [0m | [95m3.733    [0m | [95m0.03736  [0m | [95m0.08067  [0m | [95m0.02797  [0m | [95m0.8028   [0m |
| [0m5        [0m | [0m0.8653  

## Train final model

In [14]:
from xgboost import XGBClassifier

final_parameters = {**(best_trial['params']),**fixed_parameters}
final_parameters = cast_integer_params(final_parameters, integer_parameters)

def train_model(
    train_x: pd.DataFrame,
    train_y: pd.DataFrame,
    params: Dict
): 
    """Train a model on entire train set"""
    clf = XGBClassifier(objective='binary:logistic', eval_metric='auc', **params)

    model = clf.fit(train_x, train_y)
    return model
    
model = train_model(Xx, Yy, final_parameters)



In [15]:
model.get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': 'gbtree',
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.3,
 'eval_metric': 'auc',
 'gamma': 0.013486306220427203,
 'gpu_id': None,
 'grow_policy': None,
 'interaction_constraints': None,
 'learning_rate': 0.05456513134009019,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 3,
 'max_leaves': None,
 'min_child_weight': 0.04821252064927356,
 'monotone_constraints': None,
 'n_jobs': 1,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': 0.02118815334022752,
 'reg_lambda': 0.03214980200991906,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.7115962436536546,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

## Model Evaluation

In [None]:
# feature importance
# precision recall plot
# roc curve
# probability calibration
# confusion matrix


In [16]:
train_predicted_probs = model.predict_proba(Xx)[:,1]
# train_predicted_probs[0:5]
train_predicted_classes = model.predict(Xx)
train_predicted_classes[0:5]
train_predicted_labels = np.array([train_proc.Class.cat.categories[x] for x in train_predicted_classes])

In [17]:
train_predicted_labels

array(['Siit_Pistachio', 'Siit_Pistachio', 'Kirmizi_Pistachio', ...,
       'Siit_Pistachio', 'Kirmizi_Pistachio', 'Siit_Pistachio'],
      dtype='<U17')

In [18]:
def evaluation_metrics(predicted_probs, predicted_classes, actual_classes, prefix=None):
    """evaluate results"""
    results = {}
    prefix = '' if prefix is None else prefix
    results[f"{prefix}roc_auc_score"] = roc_auc_score(actual_classes, predicted_probs)
    results[f"{prefix}precision_score"] = precision_score(actual_classes, predicted_classes)
    results[f"{prefix}recall_score"] = recall_score(actual_classes, predicted_classes)
    results[f"{prefix}f1_score"] = f1_score(actual_classes, predicted_classes)
    results[f"{prefix}accuracy_score"] = accuracy_score(actual_classes, predicted_classes)
    return results

evaluation_metrics(train_predicted_probs, train_predicted_classes, Yy)
    

{'roc_auc_score': 0.970653942839731,
 'precision_score': 0.9148073022312373,
 'recall_score': 0.915736040609137,
 'f1_score': 0.9152714358193811,
 'accuracy_score': 0.9027939464493597}

In [37]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve
sns.set()

def make_roc_plot(predicted_probs, actual_classes, title: str="ROC curve", xlabel='False Positive Rate',ylabel: str='True Positive Rate'):
    """make a roc curve"""
    tpr, fpr, _ = roc_curve(actual_classes, predicted_probs)
    fig = plt.Figure()
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax.plot(fpr, tpr, color=sns.xkcd_rgb['blurple'], label='roc curve')
    ax.plot([0.0, 1.0],[0.0, 1.0], color=sns.xkcd_rgb['merlot'], linestyle='--', label='random')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    # fig.show()
    return fig, ax

    
make_roc_plot(train_predicted_probs, Yy, title='pistacio classifier roc curve - train data')


    


(<Figure size 640x480 with 1 Axes>,
 <Axes: title={'center': 'pistacio classifier roc curve - train data'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>)

In [36]:
import matplotlib as mpl
mpl.use('AGG')
display(roc_fig_train)

<Figure size 640x480 with 1 Axes>

In [None]:
# for x in train.columns:
#     schema = schema.update_column(x, nullable=False)
sc2 = pa.DataFrameSchema({
    'col1': pa.Column(str, nullable=True),
    'col2': pa.Column(int, nullable=False, unique=True)})
print(sc2)


In [None]:
print(schema)

In [None]:
#print(schema.to_script())

In [None]:
# import pandera
# pandera.__version__
import xgboost
xgboost.__version__