In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score,classification_report,confusion_matrix,precision_score

Using TensorFlow backend.


In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
ds = pd.read_csv('data/bmi.csv')

In [3]:
# Share of rows per target class (counts divided by the total number of rows).
ds['Index'].value_counts().div(len(ds))

5    0.396
4    0.260
2    0.138
3    0.136
1    0.044
0    0.026
Name: Index, dtype: float64

In [4]:
# Preview the first rows to see which columns are available.
ds.head()

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


#### Train test split, resampling

In [5]:
def train_test_resampled(X,y):
    """Stratified 80/20 train/test split with SMOTE oversampling of the training part.

    Only the training portion is passed through SMOTE; the test portion is
    returned exactly as it was drawn from ``X`` / ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Class labels, aligned row-for-row with ``X``.

    Returns
    -------
    tuple
        ``(X_res, y_res, X_test, y_test)`` where ``X_res`` / ``y_res`` are the
        SMOTE-resampled training features and labels.
    """
    sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)

    # Two splits are generated and the last one is used. This mirrors the
    # previous loop, which overwrote its variables on every iteration, so the
    # selected rows (and downstream results) stay the same.
    *_, (train_idx, test_idx) = sss.split(X, y)

    # split() yields positional row numbers, so they must be applied with
    # .iloc; .loc would look them up as index labels and pick wrong rows (or
    # raise) whenever X / y do not carry a default 0..n-1 index.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # resampling (training data only)
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    return X_res,y_res,X_test,y_test

In [6]:
# Features: every column except the target, with categorical columns one-hot encoded.
X = pd.get_dummies(ds.drop(columns='Index'))
# Target: the class label column.
y = ds['Index']

# Note: the "train" outputs are the SMOTE-resampled training set.
X_train, y_train, X_test, y_test = train_test_resampled(X, y)

#### Fit the models

In [7]:
def train_models(X_train,y_train):
    """Fit four classifiers on the given training data.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    dict
        Fitted estimators keyed by short name: ``'svc'``, ``'knn'``, ``'rf'``
        and ``'gb'``.
    """

    # svc
    svc = SVC(C=1.0,
            kernel='rbf',
            degree=3,
            gamma='scale',
            random_state=42)

    svc.fit(X_train,y_train)

    # knn
    knn = KNeighborsClassifier(n_neighbors=5,
                                weights='uniform',
                                algorithm='auto')
    knn.fit(X_train,y_train)

    # random forest
    # Seeded like the other estimators so repeated runs give the same forest.
    rf = RandomForestClassifier(n_estimators=100,
                                criterion='gini',
                                max_depth=None,
                                max_features=None,
                                random_state=42)
    rf.fit(X_train,y_train)

    # gradient boosting
    # `loss` is left at its default: the former explicit value 'deviance' was
    # renamed to 'log_loss' in newer scikit-learn releases and later removed,
    # while the default selects that same loss on old and new versions alike.
    gb = GradientBoostingClassifier(learning_rate=0.1,
                                    n_estimators=100,
                                    subsample=1.0,
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    max_depth=3,
                                    random_state=42,
                                    max_features=None)
    gb.fit(X_train,y_train)
    print('trained')

    models = {'svc':svc,
          'knn':knn,
          'rf':rf,
          'gb':gb}

    return models

In [8]:
# Fit all classifiers on the SMOTE-resampled training data.
models = train_models(X_train,y_train)

trained


#### Evaluation

In [9]:
# Report weighted F1 / precision and the per-class report for each fitted model.
for k,v in models.items():
    # Predict once per model and reuse the result for all three metrics.
    y_pred = v.predict(X_test)
    f1 = f1_score(y_pred=y_pred,y_true=y_test,average='weighted')
    prec = precision_score(y_pred=y_pred,y_true=y_test,average='weighted')
    print(f'results for model: {k}\nf1: {f1}\nPrecision: {prec}')
    print(classification_report(y_pred=y_pred,y_true=y_test))
    print('#'*10)

results for model: svc
f1: 0.7981043485649948
Precision: 0.8268969696969697
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.67      1.00      0.80         4
           2       0.83      0.71      0.77        14
           3       0.55      0.86      0.67        14
           4       0.72      0.69      0.71        26
           5       1.00      0.82      0.90        39

    accuracy                           0.79       100
   macro avg       0.79      0.85      0.81       100
weighted avg       0.83      0.79      0.80       100

##########
results for model: knn
f1: 0.85162336822034
Precision: 0.8629864010120178
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.60      0.75      0.67         4
           2       0.83      0.71      0.77        14
           3       0.86      0.86      0.86        14
           4       0.77  

#### Add features

In [10]:
from sklearn.preprocessing import PolynomialFeatures

In [11]:
# Degree-2 polynomial expansion without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)

In [12]:
# For two inputs [a, b], PolynomialFeatures(2, include_bias=False) emits the
# columns in the order a, b, a^2, a*b, b^2. The labels below follow that
# order, so 'interaction' is Height*Weight and 'Weight_2' is Weight squared.
poly_cols = ['Height','Weight','Height_2','interaction','Weight_2']
X_poly = pd.DataFrame(poly.fit_transform(X[['Height','Weight']]),
                      columns=poly_cols,
                      index=X.index)  # keep X's index so concat aligns row-for-row

X_enriched = pd.concat((X_poly, X[['Gender_Female','Gender_Male']]), axis=1)

In [13]:
# Stratified split + SMOTE resampling of the training part, now on the enriched features.
X_train_1,y_train_1,X_test_1,y_test_1 = train_test_resampled(X_enriched,y)

In [14]:
# Refit the same set of classifiers on the enriched, resampled training data.
models_1 = train_models(X_train_1,y_train_1)

trained


In [15]:
# Report weighted F1 / precision and the per-class report for each model
# trained on the enriched features.
for k,v in models_1.items():
    # Predict once per model and reuse the result for all three metrics.
    y_pred_1 = v.predict(X_test_1)
    f1 = f1_score(y_pred=y_pred_1,y_true=y_test_1,average='weighted')
    prec = precision_score(y_pred=y_pred_1,y_true=y_test_1,average='weighted')
    print(f'results for model: {k}\nf1: {f1}\nPrecision: {prec}')
    print(classification_report(y_pred=y_pred_1,y_true=y_test_1))
    print('#'*10)

results for model: svc
f1: 0.8061135997657738
Precision: 0.8328596025873702
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.67      1.00      0.80         4
           2       0.85      0.79      0.81        14
           3       0.63      0.86      0.73        14
           4       0.69      0.77      0.73        26
           5       1.00      0.77      0.87        39

    accuracy                           0.80       100
   macro avg       0.81      0.86      0.82       100
weighted avg       0.83      0.80      0.81       100

##########
results for model: knn
f1: 0.8709823509823511
Precision: 0.8775057471264367
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.60      0.75      0.67         4
           2       0.83      0.71      0.77        14
           3       0.86      0.86      0.86        14
           4       0.83