In [1]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from pickle import dump

In [2]:
import sys

# Make the project checkout importable (presumably so the
# `melanoma_detection` package imported by a later cell can be resolved).
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
PROJECT_ROOT = '/home/ubuntu/w210_melanoma/melanoma-detection'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from melanoma_detection.preprocessor import ColumnSelector, Encoder

In [4]:
# Read the gzipped CSV; rows pandas cannot parse are skipped instead of
# raising.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines` - confirm the pinned pandas version before upgrading.
df = pd.read_csv(
    filepath_or_buffer='/home/ubuntu/w210_melanoma/melanoma-detection/concatenated_whole.csv.gz',
    error_bad_lines=False,
    compression='gzip',
)

# Columns removed before the feature matrix is built.
drop_col = ['image_name', 'patient_id', 'diagnosis', 'benign_malignant']
data = df.drop(columns=drop_col)

# Fixed-seed 80/20 row-wise split.
# NOTE(review): the split is unstratified and `patient_id` is dropped before
# splitting; if a patient can own several rows, the same patient may appear in
# both partitions - confirm this is intended.
train, test = train_test_split(data, random_state=0, test_size=0.2)

# Separate the `target` label from the features in each partition.
X_train, y_train = train.drop(columns=['target']), train['target']
X_test, y_test = test.drop(columns=['target']), test['target']

# Every feature column not listed in `cat` is handled by the numerical branch.
cat = ['sex', 'anatom_site_general_challenge']
num = [name for name in X_train.columns.values if name not in cat]

In [5]:
# Categorical branch: select the `cat` columns, replace missing values with
# the literal 'unknown', apply the project's Encoder, then one-hot encode into
# a dense array. Categories unseen during fit are ignored by the one-hot step.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` - confirm the pinned version before upgrading.
cat_pipe = Pipeline(
    steps=[
        ('selector', ColumnSelector(cat)),
        ('fill_na', SimpleImputer(fill_value='unknown', strategy='constant')),
        ('label_encoder', Encoder()),
        ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ]
)

# Numerical branch: select the `num` columns, fill missing values with the
# column mean learned at fit time, then rescale with MinMaxScaler.
num_pipe = Pipeline(
    steps=[
        ('selector', ColumnSelector(num)),
        ('fill_na', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

# Run both branches on the same input and concatenate their outputs
# column-wise (categorical block first, numerical block second).
preprocessor = FeatureUnion(
    transformer_list=[
        ('cat', cat_pipe),
        ('num', num_pipe),
    ]
)

In [6]:
# Fit the preprocessing on the training features only, then apply the fitted
# transformers to the held-out features.
X_train_trasform = preprocessor.fit_transform(X_train)

X_test_trasform = preprocessor.transform(X_test)
X_test_fin = pd.DataFrame(X_test_trasform)

# Persist the fitted preprocessor. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes, instead of leaving the
# handle to be closed whenever the interpreter garbage-collects it.
with open('/home/ubuntu/preprocessor.pkl', 'wb') as pkl_file:
    dump(preprocessor, pkl_file)

In [7]:
from pickle import load

# Round-trip check: reload the pickled preprocessor and confirm it produces
# exactly the same transformed test matrix as the in-memory one.
# The file is opened in a context manager so the read handle is always closed.
# NOTE: unpickling executes arbitrary code; only load files written by this
# notebook or another trusted source.
with open('/home/ubuntu/preprocessor.pkl', 'rb') as pkl_file:
    preprocessor2 = load(pkl_file)

X_test_fin_2 = pd.DataFrame(preprocessor2.transform(X_test))
print((X_test_fin.values == X_test_fin_2.values).all())

True


In [8]:
# Export the transformed test features. `index=True` is the pandas default and
# is spelled out here because it adds the row index as the first CSV column.
X_test_fin.to_csv(
    path_or_buf='/home/ubuntu/X_test_fin.csv',
    index=True,
)