In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

try:
    # Legacy imputer: deprecated in scikit-learn 0.20 and removed in 0.22.
    # Guarded so the notebook still imports on current releases.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

 load data

In [5]:
# Read the risk-factor table from disk and remember the column order; later
# cells use `names` to restore column labels and to slice features/targets.
# NOTE(review): the location below is an absolute, machine-specific path.
path = 'D:/Python/exercise/risk_factors_cervical_cancer.csv'
df = pd.read_csv(filepath_or_buffer=path)
names = df.columns.tolist()
# Show the first rows as this cell's output.
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


clean data

In [8]:
# Clean the raw table: missing entries appear as the literal string '?', so
# turn them into NaN, make every column numeric, then fill the gaps with the
# column mean and restore the original column labels.
#
# Changes from the previous version of this cell:
#   * sklearn.preprocessing.Imputer no longer exists in current scikit-learn;
#     SimpleImputer is its replacement and expects np.nan (not the string
#     'NaN') as the missing-value marker.
#   * np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
#   * Columns that contained '?' are string-typed after loading, so they are
#     cast to float explicitly instead of relying on an implicit conversion.
#
# NOTE(review): the imputer is fitted on the whole table, before the
# train/test split in the next cell, so test rows influence the fill values.
df = df.replace('?', np.nan).astype(float)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df = pd.DataFrame(imputer.fit_transform(df), columns=names)



model training

In [14]:
# Seed shared by the split and the random forests so that a fresh
# "Restart & Run All" reproduces the same search result and scores.
# Previously only the split was seeded and the forests were not.
SEED = 0

# Features: every column except the last four.
# Targets: the last four columns, predicted together as a multi-column target.
x = df[names[0:-4]]
y = df[names[-4:]]

# Hold out 20% of the rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SEED)

# Candidate pipelines: scale to [0, 1] -> PCA -> random forest.
#   models[0] is the template tuned by the grid search below.
#   models[1] is a fixed configuration kept for the later cells that use it.
models = [Pipeline([('minmax', MinMaxScaler()),
                    ('pca01', PCA()),
                    ('rf', RandomForestClassifier(random_state=SEED))]),
          Pipeline([('minmax', MinMaxScaler()),
                    ('pca', PCA(n_components=0.5)),
                    ('rf', RandomForestClassifier(n_estimators=50,
                                                  max_depth=1,
                                                  random_state=SEED))])]

# Search grid; keys are '<pipeline step name>__<parameter>' for models[0].
params = {
    'pca01__n_components': [0.5, 0.6, 0.7, 0.8, 0.9],
    'rf__n_estimators': [50, 100, 150],
    'rf__max_depth': [1, 3, 5, 7, 9]
}

# 5-fold cross-validated exhaustive search over the grid, on training data only.
model = GridSearchCV(estimator=models[0], param_grid=params, cv=5)
model.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('minmax',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('pca01',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',
                                       

In [17]:
# Report what the grid search selected: the winning parameter combination,
# the pipeline built from it, and its cross-validated score.
search_report = (
    ('best params:', model.best_params_),
    ('best estimator:', model.best_estimator_),
    ('best score:', model.best_score_),
)
for label, value in search_report:
    print(label, value)

best params: {'pca01__n_components': 0.5, 'rf__max_depth': 1, 'rf__n_estimators': 50}
best estimator: Pipeline(memory=None,
         steps=[('minmax', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('pca01',
                 PCA(copy=True, iterated_power='auto', n_components=0.5,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('rf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=1,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                       

In [19]:
# Evaluate the grid-search winner on the training data and on the held-out
# test data.
#
# The previous version scored models[1], a pipeline whose hyper-parameters had
# been copied by hand from an earlier search output; that copy silently goes
# stale whenever the grid, the data or the seed change. GridSearchCV (with its
# default refit=True) already refits the best pipeline on all of x_train, so
# best_estimator_ can be scored directly without another fit.
model01 = model.best_estimator_
print('training data score:', model01.score(x_train, y_train))
print('test data score:', model01.score(x_test, y_test))

training data score: 0.8775510204081632
test data score: 0.8953488372093024
