# Genentech Cervical Cancer - Procedure

https://www.kaggle.com/c/cervical-cancer-screening/

In [1]:
# imports
import sys # for stderr
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# settings 
%logstop
%logstart  -o 'ipynb.log' rotate
plt.style.use('ggplot')
# constants
# plt.rcParams['figure.figsize'] = (10.0, 10.0)
# pd.set_option('display.max_rows', 50)
# pd.set_option('display.max_columns', 50)

Logging hadn't been started.
Activating auto-logging. Current session state plus future input saved.
Filename       : ipynb.log
Mode           : rotate
Output logging : True
Raw input log  : False
Timestamping   : False
State          : active


In [3]:
# versions 
import sys
print(pd.datetime.now())
print('Python: '+sys.version)
print('numpy: '+np.__version__)
print('pandas: '+pd.__version__)
print('sklearn: '+skl.__version__)

2016-01-23 12:49:01.396794
Python: 2.7.11 |Anaconda 2.4.0 (x86_64)| (default, Dec  6 2015, 18:57:58) 
[GCC 4.2.1 (Apple Inc. build 5577)]
numpy: 1.10.2
pandas: 0.17.1
sklearn: 0.17


In [12]:
procedure_vars = pd.read_csv('procedure_vars.csv')
procedure_vars[:10]

Unnamed: 0,FeatureName,HashVal,MinVal,MaxVal,Weight,RelScore
0,57454,7389192,0,8,1.8751,100.00%
1,81252,7412990,0,1,1.8175,96.93%
2,57456,7389194,0,5,1.7814,95.00%
3,57455,7389193,0,5,1.7143,91.42%
4,S4020,12393181,0,1,1.608,85.76%
5,S0605,1723495,0,2,1.5684,83.64%
6,G0143,32381052,0,6,1.47,78.39%
7,90696,7422434,0,1,1.4434,76.98%
8,S4023,22267026,0,1,1.4377,76.67%
9,69710,7401448,0,1,1.3513,72.06%


In [5]:
procedure_code = pd.read_csv('./input/procedure_code.csv.gz')

In [6]:
procedure_code[:2]

Unnamed: 0,procedure_code,procedure_description
0,0001F,HRT FAILURE ASSESSED
1,0001M,"INFECTIOUS DISEASE, HCV, SIX BIOCHEMICAL ASSAYS"


In [13]:
procedures = pd.merge(procedure_vars, procedure_code, left_on='FeatureName', right_on='procedure_code', how='left' )

In [9]:
procedure_code.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16276 entries, 0 to 16275
Data columns (total 2 columns):
procedure_code           16276 non-null object
procedure_description    16276 non-null object
dtypes: object(2)
memory usage: 381.5+ KB


In [10]:
procedure_vars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14347 entries, 0 to 14346
Data columns (total 6 columns):
FeatureName    14347 non-null object
   HashVal     14347 non-null int64
  MinVal       14347 non-null float64
  MaxVal       14347 non-null float64
   Weight      14347 non-null object
  RelScore     14347 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 784.6+ KB


In [11]:
procedures.FeatureName[0]

'57454   '

In [14]:
procedures[:10]

Unnamed: 0,FeatureName,HashVal,MinVal,MaxVal,Weight,RelScore,procedure_code,procedure_description
0,57454,7389192,0,8,1.8751,100.00%,57454,COLPOSCOPY CERVIX BX CERVIX & ENDOCRV CURRETAGE
1,81252,7412990,0,1,1.8175,96.93%,81252,GJB2 GENE ANALYSIS FULL GENE SEQUENCE
2,57456,7389194,0,5,1.7814,95.00%,57456,COLPOSCOPY CERVIX ENDOCERVICAL CURETTAGE
3,57455,7389193,0,5,1.7143,91.42%,57455,COLPOSCOPY CERVIX UPPR/ADJCNT VAGINA W/CERVIX BX
4,S4020,12393181,0,1,1.608,85.76%,S4020,IN VITRO FERTILIZATION PROCEDURE CANCELLED BEFOR
5,S0605,1723495,0,2,1.5684,83.64%,S0605,"DIGITAL RECTAL EXAMINATION, MALE, ANNUAL"
6,G0143,32381052,0,6,1.47,78.39%,G0143,"SCREENING CYTOPATHOLOGY, CERVICAL OR VAGINAL (AN"
7,90696,7422434,0,1,1.4434,76.98%,90696,DTAP-IPV VACCINE CHILD 4-6 YRS FOR IM USE
8,S4023,22267026,0,1,1.4377,76.67%,S4023,"DONOR EGG CYCLE, INCOMPLETE, CASE RATE"
9,69710,7401448,0,1,1.3513,72.06%,69710,IMPLTJ/RPLCMT EMGNT BONE CNDJ DEV TEMPORAL BONE


In [15]:
procedures.to_csv('procedure_vars2.csv')

In [16]:
procedures[:20]

Unnamed: 0,FeatureName,HashVal,MinVal,MaxVal,Weight,RelScore,procedure_code,procedure_description
0,57454,7389192,0,8,1.8751,100.00%,57454,COLPOSCOPY CERVIX BX CERVIX & ENDOCRV CURRETAGE
1,81252,7412990,0,1,1.8175,96.93%,81252,GJB2 GENE ANALYSIS FULL GENE SEQUENCE
2,57456,7389194,0,5,1.7814,95.00%,57456,COLPOSCOPY CERVIX ENDOCERVICAL CURETTAGE
3,57455,7389193,0,5,1.7143,91.42%,57455,COLPOSCOPY CERVIX UPPR/ADJCNT VAGINA W/CERVIX BX
4,S4020,12393181,0,1,1.608,85.76%,S4020,IN VITRO FERTILIZATION PROCEDURE CANCELLED BEFOR
5,S0605,1723495,0,2,1.5684,83.64%,S0605,"DIGITAL RECTAL EXAMINATION, MALE, ANNUAL"
6,G0143,32381052,0,6,1.47,78.39%,G0143,"SCREENING CYTOPATHOLOGY, CERVICAL OR VAGINAL (AN"
7,90696,7422434,0,1,1.4434,76.98%,90696,DTAP-IPV VACCINE CHILD 4-6 YRS FOR IM USE
8,S4023,22267026,0,1,1.4377,76.67%,S4023,"DONOR EGG CYCLE, INCOMPLETE, CASE RATE"
9,69710,7401448,0,1,1.3513,72.06%,69710,IMPLTJ/RPLCMT EMGNT BONE CNDJ DEV TEMPORAL BONE


In [18]:
procedures[-20:]

Unnamed: 0,FeatureName,HashVal,MinVal,MaxVal,Weight,RelScore,procedure_code,procedure_description
14327,44393,7376131,0,1,-1.0956,-58.43%,44393,COLONOSCOPY STOMA ABLATION LESION
14328,27240,7358978,0,1,-1.1076,-59.07%,27240,CLTX INTR/PERI/SBTRCHNTC FEMORAL FX W/MANJ
14329,33226,7364964,0,1,-1.1148,-59.45%,33226,RPSG PREV IMPLTED CAR VEN SYS L VENTR ELTRD
14330,Q9948,26848838,0,1,-1.1239,-59.94%,Q9948,LOCM 250-299 MG/ML IODINE CONC ML
14331,27059,7358797,0,1,-1.1345,-60.50%,27059,RAD RESECTION TUMOR SOFT TISS PELVIS&HIP 5 CM/>
14332,J1590,141294,0,1,-1.1352,-60.54%,J1590,"INJECTION, GATIFLOXACIN, 10 MG"
14333,65101,7396839,0,1,-1.1399,-60.79%,65101,ENUCLEATION OF EYE W/O IMPLANT
14334,99487,7431225,0,1,-1.1497,-61.31%,99487,CMPLX CHRON CARE MGMT W/O PT VST 1ST HR PER MO
14335,33990,7365728,0,1,-1.1551,-61.60%,33990,INSJ PERQ VAD W/IMAGING ARTERY ACCESS ONLY
14336,G8702,21266173,0,1,-1.1591,-61.82%,G8702,DOCUMENTATION THAT PROPHYLACTIC ANTIBIOTICS WERE
