In [21]:
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
import requests, zipfile, StringIO
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import gzip, binascii

import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
import re
from sklearn.preprocessing import Imputer
from numpy import random

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from six.moves import xrange  # pylint: disable=redefined-builtin

# Config the matlotlib backend as plotting inline in IPython
%matplotlib inline

In [22]:
url = 'https://www.kaggle.com/piotrgrabo/breastcancerproteomes/downloads/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 1% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  if force or not os.path.exists(filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(filename)
  print (statinfo.st_size)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

data_zipfilename = maybe_download('breastcancerproteomes.zip', 5680320)
print ('data_zipfilename: ', data_zipfilename)
#clinical_data_BC_filename = maybe_download('clinical_data_breast_cancer.csv', 18637)
#PAM50_proteins_filename = maybe_download('PAM50_proteins.csv', 6674)


5680320
Found and verified breastcancerproteomes.zip
data_zipfilename:  breastcancerproteomes.zip


In [23]:
def maybe_extract(filename, force=False):
  #print (os.path.abspath(filename))
  print (os.path.splitext(filename)[0])
  root = os.path.splitext(filename)[0]  # remove .zip
  print (root)
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    zf = zipfile.ZipFile(filename, 'r')
    zipfile_namelist = zf.namelist()
    print (zipfile_namelist)
    
    for filename in zipfile_namelist:
        try:
            data = zf.read(filename)
        except KeyError:
            print ('ERROR: Did not find %s in zip file' % filename)
        else:
            print (filename, ':')
            #print (repr(data))

  return zipfile_namelist
  
data_files= maybe_extract(data_zipfilename)


breastcancerproteomes
breastcancerproteomes
Extracting data for breastcancerproteomes. This may take a while. Please wait.
['77_cancer_proteomes_CPTAC_itraq.csv', 'PAM50_proteins.csv', 'clinical_data_breast_cancer.csv']
77_cancer_proteomes_CPTAC_itraq.csv :
PAM50_proteins.csv :
clinical_data_breast_cancer.csv :


In [24]:
#z = os.path.basename(filename)
#print (z)

dataset_path = data_files[0]
print (data_path)
pam50_proteins = data_files[1]
print (pam50_proteins)
clinical_info = data_files[2]
print (clinical_info)


77_cancer_proteomes_CPTAC_itraq.csv
PAM50_proteins.csv
clinical_data_breast_cancer.csv


In [25]:
try:
    data = pd.read_csv(os.path.abspath(dataset_path))
    data.drop(['gene_symbol','gene_name'], axis = 1, inplace = True)
    print ("Breast Cancer Patients dataset has {} samples with {} features each.".format(*data.shape))
    
    pam50 = pd.read_csv(os.path.abspath(pam50_proteins))
    print ("Pam50 dataset has {} samples with {} features each.".format(*data.shape))
    
    clinical = pd.read_csv(os.path.abspath(clinical_info))
    print ("Clinical dataset has {} samples with {} features each.".format(*data.shape))
except:
    print ("Dataset could not be loaded. Is the dataset missing?")

Breast Cancer Patients dataset has 12553 samples with 84 features each.
Pam50 dataset has 12553 samples with 84 features each.
Clinical dataset has 12553 samples with 84 features each.
