In [4]:
import requests
import json
import re
import gzip
import pandas as pd
import tarfile
import os
import sys
import shutil
import numpy as np
import tarfile
import io
from io import StringIO

In [5]:
#Generates a folder to store the data portal gene expression data if none exits
if not os.path.exists(os.path.join(os.getcwd(),"data")):
    os.makedirs(os.getcwd(),"data")

In [110]:
class gdc_data:
    '''
    Creates data objects that can query the gdc data portal for gene expression data,
    write compressed data from portal to disk, and uncompress and store the gene expression
    data in a pandas dataframe (gene x sample_id)
    '''
    
    def __init__(self, name):
        #Initialize the type of cancer for the database query and the size of query
        self.name = name
        #initialize gene epression data matrix
        self.data = pd.DataFrame()
        #Initialize empty manifest data matrix
        self.manifest = pd.DataFrame()
        #Initialize the location for the data directory
        self.main_dir = os.path.join(os.getcwd(),"data")
        self.query_dir = os.path.join(self.main_dir,self.name)
        #Initialize location of data csv file
        self.file = os.path.join(self.query_dir,self.name+".csv")
        #Initialize empty file name
        self.file_name = ''
        #Initialize an empty http reponse
        self.response = ''
        #Initialize variable for size of query
        self.size = ''

    def data_query(self):
        '''
        Performs a query of the NCI genomic portal given a type of cancer.
        Ex. Type: Hepatocellular Carcinoma - LIHC
        Name followed by no. of samples desired. Ex. LIHC10 returns gene expression for first 10 samples 
        Returns the name of compressed tar.gz file, and a binary data file in memory
        '''
        
        files_endpt = "https://api.gdc.cancer.gov/files"
        
        cancer = ''.join([x for x in self.name if not x.isdigit()])
        size = ''.join([x for x in self.name if x.isdigit()])
        #If the size of the query was not specified, acquire data for all samples
        if not size.isdigit():
            size = 2000
    
        #Filters for the query, recieving all RNA-Seq, HTSeq-Count files for a specific cancer
        filters = {
            "op": "and",
            "content":[
                {
                "op": "in",
                "content":{
                    "field": "cases.project.project_id",
                    "value": ["TCGA-"+cancer]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.experimental_strategy",
                    "value": ["RNA-Seq"]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.analysis.workflow_type",
                    "value": ["HTSeq - Counts"]
                    }
                }
            ]
        }

        # Here a GET is used, so the filter parameters should be passed as a JSON string.
        params = {
            "filters": json.dumps(filters),
            "fields": "file_id",
            "format": "JSON",
            "size": size  #Set to the first 10 files for developing
            }

        response = requests.get(files_endpt, params = params)
        file_uuid_list = []

        # This step populates the download list with the file_ids from the previous query
        for file_entry in json.loads(response.content.decode("utf-8"))["data"]["hits"]:
            file_uuid_list.append(file_entry["file_id"])

        data_endpt = "https://api.gdc.cancer.gov/data"

        params = {"ids": file_uuid_list}
        #Acquire memory location of compressed data from the data portal
        response = requests.post(data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"})
        
        response_head_cd = response.headers["Content-Disposition"]
        #Acquire the name of the file 
        file_name = re.findall("filename=(.+)", response_head_cd)[0]

        self.file_name = file_name
        self.response = response
        
    def data_write(self):
        """
        Writes query file to query directory
        """
        
        #Performs data query if filename and response have not been populated yet
        if not self.file_name and not self.response:
            self.data_query()
        
        #Create a path for this query if it doesnt exist already
        if not os.path.exists(self.query_dir):
            os.makedirs(self.query_dir)
            
        #desired location of compressed data targz file
        targz = os.path.join(self.query_dir,self.file_name)    
        #Opens a file named after the file_name, and writes the contents of the query to the file    
        with open(targz, "wb") as output_file:
            output_file.write(self.response.content) #writes the response to the desired location
        
        self.data_write_targz()
           
    def data_write_targz(self):
        '''
        Uncompressess a targz file under the query directory, and writes to disk
        '''
        
        #Performs data query and writes targz if filename and response have not been populated yet
        #Check if the filename has been populated, if not then check if it exists in the query directory
        #, if neither, run query
        if not self.file_name:
            self.file_name = ''.join([x for x in os.listdir(self.query_dir) if x[-6:] == 'tar.gz'])
            if not self.file_name:
                self.data_query()
                self.data_write()
            
        #desired location of compressed data targz file
        targz = os.path.join(self.query_dir,self.file_name)  
        
        #Create a path for the uncompressed targz files if it doesnt exist already
        uncomp_targz_dir = os.path.join(self.query_dir,"uncompressed_targz")
        if not os.path.exists(uncomp_targz_dir):
            os.makedirs(uncomp_targz_dir)
        #Unzips the tar.gz file into desired folder
        with tarfile.open(targz) as tar:
            tar.extractall(uncomp_targz_dir)
            tar.close()
        #Stores the manifest of the data 
        self.manifest = pd.read_table(os.path.join(uncomp_targz_dir,"MANIFEST.txt"),sep="\t")
        
        #Create a path for this uncompressed gz files if it doesnt exist already
        uncomp_gz_dir = os.path.join(uncomp_targz_dir,"uncompressed_gz")
        if not os.path.exists(uncomp_gz_dir):
            os.makedirs(uncomp_gz_dir)
        
        #Unzips all gz gene expression files in the query directory
        for subdir, dirs, files in os.walk(self.query_dir):
            for file in files:
                if file[-4:] == "s.gz":
                    with gzip.open(os.path.join(subdir,file),'rb') as f:
                        file_content = f.read().decode("utf-8")
                        df = pd.read_csv(StringIO(file_content),sep="\t",header=None).set_index(0)
                        df.columns = [files[0]]
                        df.to_csv(os.path.join(uncomp_gz_dir,file[:-3]),header=False,sep=",",index=True)
                        
        self.data_save()
                        
    def data_save(self):
        """
        Saves dataframe to object from uncompressed tar and targz files
        """
        #initialize/clear gene epression data matrix
        self.data = pd.DataFrame()
        
        uncomp_targz_dir = os.path.join(self.query_dir,"uncompressed_targz")
        uncomp_gz_dir = os.path.join(uncomp_targz_dir,"uncompressed_gz")
        #Stores the manifest of the data 
        self.manifest = pd.read_table(os.path.join(uncomp_targz_dir,"MANIFEST.txt"),sep="\t")
            
        for subdir, dirs, files in os.walk(self.query_dir):
            for file in files:
                if file[-4:] == "unts":
                    df = pd.read_csv(os.path.join(uncomp_gz_dir,file),sep=",",header=None).set_index(0)
                    df.columns = [file]
                    self.data = pd.concat([self.data,df],axis=1)
        
        self.size = self.data.shape #Store the dimensions of the data matrix
                                
    def data_add(self,data):
        """
        Directly load data into object if avaliable as a dataframe
        """
        self.data = data
            
    def save_file(self,safe=True):
        """
        Saves loaded data as a csv to disk
        safe mode (default) - True
        if downloading and saving large volumes of data change to (False, 0, None,"",[])
        """
        #File location
        file = os.path.join(self.query_dir,self.name+".csv")
        
        if safe:
            #Overwrite prevention
            if os.path.exists(file):
                response = input("Do you want to overwrite previously saved file? (y/n)")
                if response == "y":
                    self.data.to_csv(file)
                    print("csv file successfully saved")
            else:
                self.data.to_csv(file)
                print("csv file successfully saved")
                
            input("Press Enter to Continue...")
        else:
            #No overwrite protection, but no user prompts for loops
            self.data.to_csv(file)
            print("csv file successfully saved")
        
    def read_csv(self):
        """
        Reads data from csv to pandas dataframe
        """
        
        if os.path.exists(self.file):
            self.data = pd.read_csv(self.file)
        else:
            print("file does not exist")

In [114]:
test = gdc_data("LIHC")

In [116]:
test.read_csv()

In [117]:
test.data.head()

Unnamed: 0,0,004c60cf-c08e-49df-b4ce-baca41e11250.htseq.counts,0069f64b-8d8f-4426-968d-23483929ee58.htseq.counts,014b9b85-3128-416b-93d4-7ace3b676d4e.htseq.counts,03011a57-3e95-49d1-a927-cff4111d2d5b.htseq.counts,0415a9b4-a58d-4641-ab1a-927ed7a04824.htseq.counts,047bd029-d63b-4f25-8a73-b95ad72d434f.htseq.counts,04dc4da1-1d1a-46da-a9d8-da9964591aec.htseq.counts,04e7f1a4-3173-4a6f-af60-04e1f2e29868.htseq.counts,05ac7b05-e459-4833-97fc-530185a7a55f.htseq.counts,...,fb94b719-4a43-4090-9a61-9f62a30d545d.htseq.counts,fbbb6d26-8bd4-40da-870b-d8db0f653cfa.htseq.counts,fca37687-75b4-4ca6-9963-36b468ca01a7.htseq.counts,fdb62f73-33a7-44c3-950c-739383b9dd30.htseq.counts,fe506b98-0733-43c9-943a-be9b12f1c2fb.htseq.counts,fe625352-dd2e-478d-8d21-06659f854945.htseq.counts,fe76a5ca-f70a-4ab7-b080-5a19ae36dc2b.htseq.counts,feae9113-b2f3-4dd4-9faf-6076eb32c925.htseq.counts,ff8776f1-5499-459c-989f-0bc5268e6631.htseq.counts,ffeed225-c2a3-4b4c-954c-4816903782a9.htseq.counts
0,ENSG00000000003.13,2155,3871,5085,6070,3231,6762,2285,3395,6637,...,33015,2853,9818,9805,2644,2874,2136,5630,2364,4941
1,ENSG00000000005.5,1,4,16,1,0,0,1,1,1,...,13,2,0,0,0,0,3,1,0,2
2,ENSG00000000419.11,405,1133,1326,1047,1130,1079,1105,1302,476,...,1250,537,1390,1598,817,1026,569,1317,1087,1717
3,ENSG00000000457.12,170,727,587,327,638,834,585,853,184,...,574,58,535,910,465,814,304,1129,1428,1287
4,ENSG00000000460.15,49,167,1528,235,340,204,119,451,91,...,146,14,226,361,176,183,95,244,346,450


In [118]:
test.data.iloc[:10,:50]

Unnamed: 0,0,004c60cf-c08e-49df-b4ce-baca41e11250.htseq.counts,0069f64b-8d8f-4426-968d-23483929ee58.htseq.counts,014b9b85-3128-416b-93d4-7ace3b676d4e.htseq.counts,03011a57-3e95-49d1-a927-cff4111d2d5b.htseq.counts,0415a9b4-a58d-4641-ab1a-927ed7a04824.htseq.counts,047bd029-d63b-4f25-8a73-b95ad72d434f.htseq.counts,04dc4da1-1d1a-46da-a9d8-da9964591aec.htseq.counts,04e7f1a4-3173-4a6f-af60-04e1f2e29868.htseq.counts,05ac7b05-e459-4833-97fc-530185a7a55f.htseq.counts,...,160aee04-df36-4e94-90c5-b01b2991ba48.htseq.counts,16bbd77f-b39b-4f19-9d6b-58a21dde3e84.htseq.counts,16fc8611-259d-4cd8-9e93-60d97bebb6bf.htseq.counts,179c5822-3d5a-42b9-adeb-46a2fefd3df8.htseq.counts,18c32c46-00f9-4437-bdbb-05233fdd676e.htseq.counts,18e687c3-7d76-4a4d-a665-95331732ef9c.htseq.counts,198ca92e-342e-4890-8ec3-45b045711531.htseq.counts,19f2bfb6-33ba-4aea-9281-c5009d539562.htseq.counts,1a228668-b2a0-469f-8801-fdc8be449b44.htseq.counts,1b73c4fc-b155-44ef-b91a-ef7dfdb6a5e6.htseq.counts
0,ENSG00000000003.13,2155,3871,5085,6070,3231,6762,2285,3395,6637,...,3428,2422,5578,3704,9069,4838,6708,5459,8909,3053
1,ENSG00000000005.5,1,4,16,1,0,0,1,1,1,...,5,0,0,3,3,0,1,0,1,0
2,ENSG00000000419.11,405,1133,1326,1047,1130,1079,1105,1302,476,...,821,380,1681,1101,3776,1919,1494,1220,2590,1392
3,ENSG00000000457.12,170,727,587,327,638,834,585,853,184,...,502,302,958,913,1302,1184,714,533,759,885
4,ENSG00000000460.15,49,167,1528,235,340,204,119,451,91,...,537,84,480,622,251,526,354,165,696,636
5,ENSG00000000938.11,23,77,275,246,306,222,30,315,116,...,278,147,267,99,116,328,458,134,825,81
6,ENSG00000000971.14,18706,91848,120808,148950,23153,32093,57084,93174,33089,...,103347,44404,148451,119274,53085,29285,43721,64706,36911,70139
7,ENSG00000001036.12,3365,4954,3721,3148,3497,5544,1903,3407,1423,...,2307,1389,5978,444,6335,3545,4212,3015,14202,2870
8,ENSG00000001084.9,1616,9951,6941,2843,4088,4203,787,5080,2318,...,4815,2449,5884,14344,8113,1733,6372,6544,5479,7985
9,ENSG00000001167.13,332,1090,655,391,890,1725,246,2139,350,...,486,374,1159,1435,1693,3156,1280,1680,2977,732


In [8]:
test.size

(60488, 424)

In [18]:
test.data.iloc[-10:,0:5]

Unnamed: 0_level_0,004c60cf-c08e-49df-b4ce-baca41e11250.htseq.counts,0069f64b-8d8f-4426-968d-23483929ee58.htseq.counts,014b9b85-3128-416b-93d4-7ace3b676d4e.htseq.counts,03011a57-3e95-49d1-a927-cff4111d2d5b.htseq.counts,0415a9b4-a58d-4641-ab1a-927ed7a04824.htseq.counts
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSGR0000275287.3,0,0,0,0,0
ENSGR0000276543.3,0,0,0,0,0
ENSGR0000277120.3,0,0,0,0,0
ENSGR0000280767.1,0,0,0,0,0
ENSGR0000281849.1,0,0,0,0,0
__no_feature,2156872,1573921,2161657,2078657,1560207
__ambiguous,2069326,3381699,3639526,2126608,2201179
__too_low_aQual,0,0,0,0,0
__not_aligned,0,0,0,0,0
__alignment_not_unique,24722665,14525218,25061297,12907050,10551624


In [7]:
from sklearn.decomposition import PCA

In [8]:
pca = PCA(n_components=3)
#Everything except for the last 5 rows
pca.fit(test.data.iloc[:-5,].T)

PCA(copy=True, n_components=3, whiten=False)

In [9]:
sum(pca.explained_variance_ratio_)

0.78030891808336023

In [10]:
pc = pca.transform(test.data.iloc[:-5,].T) 

In [11]:
pc = pd.DataFrame(pc)

In [12]:
pc.shape

(424, 3)

In [13]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [18]:
plt.plot(pc.iloc[:,0],pc.iloc[:,1],'x')
plt.show()

In [30]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pc.iloc[:,0],pc.iloc[:,1],pc.iloc[:,2],c=pc['color'])
plt.show()

In [5]:
cases.project.project_id in ["TCGA-LIHC"] and cases.samples.sample_type_id in ["11"] and 
files.analysis.workflow_type in ["HTSeq - Counts"] and files.experimental_strategy in ["RNA-Seq"]

In [22]:
files_endpt = "https://api.gdc.cancer.gov/files"
# Filters for identifying the file names for Solid Tissue Normal (STN)
# This set of filters is nested under an 'and' operator.
filt = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.project.project_id",
            "value": ["TCGA-LIHC"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "cases.samples.sample_type_id",
            "value": ["11"]
            }
        },
        {
        "op": "in",
        "content":{
           "field": "files.analysis.workflow_type",
            "value": ["HTSeq - Counts"]
            }
        },
        {"op":"in",
        "content":{
            "field": "files.experimental_strategy",
            "value": ["RNA-Seq"]
            }
        }
    ]
}

params = {'filters':json.dumps(filt),
         'size': 1000,
         'fields': 'file_name' #Set to only return the file name 
         }
# requests URL-encodes automatically
response = requests.get(files_endpt, params = params)
#Solid Tissue Normal (stn), file names
stnJson = json.dumps(response.json(), indent=2)

In [23]:
#Parse the json file so it can be used as a dictionary
parsed_json = json.loads(stnJson)

In [24]:
num = parsed_json['data']['pagination']['total'] #Acquire the number of hits
seq = [] #Initialize empty list for file names
#Iterate through the list of dictionarys and append the file name to the list
for dic in parsed_json['data']['hits']:
    seq.append(dic['file_name'])

In [25]:
seq
seq_cut = seq
for i in range(50):
    seq_cut[i] = seq_cut[i][:-3]
seq_cut

['2b8bf629-3c22-4dcb-a9a5-ec01c5099167.htseq.counts',
 '4d3842d1-c643-4659-98e9-8d613144fa91.htseq.counts',
 'f43699d8-3acc-445d-8ab2-9475194ebbc9.htseq.counts',
 '9abf327b-d44f-435d-986d-09107bcd2643.htseq.counts',
 'bc4f3a78-39a4-4a92-ad83-df251c5bf994.htseq.counts',
 'fa7877bb-9e46-4ecb-8c47-501893ce2410.htseq.counts',
 '6892a129-ae07-49df-a018-30c63c4d3bd7.htseq.counts',
 '5f94c33f-588b-4b6a-9c13-4505b0f94403.htseq.counts',
 'ec8fb205-0d42-4a66-b8ec-5bdd5da8a23b.htseq.counts',
 'b275a5d0-b9fe-4838-adb0-34db16a246af.htseq.counts',
 '8bf84ab0-7a48-4d98-bfc0-88537003ecd0.htseq.counts',
 '2d40e20c-bee4-48f2-a8b0-cdf8dff75dc3.htseq.counts',
 'a9123cf0-c6c0-42e7-b546-f11dfac355ce.htseq.counts',
 'd2141823-4f44-42d5-ad67-08a28d31eb63.htseq.counts',
 '2cc2e3ce-68cd-4690-9fff-5ecf86c2f57a.htseq.counts',
 '576fda32-4e0c-414d-b54c-259f2503fc11.htseq.counts',
 '6e146a57-21b2-4587-b888-01ed03ea9498.htseq.counts',
 '6cd9b90b-0c6c-44bd-8536-5a93553a7730.htseq.counts',
 '5745ce68-bb15-4d52-9132-4b

In [36]:
test.data.head()

Unnamed: 0_level_0,004c60cf-c08e-49df-b4ce-baca41e11250.htseq.counts,0069f64b-8d8f-4426-968d-23483929ee58.htseq.counts,014b9b85-3128-416b-93d4-7ace3b676d4e.htseq.counts,03011a57-3e95-49d1-a927-cff4111d2d5b.htseq.counts,0415a9b4-a58d-4641-ab1a-927ed7a04824.htseq.counts,047bd029-d63b-4f25-8a73-b95ad72d434f.htseq.counts,04dc4da1-1d1a-46da-a9d8-da9964591aec.htseq.counts,04e7f1a4-3173-4a6f-af60-04e1f2e29868.htseq.counts,05ac7b05-e459-4833-97fc-530185a7a55f.htseq.counts,05efced9-d60f-43ea-9b61-c6efaf902e7d.htseq.counts,...,fb94b719-4a43-4090-9a61-9f62a30d545d.htseq.counts,fbbb6d26-8bd4-40da-870b-d8db0f653cfa.htseq.counts,fca37687-75b4-4ca6-9963-36b468ca01a7.htseq.counts,fdb62f73-33a7-44c3-950c-739383b9dd30.htseq.counts,fe506b98-0733-43c9-943a-be9b12f1c2fb.htseq.counts,fe625352-dd2e-478d-8d21-06659f854945.htseq.counts,fe76a5ca-f70a-4ab7-b080-5a19ae36dc2b.htseq.counts,feae9113-b2f3-4dd4-9faf-6076eb32c925.htseq.counts,ff8776f1-5499-459c-989f-0bc5268e6631.htseq.counts,ffeed225-c2a3-4b4c-954c-4816903782a9.htseq.counts
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.13,2155,3871,5085,6070,3231,6762,2285,3395,6637,3560,...,33015,2853,9818,9805,2644,2874,2136,5630,2364,4941
ENSG00000000005.5,1,4,16,1,0,0,1,1,1,2,...,13,2,0,0,0,0,3,1,0,2
ENSG00000000419.11,405,1133,1326,1047,1130,1079,1105,1302,476,1174,...,1250,537,1390,1598,817,1026,569,1317,1087,1717
ENSG00000000457.12,170,727,587,327,638,834,585,853,184,461,...,574,58,535,910,465,814,304,1129,1428,1287
ENSG00000000460.15,49,167,1528,235,340,204,119,451,91,127,...,146,14,226,361,176,183,95,244,346,450


In [42]:
apple = "String.counts.gz"

In [43]:
apple[:-3]

'String.counts'

In [57]:
test.data.columns

Index(['004c60cf-c08e-49df-b4ce-baca41e11250.htseq.counts',
       '0069f64b-8d8f-4426-968d-23483929ee58.htseq.counts',
       '014b9b85-3128-416b-93d4-7ace3b676d4e.htseq.counts',
       '03011a57-3e95-49d1-a927-cff4111d2d5b.htseq.counts',
       '0415a9b4-a58d-4641-ab1a-927ed7a04824.htseq.counts',
       '047bd029-d63b-4f25-8a73-b95ad72d434f.htseq.counts',
       '04dc4da1-1d1a-46da-a9d8-da9964591aec.htseq.counts',
       '04e7f1a4-3173-4a6f-af60-04e1f2e29868.htseq.counts',
       '05ac7b05-e459-4833-97fc-530185a7a55f.htseq.counts',
       '05efced9-d60f-43ea-9b61-c6efaf902e7d.htseq.counts',
       ...
       'fb94b719-4a43-4090-9a61-9f62a30d545d.htseq.counts',
       'fbbb6d26-8bd4-40da-870b-d8db0f653cfa.htseq.counts',
       'fca37687-75b4-4ca6-9963-36b468ca01a7.htseq.counts',
       'fdb62f73-33a7-44c3-950c-739383b9dd30.htseq.counts',
       'fe506b98-0733-43c9-943a-be9b12f1c2fb.htseq.counts',
       'fe625352-dd2e-478d-8d21-06659f854945.htseq.counts',
       'fe76a5ca-f70a-4ab7-b0

In [79]:
pc['label']

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
394    0
395    0
396    0
397    0
398    0
399    0
400    0
401    0
402    0
403    0
404    0
405    0
406    0
407    0
408    0
409    0
410    0
411    0
412    0
413    0
414    0
415    0
416    0
417    0
418    0
419    0
420    0
421    0
422    0
423    0
Name: label, dtype: int64

In [26]:
lap = np.in1d(test.data.columns,seq_cut)

In [27]:
lap

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False,  True,
        True, False, False, False, False, False, False,  True,  True,
       False, False, False,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False,  True,

In [28]:
pc['color'] = ["Blue" if x else "Red" for x in lap]

In [29]:
pc

Unnamed: 0,0,1,2,color
0,-2.296973e+06,-3.414047e+06,5.700028e+05,Red
1,-7.759751e+05,2.038060e+05,-9.005815e+05,Red
2,-2.441984e+05,-2.470902e+06,-2.718885e+06,Red
3,2.044551e+06,-3.217580e+05,-9.252919e+04,Red
4,3.009963e+06,1.369600e+06,3.763464e+04,Red
5,3.839479e+05,7.718369e+05,4.436665e+05,Red
6,-9.125394e+05,3.899077e+05,9.006378e+04,Red
7,1.291448e+06,2.947950e+05,2.888563e+05,Red
8,-2.114823e+06,-8.774298e+05,-1.189472e+06,Red
9,-2.162015e+06,-1.603421e+06,4.405083e+05,Red


In [141]:
plt.scatter(pc.iloc[:,0],pc.iloc[:,1],c=pc['color'])
plt.show()

KeyError: 'color'