In [4]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy as np
import pandas as pd
import glob
import os

In [5]:
#read in the human reference genome and parse it into a dictionary
record_dict = SeqIO.to_dict(SeqIO.parse('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', 'fasta'))

In [6]:
file_bank = glob.glob(os.path.join('gdc_download_20200427_231049.833434', '*.seg.v2.txt'))

In [7]:
final_results = pd.DataFrame()
#write a for loop to read in the files as a bed format
for file in file_bank:
    file_bed = pd.read_csv(file, sep='\t', header=0)
    
    #filter the csv frile to a pandas dataframe with these specified columns
    results1 = file_bed[['GDC_Aliquot', 'Chromosome', 'Start','End','Segment_Mean']]
    
     #filter segment length, copy gain, then find the sequence
    Length = results1['End'] - results1['Start']
    results1['Length'] = Length
    Length_less = results1['Length'] <= 500000
    results2 = results1[Length_less]
    copy_gain = results2['Segment_Mean'] >= 0.4
    results3 = results2[copy_gain]
    
    sequences = []
    
    for index ,row in results3.iterrows():
        chromosome = 'chr' + str(row['Chromosome'])
        start = int(row['Start'])
        end = int(row['End'])
        chromosome_sequence = record_dict[chromosome].seq 
        sequences.append(str(chromosome_sequence[start:end]))
    
    #set sequence as a separate dataframe
    sequences1 = pd.DataFrame(sequences, columns=['Sequence'])
     
    #reset the index of the previous generated dataframe
    results3 = results3.reset_index()
    results3.drop("index", axis=1)
    
    #join the sequence dataframe to the original dataframe 
    results3 = results3.join(sequences1, how='right')

    final_results = final_results.append(results3)

## Shape of Data

In [11]:
print(final_results.shape)

(20647, 8)


In [8]:
#turn it into a csv file
final_results.to_csv('cnv_results_final.csv', sep='\t', index=False)

## Head of Data

In [9]:
pd.read_csv('cnv_results_final.csv', sep='\t', header=0).head()

Unnamed: 0,index,GDC_Aliquot,Chromosome,Start,End,Segment_Mean,Length,Sequence
0,6,5a456ee8-0991-45be-92d8-d30f8ec0455c,1,27294806,27376554,0.5868,81748,TCAGTCTTTTTATTTCATTTTTTCTTAATGAACTAGATAGAAATAT...
1,9,5a456ee8-0991-45be-92d8-d30f8ec0455c,1,28782467,28983899,1.0996,201432,AGCAAGTCGTTAGGCTTGGCCTGTTTGGGAAGTGAGGGACCTGGGC...
2,10,5a456ee8-0991-45be-92d8-d30f8ec0455c,1,28984562,29001818,1.6893,17256,GAAGTAAATTTACTTGAATAAATTTAGAGAATGCCTTTGCAGTTTG...
3,20,5a456ee8-0991-45be-92d8-d30f8ec0455c,1,40267986,40673375,1.1267,405389,TAGTCCTTGGTTACCCATGCTGTATTGGCATAACATTAGTTACAAT...
4,21,5a456ee8-0991-45be-92d8-d30f8ec0455c,1,40673912,40688337,0.6618,14425,ATTAAATGAGAAAACATATGTAAAGTGATCAGCAGTGACTGGTATA...


## Null Values Shown as False

In [10]:
print(final_results.notnull())

    index  GDC_Aliquot  Chromosome  Start   End  Segment_Mean  Length  \
0    True         True        True   True  True          True    True   
1    True         True        True   True  True          True    True   
2    True         True        True   True  True          True    True   
3    True         True        True   True  True          True    True   
4    True         True        True   True  True          True    True   
..    ...          ...         ...    ...   ...           ...     ...   
2    True         True        True   True  True          True    True   
3    True         True        True   True  True          True    True   
4    True         True        True   True  True          True    True   
5    True         True        True   True  True          True    True   
6    True         True        True   True  True          True    True   

    Sequence  
0       True  
1       True  
2       True  
3       True  
4       True  
..       ...  
2       True  
3  