In [1]:
import pandas as pd

## Samina's Gene pairs Dataset

In [2]:
df = pd.read_csv('samina_results.csv')[1:].copy()

In [3]:
df.head()

Unnamed: 0,ID_gene1,ID_gene2,ch_gene1,ch_gene2,strand_gene1,strand_gene2,type_gene1,type_gene2,name_gene1,name_gene2,start_end_gene1,start_end_gene2,genename1,genename2
1,ST0002,ST3638,1.0,1.0,+,-,CUTs,CUTs,CUT001,CUT437,"(30073,30905)","(30533,30893)",unknown,unknown
2,ST0003,ST3639,1.0,1.0,+,-,ORF-T,SUTs,YAL062W,SUT434,"(31153,32985)","(31485,32749)",Glutamate DeHydrogenase,unknown
3,ST3640,ST0004,1.0,1.0,-,+,SUTs,ORF-T,SUT435,YAL061W,"(33077,34381)","(33361,34897)",unknown,
4,ST0004,ST3641,1.0,1.0,+,-,ORF-T,CUTs,YAL061W,CUT438,"(33361,34897)","(34381,34749)",,unknown
5,ST0005,ST3642,1.0,1.0,+,-,ORF-T,CUTs,YAL060W,CUT439,"(35097,36393)","(35797,36349)",Butanediol DeHydrogenase,unknown


In [4]:
df.columns

Index(['ID_gene1', 'ID_gene2', 'ch_gene1', 'ch_gene2', 'strand_gene1',
       'strand_gene2', 'type_gene1', 'type_gene2', 'name_gene1', 'name_gene2',
       'start_end_gene1', 'start_end_gene2', 'genename1', 'genename2'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2954 entries, 1 to 2954
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID_gene1         2954 non-null   object 
 1   ID_gene2         2954 non-null   object 
 2   ch_gene1         2954 non-null   float64
 3   ch_gene2         2954 non-null   float64
 4   strand_gene1     2954 non-null   object 
 5   strand_gene2     2954 non-null   object 
 6   type_gene1       2954 non-null   object 
 7   type_gene2       2954 non-null   object 
 8   name_gene1       2954 non-null   object 
 9   name_gene2       2954 non-null   object 
 10  start_end_gene1  2954 non-null   object 
 11  start_end_gene2  2954 non-null   object 
 12  genename1        2706 non-null   object 
 13  genename2        2743 non-null   object 
dtypes: float64(2), object(12)
memory usage: 323.2+ KB


### Transforming paris of genes into single genes

In [6]:
first_gene_columns = ['ID_gene1','ch_gene1','strand_gene1','type_gene1','name_gene1','start_end_gene1','genename1']
second_gene_columns = ['ID_gene2','ch_gene2','strand_gene2','type_gene2','name_gene2','start_end_gene2','genename2']
correct_column_names = ['id','chrom','strand','type','name','start_end','genname']

In [7]:
first_genes = df[first_gene_columns]
second_genes = df[second_gene_columns]

In [8]:
corrected_first_genes_df = first_genes.rename(columns={first_gene_columns[i]:correct_column_names[i] for i in range(7)})

In [9]:
corrected_second_genes_df = second_genes.rename(columns={second_gene_columns[i]:correct_column_names[i] for i in range(7)})

In [10]:
genes_df = pd.concat([corrected_first_genes_df,corrected_second_genes_df]).reset_index(drop=True)

### Removing Duplicates

In [11]:
genes_df = genes_df.drop_duplicates().reset_index(drop=True)

In [12]:
genes_df['chrom'] = genes_df['chrom'].astype(int)

In [13]:
genes_df

Unnamed: 0,id,chrom,strand,type,name,start_end,genname
0,ST0002,1,+,CUTs,CUT001,"(30073,30905)",unknown
1,ST0003,1,+,ORF-T,YAL062W,"(31153,32985)",Glutamate DeHydrogenase
2,ST3640,1,-,SUTs,SUT435,"(33077,34381)",unknown
3,ST0004,1,+,ORF-T,YAL061W,"(33361,34897)",
4,ST0005,1,+,ORF-T,YAL060W,"(35097,36393)",Butanediol DeHydrogenase
...,...,...,...,...,...,...,...
4940,ST7262,16,-,ORF-T,YPR188C,"(911823,912511)",Myo1p Light Chain
4941,ST3625,16,+,ORF-T,YPR192W,"(921755,922827)",AQuaporin from Yeast
4942,ST3627,16,+,SUTs,SUT429,"(927659,928459)",unknown
4943,ST3629,16,+,other,"YPR196W, YPRWtau4","(931203,933179)",unknown


### Splitting start and end

In [14]:
genes_df['start'] = genes_df['start_end'].apply(lambda x: int(x[1:-1].split(',')[0]))
genes_df['end'] = genes_df['start_end'].apply(lambda x: int(x[1:-1].split(',')[1]))

In [15]:
genes_df.drop(columns=['start_end'],inplace=True)

In [16]:
genes_df

Unnamed: 0,id,chrom,strand,type,name,genname,start,end
0,ST0002,1,+,CUTs,CUT001,unknown,30073,30905
1,ST0003,1,+,ORF-T,YAL062W,Glutamate DeHydrogenase,31153,32985
2,ST3640,1,-,SUTs,SUT435,unknown,33077,34381
3,ST0004,1,+,ORF-T,YAL061W,,33361,34897
4,ST0005,1,+,ORF-T,YAL060W,Butanediol DeHydrogenase,35097,36393
...,...,...,...,...,...,...,...,...
4940,ST7262,16,-,ORF-T,YPR188C,Myo1p Light Chain,911823,912511
4941,ST3625,16,+,ORF-T,YPR192W,AQuaporin from Yeast,921755,922827
4942,ST3627,16,+,SUTs,SUT429,unknown,927659,928459
4943,ST3629,16,+,other,"YPR196W, YPRWtau4",unknown,931203,933179


In [17]:
genes_df.to_csv('genes_df.csv')

## Loading Chromosomes

In [18]:
!ls chromosome/

1.fsa  11.fsa 13.fsa 15.fsa 2.fsa  4.fsa  6.fsa  8.fsa
10.fsa 12.fsa 14.fsa 16.fsa 3.fsa  5.fsa  7.fsa  9.fsa


In [19]:
!head -n 10 chromosome/1.fsa

>>tpg|BK006935.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=I] [note=R64-1-1]
CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA
CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT
ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC
CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC
CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC
TGTTCTTCTACCCACCATATTGAAACGCTAACAAATGATCGTAAATAACACACACGTGCT
TACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTCACTTGTATACTGATTT
TACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTC
CACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATG


In [20]:
chromosomes = {}

In [21]:
for i in range(1,17):
    with open(f'chromosome/{i}.fsa','r') as chromosome_file:
        lines = chromosome_file.read().splitlines()
        chromosomes[i] = ''.join(lines[1:])
        chromosomes[i] = chromosomes[i].replace("\n", "").replace(" ", "")

In [22]:
for key,value in chromosomes.items():
    print(f'Chromosome {key}: length->{len(value)}')

Chromosome 1: length->230218
Chromosome 2: length->813184
Chromosome 3: length->316620
Chromosome 4: length->1531933
Chromosome 5: length->576874
Chromosome 6: length->270161
Chromosome 7: length->1090940
Chromosome 8: length->562643
Chromosome 9: length->439888
Chromosome 10: length->745751
Chromosome 11: length->666816
Chromosome 12: length->1078177
Chromosome 13: length->924431
Chromosome 14: length->784333
Chromosome 15: length->1091291
Chromosome 16: length->948066


### Selecting gene from its start and end index from chromosome
- positions are inclusive and start at 1

In [26]:
def select_gene(row):
    string = chromosomes[row['chrom']][row['start']-1:row['end']]
    if row['strand'] == '+':
        return string
    if row['strand'] == '-':
        return(string[::-1].replace("C", "X").replace("G", "C").replace("X", "G").replace("A", "X").replace("T", "A").replace("X", "T"))
    if row['strand'] == 'both':
        return string
genes_df['sequence'] = genes_df.apply(select_gene,axis=1)

In [27]:
genes_df.to_csv('genes_with_sequence.csv')

In [28]:
genesws = pd.read_csv('genes_with_sequence.csv',index_col=0)

In [29]:
genesws

Unnamed: 0,id,chrom,strand,type,name,genname,start,end,sequence
0,ST0002,1,+,CUTs,CUT001,unknown,30073,30905,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,ST0003,1,+,ORF-T,YAL062W,Glutamate DeHydrogenase,31153,32985,TGACAGATATTCTGCACTTAAAAACTAAAAATATTATACCAACTTT...
2,ST3640,1,-,SUTs,SUT435,unknown,33077,34381,TTCTTTCATGCAATGTGATGTCCATCGGAGAAAACTGTATCTTGTG...
3,ST0004,1,+,ORF-T,YAL061W,,33361,34897,CGTCAAGATATTTGAAAGTTAATAGACAGTTAACAATAATAACAAC...
4,ST0005,1,+,ORF-T,YAL060W,Butanediol DeHydrogenase,35097,36393,ACGATTGACCAAGTCAGAAAAAAAAAAAAAAAGGAACTAAAAAAAG...
...,...,...,...,...,...,...,...,...,...
4940,ST7262,16,-,ORF-T,YPR188C,Myo1p Light Chain,911823,912511,CGAATCGTTTATTGAGCAAACTTGAAGTAGATATGGACCATAGTGA...
4941,ST3625,16,+,ORF-T,YPR192W,AQuaporin from Yeast,921755,922827,TGTCCTTTTGATATTTTGTTTTTTTTTTCCTTCTCTTTTTTTATTC...
4942,ST3627,16,+,SUTs,SUT429,unknown,927659,928459,AGATAAGAAGTGTTGCAGTAATATTATGTGTCACACCAGCGTGGCT...
4943,ST3629,16,+,other,"YPR196W, YPRWtau4",unknown,931203,933179,TCCGGATGACTGCTTGCGTTACTATATATTTTTTGCTGTAATTCAG...


In [30]:
genesws.nunique()

id          4945
chrom         16
strand         2
type           4
name        4921
genname     1658
start       4931
end         4932
sequence    4945
dtype: int64

In [31]:
genesws.type.unique()

array(['CUTs', 'ORF-T', 'SUTs', 'other'], dtype=object)

In [34]:
cut_df = genesws[genesws['type'] == 'CUTs'].reset_index(drop=True)
cut_df.to_csv("cut_table.csv")

In [52]:
cut_df_needed_samina = cut_df.drop(columns=['type', 'genname', 'id'])
cut_df_needed_samina

Unnamed: 0,chrom,strand,name,start,end,sequence
0,1,+,CUT001,30073,30905,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,1,+,CUT003,143439,143599,CTTGTCGAAAATACGCGGTGTAGGGAGTTATGGTGGATAACTTTTT...
2,1,+,CUT004,170495,172447,AATTACTTCCCTGGCTCGCTCCTCCACTGCCTGGGTAAATTGTTCC...
3,2,+,CUT006,96354,96570,ATTTTGGTAAATGCACACGGATAAATGCATGATATATAAAATATTA...
4,2,-,CUT445,278428,280876,AATGGTAGTCATATCATGTCAAGAATAGGTATCCAAAACGCAGCGG...
...,...,...,...,...,...,...
838,16,+,CUT433,822051,822163,AAACTTGCCGTTAACAATAAGTGACCTTGCATGGAAGACTGTCGAT...
839,16,-,CUT918,834071,834303,TGCTAACTTTTATCATCAAATCAGTAAACCCCTTCTTGTTTTTTCT...
840,16,-,CUT919,874527,874879,CACAGATGAGGTCAGATTTAACTATTGACCTTCTTGAAACCAAATA...
841,16,-,CUT921,878151,878495,AACAGGTTCTTTCAACTCTGGAAATTCATCCACAATCTTGTCAGCA...


In [53]:
cut_df_needed_samina.nunique()

chrom        16
strand        2
name        843
start       843
end         842
sequence    843
dtype: int64

In [56]:
cut_df_needed_samina.to_csv("overlapping_CUTs.csv")

In [65]:
noncoding_df = pd.read_csv('noncoding_genes.csv', header = None)
noncoding_df.columns = ['chrom', 'strand', 'name', 'start', 'end', 'sequence']

In [174]:
noncoding_df.nunique()

chrom        17
strand        2
name         30
start       435
end         433
sequence    219
dtype: int64

In [67]:
overlapping_cut_sequences = cut_df_needed_samina.drop(columns=['chrom', 'strand', 'name', 'start','end'])
overlapping_cut_sequences

Unnamed: 0,sequence
0,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,CTTGTCGAAAATACGCGGTGTAGGGAGTTATGGTGGATAACTTTTT...
2,AATTACTTCCCTGGCTCGCTCCTCCACTGCCTGGGTAAATTGTTCC...
3,ATTTTGGTAAATGCACACGGATAAATGCATGATATATAAAATATTA...
4,AATGGTAGTCATATCATGTCAAGAATAGGTATCCAAAACGCAGCGG...
...,...
838,AAACTTGCCGTTAACAATAAGTGACCTTGCATGGAAGACTGTCGAT...
839,TGCTAACTTTTATCATCAAATCAGTAAACCCCTTCTTGTTTTTTCT...
840,CACAGATGAGGTCAGATTTAACTATTGACCTTCTTGAAACCAAATA...
841,AACAGGTTCTTTCAACTCTGGAAATTCATCCACAATCTTGTCAGCA...


In [68]:
noncoding_sequences = noncoding_df.drop(columns=['chrom', 'strand', 'name', 'start','end'])
noncoding_sequences

Unnamed: 0,sequence
0,GGGCCCTTTCTTCCGTTTGAACGTAAAGGCATTTTTGAGACCATTA...
1,GTGAATGATGAATTTAATTCTTTGGTCCGTGTTTATGATGGGAAGT...
2,GGTTGTTTGGCCGAGCGGTCTAAGGCGCCTGATTCAAGAAATATCT...
3,GGGCGTGTGGTCTAGTGGTATGATTCTCGCTTTGGGCGACTTCCTG...
4,GGGCACATGGCGCAGTTGGTAGCGCGCTTCCCTTGCAAGGAAGAGG...
...,...
432,GCGGATTTAGCTCAGTTGGGAGAGCGCCAGACTGAAGAAAAAACTT...
433,GCGGATTTAGCTCAGTTGGGAGAGCGCCAGACTGAAGAAAAAACTT...
434,TCCTTGTTAGCTCAGTTGGTAGAGCGTTCGGCTTTTAAGCGCATTT...
435,GCTCGTATGGCGCAGTGGTAGCGCAGCAGATTGCAAATCTGTTGGT...


In [69]:
nonoverlapping_cuts_df = pd.read_csv('nonoverlapping_CUTs.csv')

In [70]:
nonoverlapping_cuts_df

Unnamed: 0,Type,ID,Chromosome,Transcribed_strand,Deletion_start_site,Deletion_end_site,YP 30C+2% Glucose,YP 30C+2% Glucose+5% Methanol,YP 30C+2% Glucose+0.4% TCA,YP 30C+2% Galactose,...,LSS-ESS_C-limited 30C_q.value,LSS-ESS_C-limited 30C_FC,LSS-ESS_N-limited 36C_q.value,LSS-ESS_N-limited 36C_FC,LSS-ESS_C-limited 36C_q.value,LSS-ESS_C-limited 36C_FC,LSS-ESS_N-limited 30C + 100mM LiCl_q.value,LSS-ESS_N-limited 30C + 100mM LiCl_FC,LSS-ESS_C-limited 30C + 100mM LiCl_q.value,LSS-ESS_C-limited 30C + 100mM LiCl_FC
0,CUT,CUT001--CUT437,1,both,30028,30950,-0.128454283,non-significant,non-significant,non-significant,...,1.0,1.966955,2.390000e-04,0.020278,0.389297,0.256269,0.293861,0.058330,0.001968,0.023032
1,CUT,CUT002,1,+,138607,138831,-0.644129986,-0.805297093,-0.216548183,-0.320866634,...,1.0,15.007987,7.280077e-03,0.019060,0.191974,0.284571,1.000000,0.157434,0.579547,0.061475
2,CUT,CUT005,1,+,180255,180511,,,,,...,1.0,5.335316,1.061972e-03,0.031227,0.027180,0.051301,1.000000,0.278810,1.000000,0.773978
3,CUT,CUT007,2,+,191991,192343,,,,,...,1.0,3.583513,3.590000e-07,0.003830,0.152099,0.182194,0.143802,0.065291,1.000000,0.099942
4,CUT,CUT008,2,+,237744,237928,non-significant,non-significant,-0.381190311,non-significant,...,1.0,12.377517,1.000000e+00,0.139347,0.944729,0.178498,1.000000,0.097792,1.000000,0.537298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,CUT,CUT873,15,-,1003415,1004087,,,,,...,1.0,0.269072,6.270000e-04,0.016518,1.000000,0.324277,1.000000,4.442078,0.124352,0.036079
63,SUT-CUT,SUT086--CUT102,5,+,7668,8134,-0.272163352,-0.188385856,-0.377701506,-0.481597931,...,1.0,1.896995,3.557592e-03,0.014247,1.000000,0.858757,1.000000,0.118071,0.816510,0.065340
64,SUT-CUT,SUT233--CUT707,11,both,229966,231479,-0.716207371,-0.644378768,-0.856118859,-0.839265734,...,1.0,8.447845,1.797692e-02,0.019331,0.057661,0.041079,1.000000,0.086992,1.000000,0.563029
65,SUT-CUT,SUT420--CUT425,16,+,607502,608688,non-significant,non-significant,non-significant,non-significant,...,1.0,1.806966,2.340000e-04,0.006599,1.000000,0.291476,1.000000,0.121357,0.884281,0.059028


In [176]:
nonoverlapping_cuts_df.Transcribed_strand.value_counts()

+       35
-       26
both     6
Name: Transcribed_strand, dtype: int64

In [92]:
nonoverlapping_cuts = nonoverlapping_cuts_df[['Type', 'ID', 'Chromosome', "Transcribed_strand", "Deletion_start_site", "Deletion_end_site"]]

In [98]:
nonoverlapping_cuts.Type.value_counts()

CUT        59
CUT-SUT     4
SUT-CUT     4
Name: Type, dtype: int64

In [102]:
nonoverlapping_cuts.rename(columns={'Deletion_start_site' : 'start', 'Deletion_end_site' : 'end', 'Chromosome' : 'chrom'}, inplace = True)
nonoverlapping_cuts

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Type,ID,chrom,Transcribed_strand,start,end
0,CUT,CUT001--CUT437,1,both,30028,30950
1,CUT,CUT002,1,+,138607,138831
2,CUT,CUT005,1,+,180255,180511
3,CUT,CUT007,2,+,191991,192343
4,CUT,CUT008,2,+,237744,237928
...,...,...,...,...,...,...
62,CUT,CUT873,15,-,1003415,1004087
63,SUT-CUT,SUT086--CUT102,5,+,7668,8134
64,SUT-CUT,SUT233--CUT707,11,both,229966,231479
65,SUT-CUT,SUT420--CUT425,16,+,607502,608688


In [104]:
nonoverlapping_cuts['sequence'] = genes_df.apply(select_gene,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonoverlapping_cuts['sequence'] = genes_df.apply(select_gene,axis=1)


In [118]:
nonoverlapping_cuts

Unnamed: 0,Type,ID,chrom,Transcribed_strand,start,end,sequence
0,CUT,CUT001--CUT437,1,both,30028,30950,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,CUT,CUT002,1,+,138607,138831,TGACAGATATTCTGCACTTAAAAACTAAAAATATTATACCAACTTT...
2,CUT,CUT005,1,+,180255,180511,TTCTTTCATGCAATGTGATGTCCATCGGAGAAAACTGTATCTTGTG...
3,CUT,CUT007,2,+,191991,192343,CGTCAAGATATTTGAAAGTTAATAGACAGTTAACAATAATAACAAC...
4,CUT,CUT008,2,+,237744,237928,ACGATTGACCAAGTCAGAAAAAAAAAAAAAAAGGAACTAAAAAAAG...
...,...,...,...,...,...,...,...
62,CUT,CUT873,15,-,1003415,1004087,GTGACGACGATATACAGATCCTATATTCTTCATGTATGTAATAGAT...
63,SUT-CUT,SUT086--CUT102,5,+,7668,8134,GCAGTAAAAATAGGGTAAACACAGGTAAAAGACGGTATGGAAACGG...
64,SUT-CUT,SUT233--CUT707,11,both,229966,231479,GAAAAAAATGCAGACCTTTCTTAATTCAACTTTTTTTCTTTCAAAT...
65,SUT-CUT,SUT420--CUT425,16,+,607502,608688,TTGAGTACAGTGGGACGATGTTTCAAAGGTCTGGCGCTGCTCATCA...


In [158]:
nonoverlapping_cuts.to_csv("non_overlapping_CUTs.csv")

In [175]:
nonoverlapping_cuts.nunique()

Type                   3
ID                    67
chrom                 16
Transcribed_strand     3
start                 67
end                   67
sequence              67
dtype: int64

In [111]:
nonoverlapping_cut_sequences = nonoverlapping_cuts.drop(columns=['Type', 'ID', 'chrom', 'Transcribed_strand','end', 'start'])

In [112]:
nonoverlapping_cut_sequences

Unnamed: 0,sequence
0,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,TGACAGATATTCTGCACTTAAAAACTAAAAATATTATACCAACTTT...
2,TTCTTTCATGCAATGTGATGTCCATCGGAGAAAACTGTATCTTGTG...
3,CGTCAAGATATTTGAAAGTTAATAGACAGTTAACAATAATAACAAC...
4,ACGATTGACCAAGTCAGAAAAAAAAAAAAAAAGGAACTAAAAAAAG...
...,...
62,GTGACGACGATATACAGATCCTATATTCTTCATGTATGTAATAGAT...
63,GCAGTAAAAATAGGGTAAACACAGGTAAAAGACGGTATGGAAACGG...
64,GAAAAAAATGCAGACCTTTCTTAATTCAACTTTTTTTCTTTCAAAT...
65,TTGAGTACAGTGGGACGATGTTTCAAAGGTCTGGCGCTGCTCATCA...


In [122]:
nonoverlapping_cut_sequences.nunique()

sequence    67
dtype: int64

In [123]:
overlapping_cut_sequences.nunique()

sequence    843
dtype: int64

In [155]:
noncoding_sequences.nunique()

sequence    219
dtype: int64

In [160]:
noncoding_sequences.drop_duplicates(inplace = True)
noncoding_sequences

Unnamed: 0,sequence
0,GGGCCCTTTCTTCCGTTTGAACGTAAAGGCATTTTTGAGACCATTA...
1,GTGAATGATGAATTTAATTCTTTGGTCCGTGTTTATGATGGGAAGT...
2,GGTTGTTTGGCCGAGCGGTCTAAGGCGCCTGATTCAAGAAATATCT...
3,GGGCGTGTGGTCTAGTGGTATGATTCTCGCTTTGGGCGACTTCCTG...
4,GGGCACATGGCGCAGTTGGTAGCGCGCTTCCCTTGCAAGGAAGAGG...
...,...
424,GTGCTACTAAAGTTGTGCGTTTCAGTTAGATTTCTCAAAGGCATTA...
425,GTCGACGTACTTCAGTATGTTTTATACCATATACTTTATTAGGAAT...
426,TATTATGATGATTTTTTTATATTCACACTGTACTAGATTGGTCTCT...
427,TGATATGATGATTTGTTGTCGACCGGGCGGACATATTAGTATCTGT...


In [161]:
merged_cuts_sequences = nonoverlapping_cut_sequences.append(overlapping_cut_sequences)
merged_cuts_sequences.reset_index(drop = True)

Unnamed: 0,sequence
0,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,TGACAGATATTCTGCACTTAAAAACTAAAAATATTATACCAACTTT...
2,TTCTTTCATGCAATGTGATGTCCATCGGAGAAAACTGTATCTTGTG...
3,CGTCAAGATATTTGAAAGTTAATAGACAGTTAACAATAATAACAAC...
4,ACGATTGACCAAGTCAGAAAAAAAAAAAAAAAGGAACTAAAAAAAG...
...,...
905,AAACTTGCCGTTAACAATAAGTGACCTTGCATGGAAGACTGTCGAT...
906,TGCTAACTTTTATCATCAAATCAGTAAACCCCTTCTTGTTTTTTCT...
907,CACAGATGAGGTCAGATTTAACTATTGACCTTCTTGAAACCAAATA...
908,AACAGGTTCTTTCAACTCTGGAAATTCATCCACAATCTTGTCAGCA...


In [178]:
merged_cuts_sequences.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
838    False
839    False
840    False
841    False
842    False
Length: 910, dtype: bool

In [163]:
merged_cuts_ncoding = merged_cuts_sequences.drop_duplicates().append(noncoding_sequences)

In [164]:
merged_cuts_ncoding.reset_index(drop = True)

Unnamed: 0,sequence
0,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,TGACAGATATTCTGCACTTAAAAACTAAAAATATTATACCAACTTT...
2,TTCTTTCATGCAATGTGATGTCCATCGGAGAAAACTGTATCTTGTG...
3,CGTCAAGATATTTGAAAGTTAATAGACAGTTAACAATAATAACAAC...
4,ACGATTGACCAAGTCAGAAAAAAAAAAAAAAAGGAACTAAAAAAAG...
...,...
1120,GTGCTACTAAAGTTGTGCGTTTCAGTTAGATTTCTCAAAGGCATTA...
1121,GTCGACGTACTTCAGTATGTTTTATACCATATACTTTATTAGGAAT...
1122,TATTATGATGATTTTTTTATATTCACACTGTACTAGATTGGTCTCT...
1123,TGATATGATGATTTGTTGTCGACCGGGCGGACATATTAGTATCTGT...


In [165]:
merged_cuts_ncoding.nunique()

sequence    1125
dtype: int64

In [166]:
merged_cutsoverlap_noncoding = overlapping_cut_sequences.append(noncoding_sequences)
merged_cutsoverlap_noncoding.reset_index(drop = True)

Unnamed: 0,sequence
0,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,CTTGTCGAAAATACGCGGTGTAGGGAGTTATGGTGGATAACTTTTT...
2,AATTACTTCCCTGGCTCGCTCCTCCACTGCCTGGGTAAATTGTTCC...
3,ATTTTGGTAAATGCACACGGATAAATGCATGATATATAAAATATTA...
4,AATGGTAGTCATATCATGTCAAGAATAGGTATCCAAAACGCAGCGG...
...,...
1057,GTGCTACTAAAGTTGTGCGTTTCAGTTAGATTTCTCAAAGGCATTA...
1058,GTCGACGTACTTCAGTATGTTTTATACCATATACTTTATTAGGAAT...
1059,TATTATGATGATTTTTTTATATTCACACTGTACTAGATTGGTCTCT...
1060,TGATATGATGATTTGTTGTCGACCGGGCGGACATATTAGTATCTGT...


In [167]:
merged_cutsoverlap_noncoding.nunique()

sequence    1062
dtype: int64

In [169]:
merged_cutsnonoverlap_noncoding = nonoverlapping_cut_sequences.append(noncoding_sequences)
merged_cutsnonoverlap_noncoding.reset_index(drop = True)

Unnamed: 0,sequence
0,TGTATTTTCCACATAGAAAATTCGATTTTTTTTTTTCAATGCACCA...
1,TGACAGATATTCTGCACTTAAAAACTAAAAATATTATACCAACTTT...
2,TTCTTTCATGCAATGTGATGTCCATCGGAGAAAACTGTATCTTGTG...
3,CGTCAAGATATTTGAAAGTTAATAGACAGTTAACAATAATAACAAC...
4,ACGATTGACCAAGTCAGAAAAAAAAAAAAAAAGGAACTAAAAAAAG...
...,...
281,GTGCTACTAAAGTTGTGCGTTTCAGTTAGATTTCTCAAAGGCATTA...
282,GTCGACGTACTTCAGTATGTTTTATACCATATACTTTATTAGGAAT...
283,TATTATGATGATTTTTTTATATTCACACTGTACTAGATTGGTCTCT...
284,TGATATGATGATTTGTTGTCGACCGGGCGGACATATTAGTATCTGT...


In [170]:
merged_cutsnonoverlap_noncoding.nunique()

sequence    286
dtype: int64

In [180]:
nonoverlapping_cut_sequences.to_csv("nonoverlapping_cut_sequences.csv")

In [181]:
overlapping_cut_sequences.to_csv("overlapping_cut_sequences.csv")

In [187]:
noncoding_sequences.drop_duplicates().reset_index(drop=True).to_csv("noncoding_sequences.csv")

In [188]:
!wget -c https://repo.continuum.io/archive/Anaconda3-5.1.0-Linux-x86_64.sh

!chmod +x Anaconda3-5.1.0-Linux-x86_64.sh

!bash ./Anaconda3-5.1.0-Linux-x86_64.sh -b -f -p /usr/local

/bin/bash: wget: command not found
chmod: Anaconda3-5.1.0-Linux-x86_64.sh: No such file or directory
bash: ./Anaconda3-5.1.0-Linux-x86_64.sh: No such file or directory
