### Basic functions
This notebook is used for basic calculations:

    - Adding read counts when counted in terminal
    - Making sure every read is listed as both a forward and a reverse read, so that reads which failed the FastQC "per base sequence content" check are trimmed as pairs.

In [2]:
import pandas as pd 
import numpy as np

#### Adding read counts from a list

In [3]:
def summing_numbers_in_file(filepath):
    """Sum all lines of a text file that consist solely of decimal digits.

    The file is split on "\n"; every line made up only of decimal digits is
    converted to ``int`` and added to the total. All other lines (blank
    lines, text, negative or fractional numbers, lines with surrounding
    whitespace) are ignored.

    Parameters
    ----------
    filepath : str or path-like
        Path of the text file to read.

    Returns
    -------
    int
        Sum of the numeric lines; 0 if there are none.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open(filepath, "r") as handle:
        lines = handle.read().split("\n")

    # Filter while building the sum instead of calling list.remove() inside a
    # loop over the same list: removing during iteration skips the element
    # that follows each removed one, so consecutive non-numeric lines (e.g.
    # several trailing blank lines) survived and made int() raise.
    # isdecimal() is used because it matches exactly what int() can parse.
    return sum(int(line) for line in lines if line.isdecimal())

In [4]:
def summing_numbers_with_more_seperators(filepath, seperator):
    """Sum the numbers that follow the first separator on each line of a file.

    The file is split on "\n". For every line, the text after the first
    occurrence of ``seperator`` is taken; if that text consists solely of
    decimal digits it is converted to ``int`` and added to the total. Lines
    without the separator (including blank lines) and lines whose remainder
    is not a plain number are ignored.

    Parameters
    ----------
    filepath : str or path-like
        Path of the text file to read.
    seperator : str
        Non-empty string separating the leading field from the number,
        e.g. ":" for lines such as "name:123".

    Returns
    -------
    int
        Sum of the extracted numbers; 0 if there are none.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If ``seperator`` is an empty string.
    """
    # The context manager closes the handle even if reading fails.
    with open(filepath, "r") as handle:
        lines = handle.read().split("\n")

    total = 0
    for line in lines:
        # partition() never raises on a missing separator (unlike
        # split(sep, 1)[1]); `found` is empty when the line has no separator,
        # which is the case for the empty string after a trailing newline.
        _, found, value = line.partition(seperator)
        # Accumulate directly rather than removing items from a list that is
        # being iterated, which skipped the element after every removal.
        if found and value.isdecimal():
            total += int(value)
    return total

In [5]:
# Total of all numeric lines in count_singles.list
# (presumably the counts for unpaired/singleton read files — confirm).
singles_count = summing_numbers_in_file("count_singles.list")

In [6]:
# Total of all numeric lines in count_r1.list
# (presumably the counts for the forward/R1 read files — confirm).
r1_count = summing_numbers_in_file("count_r1.list")

In [7]:
# Display the summed R1 count.
r1_count

5850393084

In [8]:
# The R1 total is doubled and the singles total added.
# NOTE(review): doubling presumably assumes every R2 file holds exactly as
# many lines as its R1 mate, so R2 was not counted separately — confirm.
total_count = r1_count * 2 + singles_count

In [9]:
# NOTE(review): dividing by 4 presumably converts FASTQ line counts into read
# counts (4 lines per record) — confirm. True division yields a float.
reads_count = total_count/4

In [10]:
# Display the derived read count.
reads_count

2957817888.0

In [11]:
# Each line of count_contigs.list is split at the first ":" and the number
# after it is summed (presumably "<file>:<count>" lines — confirm).
contigs_count = summing_numbers_with_more_seperators("count_contigs.list", ":")

In [12]:
# contigs_count only counts header lines, so no further calculation is needed.
print(contigs_count)

31217582


In [13]:
# Total of all numeric lines in count_rawreads_R1.list
# (presumably the counts for the untrimmed R1 files — confirm).
raw_reads_R1 = summing_numbers_in_file("count_rawreads_R1.list")

In [14]:
# Same arithmetic as for the processed reads, without singles: double the R1
# total, then divide by 4.
# NOTE(review): presumably R2 == R1 and 4 lines per FASTQ record — confirm.
(raw_reads_R1 * 2) /4

3042581492.0

### making sure both forward and reverse reads are present

In [124]:
# Load the four ID lists (CL and SRR, read 1 and read 2). header=None means
# the first line is treated as data and the single column is labelled 0.
# NOTE(review): judging by the file names these are the samples that failed
# the FastQC per-base-sequence-content check — confirm.
Cl_R1 = pd.read_csv('3regionData/failBaseContentCL_1R.txt', header=None)
Cl_R2 = pd.read_csv('3regionData/failBaseContentCL_2R.txt', header=None)
SRR_R1 = pd.read_csv('3regionData/failBaseContentSRR_1R.txt', header=None)
SRR_R2 = pd.read_csv('3regionData/failBaseContentSRR_2R.txt', header=None)


In [125]:
# Strip the trailing read-direction suffix ("_1" for R1, "_2" for R2) so that
# IDs from the forward and reverse lists can be compared directly.
# regex=True must be explicit: since pandas 2.0 the default is regex=False,
# which would search for the literal text "_1$" and strip nothing.
# NOTE: this cell overwrites column 0 in place and is not idempotent — running
# it twice strips a second "_1"/"_2" from IDs that still end in one. Re-run
# the loading cell before re-running this one.
Cl_R1[0] = Cl_R1[0].str.replace(r'_1$', '', regex=True)
Cl_R2[0] = Cl_R2[0].str.replace(r'_2$', '', regex=True)
SRR_R1[0] = SRR_R1[0].str.replace(r'_1$', '', regex=True)
SRR_R2[0] = SRR_R2[0].str.replace(r'_2$', '', regex=True)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [126]:
# Inspect the CL read-1 IDs after the suffix has been stripped.
Cl_R1

Unnamed: 0,0
0,CL100006210_L01_1
1,CL100006210_L01_2
2,CL100006210_L01_3
3,CL100006210_L01_4
4,CL100006210_L01_5
...,...
147,CL100020565_L01_5
148,CL100020565_L02_13
149,CL100023189_L01_23
150,CL100023189_L02_35


In [127]:
def get_different_rows(source_df, new_df):
    """Return the rows of ``new_df`` that have no match in ``source_df``.

    Both frames are outer-merged on their shared columns with the merge
    indicator switched on; only rows flagged as coming solely from the right
    side (``new_df``) are kept, and the helper ``_merge`` column is dropped
    from the result.
    """
    combined = pd.merge(source_df, new_df, how='outer', indicator=True)
    only_in_new = combined['_merge'] == 'right_only'
    return combined.loc[only_in_new].drop(columns='_merge')

In [128]:
missing_in_Cl_R1 = get_different_rows(Cl_R1, Cl_R2) # IDs present in the R2 list but absent from the R1 list

In [129]:
missing_in_Cl_R2 = get_different_rows(Cl_R2, Cl_R1) # IDs present in the R1 list but absent from the R2 list

In [130]:
# Number of IDs in the CL read-1 list.
len(Cl_R1)

152

In [131]:
# Number of IDs in the CL read-2 list.
len(Cl_R2)

57

In [132]:
# Complete each list with the IDs that only appeared in its mate, so both
# lists end up covering the IDs found in either direction.
# The concatenated Series keep their original index labels (duplicates are
# possible); no de-duplication is performed here.
all_Cl_R1 = pd.concat([Cl_R1[0], missing_in_Cl_R1[0]])
all_Cl_R2 = pd.concat([Cl_R2[0], missing_in_Cl_R2[0]])

In [133]:
# Size of the completed CL read-2 list.
len(all_Cl_R2)

153

In [134]:
missing_in_SRR_R1 = get_different_rows(SRR_R1, SRR_R2) # IDs present in the R2 list but absent from the R1 list

In [135]:
missing_in_SRR_R2 = get_different_rows(SRR_R2, SRR_R1) # IDs present in the R1 list but absent from the R2 list

In [136]:
# How many SRR IDs are listed for R2 only.
len(missing_in_SRR_R1)

2

In [137]:
# How many SRR IDs are listed for R1 only.
len(missing_in_SRR_R2)

3

In [138]:
# Complete each SRR list with the IDs that only appeared in its mate.
# Original index labels are kept and no de-duplication is performed.
all_SRR_R1 = pd.concat([SRR_R1[0], missing_in_SRR_R1[0]])
all_SRR_R2 = pd.concat([SRR_R2[0], missing_in_SRR_R2[0]])

In [139]:
# Size of the completed SRR read-2 list.
len(all_SRR_R2)

205

In [140]:
# Sanity check: after completion, every ID in the R2 list must also be in the
# R1 list, so this should display an empty frame. The check has to compare R1
# against R2 — comparing all_Cl_R1 with itself is empty by construction and
# verifies nothing.
get_different_rows(pd.DataFrame(all_Cl_R1), pd.DataFrame(all_Cl_R2))

Unnamed: 0,0


In [145]:
# Write each completed ID list, sorted alphabetically, as a plain one-column
# text file without header row or index column.
for _out_path, _ids in (
    ('3regionData/all_Cl_R1.txt', all_Cl_R1),
    ('3regionData/all_Cl_R2.txt', all_Cl_R2),
    ('3regionData/all_SRR_R1.txt', all_SRR_R1),
    ('3regionData/all_SRR_R2.txt', all_SRR_R2),
):
    _sorted_ids = pd.DataFrame(_ids).sort_values(by=0)
    _sorted_ids.to_csv(_out_path, header = None, index=False)