In [1]:
## After CuffDiff

#### This file contains the basic function of analysis from cuffdiff results
#### Author: Xiang Li
#### Sample

import numpy as np
import pandas as pd
import os

In [26]:
# For any input of /gene_exp.diff, return its All genes with
### gene_id, cond1, cond2, log2(fold_change), p_value, plus a number of order.
def generate_All_Genes(Input_Path, number):
#### READ FILE
    df = pd.read_csv(Input_Path+'/gene_exp.diff', sep='\t', header=0, usecols={'test_id',\
    'status','sample_1','sample_2','value_1','value_2','log2(fold_change)','p_value','q_value'})
#### Rename columns
    df=df.rename(columns={'test_id':'gene_id', 'value_1': df['sample_1'].unique()[0], 'value_2': df['sample_2'].unique()[0]})
#### Output   
    return df.loc[:,['gene_id',df['sample_1'].unique()[0],df['sample_2'].unique()[0],'log2(fold_change)',
                     'p_value','q_value', 'End_'+str(number), '||']].fillna('')
####################################################################################


### For any input of /gene_exp.diff, return its up_DEGs genes with
### gene_id, cond1, cond2, log2(fold_change), p_value, plus a number of order.
### Parameters for DEGs:
FC_UP=1.5
q_value_less=0.05
FPKM_threshold=1.0
####################################################################################

def generate_Upregulated_Genes(Input_Path):
#### READ FILE
    df = pd.read_csv(Input_Path+'/gene_exp.diff', sep='\t', header=0, usecols={'test_id',\
    'status','sample_1','sample_2', 'value_1','value_2','log2(fold_change)','p_value','q_value'})
#### Filter
    df=df[(df['status']=='OK') & (df['q_value']<=q_value_less) & (df['value_2']>=FPKM_threshold) & (df['log2(fold_change)'] >= np.log2(FC_UP))]
#### Rearrange Columns
    df=df.rename(columns={'test_id':'gene_id','value_1': df['sample_1'].unique()[0], 'value_2': df['sample_2'].unique()[0]})
#### Output
    return df.loc[:,['gene_id', df['sample_1'].unique()[0],df['sample_2'].unique()[0],'log2(fold_change)','p_value','q_value']]
####################################################################################

def generate_Downregulated_Genes(Input_Path):
#### READ FILE FROM CuffDiff Results
    df = pd.read_csv(Input_Path+'/gene_exp.diff', sep='\t', header=0, usecols={'test_id',\
    'status','sample_1','sample_2', 'value_1','value_2','log2(fold_change)','p_value','q_value'})
#### Filter
    df=df[(df['status']=='OK') & (df['q_value']<=q_value_less) & (df['value_1']>=FPKM_threshold) & (df['log2(fold_change)']<= -np.log2(FC_UP))]   
#### Rearrange Columns
    df=df.rename(columns={'test_id':'gene_id','value_1': df['sample_1'].unique()[0], 'value_2': df['sample_2'].unique()[0]})
#### Output
    return df.loc[:,['gene_id', df['sample_1'].unique()[0],df['sample_2'].unique()[0],'log2(fold_change)','p_value','q_value']]


#### Functions to read /genes.read_group_tracking and output FPKM
def generate_genes_FPKM_df(Input_Path):
    df = pd.read_csv(INPUT_PATH+'/genes.read_group_tracking', sep='\t', header=0)
    FPKM_df = None
    for cond in df['condition'].unique():
        for replica in df[df['condition'] == cond]['replicate'].unique():
            #print (cond, replica)
            temp_df = (df[ (df['condition'] == cond) & (df['replicate'] == replica)].loc[:,['tracking_id', 'FPKM']])
            cond_name= cond+'_'+str(replica)
            temp_df.rename(columns={'tracking_id': 'gene_id', 'FPKM': cond_name }, inplace=True)
            if FPKM_df is None:
                FPKM_df
                FPKM_df = temp_df  
            else:
                FPKM_df = FPKM_df.merge(temp_df, on='gene_id', how='outer', suffixes=('','_'))
    return FPKM_df

def Add_common_header(df,common_header):
    name_list=[common_header]*len(df.columns)
    tuples = list(zip(name_list,df.columns))
    df.columns = pd.MultiIndex.from_tuples(tuples)
    return df


def DIR_CHECK_CREATE(Input_Path):
    if (not os.path.isdir(Input_Path)):
        print ("Dir check and create is" + Input_Path)
        os.mkdir(Input_Path)
    else: print ('Input_Path exists')

In [None]:
INPUT_LIST=os.listdir(PATH_FOLDER)

In [27]:
PATH_FOLDER=os.getcwd()+ '/CuffDiff_Results/'
OUT_FOLDER=os.getcwd()+'/genelist/'

DIR_CHECK_CREATE(PATH_FOLDER)
DIR_CHECK_CREATE(OUT_FOLDER)


INPUT_LIST=os.listdir(PATH_FOLDER)
writer = pd.ExcelWriter(OUT_FOLDER+'CD8-HP_CuffDiff_Summary201806.xlsx', engine='xlsxwriter')

i=0
for input_name in INPUT_LIST[:]:
    INPUT_PATH = PATH_FOLDER+input_name
    if (i==0):
        df_all=generate_genes_FPKM_df(INPUT_PATH)
        df_all[input_name]=''
        df_all = df_all.merge(generate_All_Genes(INPUT_PATH,i+1), on='gene_id', how='inner', suffixes=('','_')) 
        i+=1
        continue
    df_all = df_all.merge(generate_genes_FPKM_df(INPUT_PATH), on='gene_id', how='inner', suffixes=('','_'))
    df_all[input_name]=''
    df_all = df_all.merge(generate_All_Genes(INPUT_PATH,i+1), on='gene_id', how='inner', suffixes=('','_')) 
    
    i+=1
df_all.to_excel(writer, sheet_name='All_Genes', index=None)

for input_name in INPUT_LIST:
    INPUT_PATH = PATH_FOLDER+input_name
    print ('Library:' + input_name)
    df_up = generate_genes_FPKM_df(INPUT_PATH)
    df_up[input_name]=''
    df_up = df_up.merge(generate_Upregulated_Genes(INPUT_PATH), on='gene_id', how='inner', suffixes=('','_'))
    
    df_up.to_excel( writer, sheet_name='up_'+input_name, index=None)
    print ('# of Up:' )
    print(df_up.shape)
    
    df_down = generate_genes_FPKM_df(INPUT_PATH)
    df_down[input_name]=''
    df_down = df_down.merge(generate_Downregulated_Genes(INPUT_PATH), on='gene_id', how='inner', suffixes=('','_')) 
    df_down.to_excel( writer, sheet_name='down_'+input_name,index=None)
    print ('# of Down')
    print(df_down.shape)
    print ('')


#writer.save()

Input_Path exists
Input_Path exists
Library:DKO_0h_vs_Ctrl_34n_ref
# of Up:
(1601, 12)
# of Down
(1575, 12)

Library:DKO_0h_vs_WT_0h
# of Up:
(647, 13)
# of Down
(704, 13)

Library:DKO_72h_vs_DKO_0h
# of Up:
(1457, 13)
# of Down
(1025, 13)

Library:DKO_72h_vs_WT_72h
# of Up:
(441, 13)
# of Down
(749, 13)

Library:WT_72h_vs_WT_0h
# of Up:
(1812, 13)
# of Down
(1272, 13)



In [25]:
df_all=generate_genes_FPKM_df(INPUT_PATH)
df_down = generate_genes_FPKM_df(INPUT_PATH)
df_down[df_down['gene_id']=='Rnaset2a,Rnaset2b']

Unnamed: 0,gene_id,WT_0h_0,WT_0h_1,WT_0h_2,WT_72h_0,WT_72h_1,WT_72h_2


#### DEGs Generating


In [12]:
#### DEGs Generating
DIR_CHECK_CREATE(os.getcwd()+'/genelist/')
PATH_FOLDER=os.getcwd()+ '/CuffDiff_Results/'
INPUT_LIST=os.listdir(PATH_FOLDER)
for input_name in INPUT_LIST:
    INPUT_PATH = PATH_FOLDER+input_name
    print ('Library:' + input_name)
    df_up = generate_Upregulated_Genes(INPUT_PATH)
    df_up.to_csv( os.getcwd()+'/genelist/up_'+input_name +'.bed' ,sep='\t', index=None)
    print ('# of Upregulated_genes:' )
    print(df_up.shape)
    
    df_down = generate_Downregulated_Genes(INPUT_PATH)  
    df_down.to_csv( os.getcwd()+'/genelist/down_'+input_name + '.bed' ,sep='\t', index=None)
    
    print ('# of Downregulated_genes')
    print(df_down.shape)
    print ('')
    break
    


Input_Path exists
Library:DKO_0h_vs_Ctrl_34n_ref
# of Upregulated_genes:
(1601, 6)
# of Downregulated_genes
(1575, 6)

