In [15]:
import time
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from matplotlib import cm
from collections import OrderedDict
import matplotlib as mpl
import matplotlib.colors as colors
import re

import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import seaborn as sns

# Show up to 100 rows when a DataFrame is displayed in the notebook.
pd.set_option('display.max_rows', 100)

# The function must be *called*; a bare reference to it is a no-op expression
# and leaves Bokeh without notebook output configured.
bokeh.io.output_notebook()

import scipy.stats as sps

In [16]:
start = time.time()

#----------------------------Formatting and Annotating R-output-----------------------------

def _read_cluster_representatives(cdhit_file):
    """
        Input: CDHIT cluster file
        Output: dict mapping cluster number (string) -> cluster representative name

        Raises ValueError if a representative line appears before any
        '>Cluster N' header line.
    """
    representatives = {}
    current_cluster = None
    with open(cdhit_file, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('>'):
                # Header line such as ">Cluster 12": keep only the number.
                current_cluster = raw_line.split('\t')[0].replace('>Cluster ', '').replace('\n', '')
                continue
            fields = raw_line.split('\t')
            # Blank or tab-less lines have no second field; lines without the
            # '... *' marker are not the representative entry of the cluster.
            if len(fields) < 2 or '... *' not in fields[1]:
                continue
            if current_cluster is None:
                raise ValueError(
                    "Representative entry found before any '>Cluster' header in "
                    + str(cdhit_file)
                )
            # The second space-separated token looks like ">name..."; strip the decoration.
            name_token = fields[1].split(' ')[1]
            representatives[current_cluster] = name_token.replace('>', '').replace('...', '')
    return representatives


def annotate_R_abridged(cdhit_file, abridged_R_file):
    """
        Input: CDHIT cluster file and abridged R outputfile containing p-adj values
        Output: newly annotated dataframe

        The first comma-separated field of every line in the R file is looked
        up as a cluster number; lines whose first field is not a known cluster
        (e.g. the header line) are skipped. The matched representatives are
        added, in file order, as the column 'cluster_representative'.

        Raises ValueError if the number of matched lines differs from the
        number of rows pandas reads from the file, i.e. if at least one row
        could not be annotated.
    """
    representatives = _read_cluster_representatives(cdhit_file)

    cluster_rep_list = []
    with open(abridged_R_file, 'r') as handle:
        for raw_line in handle:
            # Tolerate surrounding whitespace and R-style quoting ("12") around the ID.
            cluster_id = raw_line.split(',')[0].strip().strip('"')
            if cluster_id in representatives:
                cluster_rep_list.append(representatives[cluster_id])

    temp_dfC1 = pd.read_csv(abridged_R_file)
    # Fail with an explicit message instead of pandas' generic length-mismatch error.
    if len(cluster_rep_list) != len(temp_dfC1):
        raise ValueError(
            "Matched {} cluster representative(s) for {} row(s) in {}; every row's first "
            "column must be a cluster number present in {}".format(
                len(cluster_rep_list), len(temp_dfC1), abridged_R_file, cdhit_file
            )
        )
    temp_dfC1['cluster_representative'] = pd.Series(cluster_rep_list, index=temp_dfC1.index)
    return temp_dfC1

#-------------------------------------------------------------------------------------------

end = time.time()
print(end-start)

0.0003151893615722656


In [17]:
start = time.time()

#---------------Add Protein Labels to R-analysis of Census SLine Data----------------

def _annotate_and_save(cluster_set, comparison, identity):
    """
        Input: CDHIT cluster-set tag (e.g. 'mouse'), comparison name (e.g. 'ChatMouse')
               and the identity suffix used in the file names (e.g. 65)
        Output: annotated dataframe, also written as '<comparison>-SC-DESEQ2-<identity>.csv'
                in the current working directory
    """
    cdhit_file = 'CDHIT/cdhitout-{}-{}.clstr'.format(cluster_set, identity)
    # The result is saved under the same base name as the input found in
    # Final_analysis-SC-Deseq2/, so the original input file is left untouched.
    result_name = '{}-SC-DESEQ2-{}.csv'.format(comparison, identity)
    annotated = annotate_R_abridged(cdhit_file, 'Final_analysis-SC-Deseq2/' + result_name)
    annotated.to_csv(result_name, index=False)
    return annotated

ChatMouse_65 = _annotate_and_save('mouse', 'ChatMouse', 65)
ChatMouse_75 = _annotate_and_save('mouse', 'ChatMouse', 75)
ChatMouse_85 = _annotate_and_save('mouse', 'ChatMouse', 85)
ChatMouse_95 = _annotate_and_save('mouse', 'ChatMouse', 95)

ChatMB_65 = _annotate_and_save('chat', 'ChatMB', 65)
ChatMB_75 = _annotate_and_save('chat', 'ChatMB', 75)
ChatMB_85 = _annotate_and_save('chat', 'ChatMB', 85)
ChatMB_95 = _annotate_and_save('chat', 'ChatMB', 95)

#----------

THMouse_65 = _annotate_and_save('mouse', 'THMouse', 65)
THMouse_75 = _annotate_and_save('mouse', 'THMouse', 75)
THMouse_85 = _annotate_and_save('mouse', 'THMouse', 85)
THMouse_95 = _annotate_and_save('mouse', 'THMouse', 95)

THMB_65 = _annotate_and_save('TH', 'THMB', 65)
THMB_75 = _annotate_and_save('TH', 'THMB', 75)
THMB_85 = _annotate_and_save('TH', 'THMB', 85)
THMB_95 = _annotate_and_save('TH', 'THMB', 95)

#------------------------------------------------------------------------------------

end = time.time()
print(end-start)

3.954814910888672
