In [1]:
import pyranges as pr
import numpy as np
import pandas as pd

# Scratches

In [2]:
def perfcheck(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, print the elapsed wall time, return its result.

    The elapsed time is measured with ``time.perf_counter`` and printed as
    ``'<seconds> sec'``.
    """
    from time import perf_counter

    started = perf_counter()
    outcome = func(*args, **kwargs)
    elapsed = perf_counter() - started
    print(f'{elapsed} sec')

    return outcome

In [19]:
import functools
import itertools

def join_common(left_df, right_df):
    """Build the pairwise overlap matrix between two interval DataFrames.

    Both frames must have 'Chromosome', 'Start' and 'End' columns; every other
    column is treated as an annotation column, and the two frames must not
    share any annotation column name.

    Returns a tuple ``(compare_result, left_annot_cols, right_annot_cols)``
    where ``compare_result[i, j]`` is True when left row ``i`` and right row
    ``j`` are on the same chromosome and ``right.End > left.Start`` and
    ``right.Start < left.End``.
    """
    coord_cols = ['Chromosome', 'Start', 'End']
    left_annot_cols = left_df.columns.drop(coord_cols)
    right_annot_cols = right_df.columns.drop(coord_cols)
    assert not set(left_annot_cols).intersection(right_annot_cols), 'Input DataFrames has overlapping annotation columns'

    # Left values become column vectors so each comparison broadcasts to
    # a (n_left, n_right) matrix.
    left_chrom = left_df.Chromosome.to_numpy()[:, np.newaxis]
    left_start = left_df.Start.to_numpy()[:, np.newaxis]
    left_end = left_df.End.to_numpy()[:, np.newaxis]

    same_chrom = (left_chrom == right_df.Chromosome.to_numpy())
    right_ends_after = (right_df.End.to_numpy() > left_start)
    right_starts_before = (right_df.Start.to_numpy() < left_end)
    compare_result = np.logical_and(
        np.logical_and(same_chrom, right_ends_after),
        right_starts_before,
    )

    return compare_result, left_annot_cols, right_annot_cols

# slow variant of join_common: stacks both position comparisons into one 3-D array
def join_common2(left_df, right_df):
    """Variant of ``join_common`` that evaluates both position tests in one stacked comparison.

    The two inequalities ``left.Start < right.End`` and
    ``-left.End < -right.Start`` are stacked along a new leading axis,
    compared in one go, and then reduced with ``all(axis=0)``.

    Returns ``(compare_result, left_annot_cols, right_annot_cols)`` exactly
    like ``join_common``.
    """
    coord_cols = ['Chromosome', 'Start', 'End']
    left_annot_cols = left_df.columns.drop(coord_cols)
    right_annot_cols = right_df.columns.drop(coord_cols)
    assert not set(left_annot_cols).intersection(right_annot_cols), 'Input DataFrames has overlapping annotation columns'

    same_chrom = (left_df.Chromosome.to_numpy()[:, np.newaxis] == right_df.Chromosome.to_numpy())

    # Left side: column vectors; right side: row vectors. Negating End/Start
    # turns the second test into a "<" so both share a single comparison.
    left_stack = np.array([
        left_df.Start.to_numpy()[:, np.newaxis],
        -left_df.End.to_numpy()[:, np.newaxis],
    ])
    right_stack = np.array([
        right_df.End.to_numpy()[np.newaxis, :],
        -right_df.Start.to_numpy()[np.newaxis, :],
    ])
    positions_overlap = (left_stack < right_stack).all(axis=0)
    compare_result = same_chrom & positions_overlap

    return compare_result, left_annot_cols, right_annot_cols


def join_common3(left_df, right_df):
    """Return the pairwise overlap matrix for two coordinate-first DataFrames.

    Columns from position 3 onward are treated as annotation columns and must
    not share names between the two frames. Entry ``[i, j]`` of the result is
    True when left row ``i`` and right row ``j`` have equal Chromosome values
    and ``right.End > left.Start`` and ``right.Start < left.End``.
    """
    shared_annot = set(left_df.columns[3:]).intersection(right_df.columns[3:])
    assert not shared_annot, 'Input DataFrames has overlapping annotation columns'

    # Column vectors on the left side broadcast against the right-side rows.
    left_chrom = left_df.Chromosome.to_numpy()[:, np.newaxis]
    left_start = left_df.Start.to_numpy()[:, np.newaxis]
    left_end = left_df.End.to_numpy()[:, np.newaxis]

    same_chrom = (left_chrom == right_df.Chromosome.to_numpy())
    right_ends_after = (right_df.End.to_numpy() > left_start)
    right_starts_before = (right_df.Start.to_numpy() < left_end)

    return np.logical_and(np.logical_and(same_chrom, right_ends_after), right_starts_before)


def join_common4(left_df, right_df):
    """Return the pairwise position-overlap matrix, ignoring chromosomes.

    Columns from position 3 onward are treated as annotation columns and must
    not share names between the two frames. Entry ``[i, j]`` of the result is
    True when ``right.End[j] > left.Start[i]`` and
    ``right.Start[j] < left.End[i]``; the Chromosome column is not compared.
    """
    shared_annot = set(left_df.columns[3:]).intersection(right_df.columns[3:])
    assert not shared_annot, 'Input DataFrames has overlapping annotation columns'

    left_start = left_df.Start.to_numpy()[:, np.newaxis]
    left_end = left_df.End.to_numpy()[:, np.newaxis]
    right_ends_after = (right_df.End.to_numpy() > left_start)
    right_starts_before = (right_df.Start.to_numpy() < left_end)

    return np.logical_and(right_ends_after, right_starts_before)


def inner_join(left_df, right_df):
    """Inner-join two interval DataFrames on overlapping intervals.

    One output row is produced per overlapping (left row, right row) pair as
    reported by ``join_common``: all columns of the left row followed by the
    annotation (non-coordinate) columns of the right row. The result has a
    fresh 0..n-1 index. Numbered progress markers are printed between steps.
    """
    print(0)
    overlap_matrix, _left_annot, right_annot = join_common(left_df, right_df)

    print(1)
    left_hits, right_hits = np.where(overlap_matrix)
    # Positional indices of the right annotation columns, for use with .iloc.
    right_colnames = right_df.columns.to_list()
    annot_positions = [right_colnames.index(name) for name in right_annot]
    print(2)
    left_part = left_df.iloc[left_hits, :].reset_index(drop=True)
    print(3)
    right_part = right_df.iloc[right_hits, annot_positions].reset_index(drop=True)
    print(4)
    joined = pd.concat([left_part, right_part], axis=1, ignore_index=False)
    print(5)
    return joined


def inner_join2(left_df, right_df):
    """Inner-join two interval DataFrames on overlapping intervals.

    Both frames are first reordered so that 'Chromosome', 'Start', 'End' are
    the leading columns. Each overlapping pair found by ``join_common3``
    yields one row: the full left row followed by the right row's columns from
    position 3 onward. The result has a fresh 0..n-1 index.
    """
    coord_cols = ['Chromosome', 'Start', 'End']

    def coords_first(frame):
        # Coordinate columns first, remaining columns in their existing order.
        trailing = frame.columns.drop(coord_cols).to_list()
        return frame.loc[:, (coord_cols + trailing)]

    left_ordered = coords_first(left_df)
    right_ordered = coords_first(right_df)
    overlap_matrix = join_common3(left_ordered, right_ordered)

    left_hits, right_hits = np.where(overlap_matrix)
    left_part = left_ordered.iloc[left_hits, :].reset_index(drop=True)
    right_part = right_ordered.iloc[right_hits, 3:].reset_index(drop=True)

    return pd.concat([left_part, right_part], axis=1, ignore_index=False)


def inner_join3(left_df, right_df, how='inner'):
    """Join two interval DataFrames on overlapping intervals via an index join.

    ``how`` must be 'inner' or 'left'. Both frames are reordered so the
    coordinate columns lead and are given a fresh 0..n-1 index. The right
    annotation rows selected by ``join_common3`` are re-labelled with the
    position of the left row they overlap, then joined onto the left frame
    with ``DataFrame.join``.
    """
    assert how in ('inner', 'left')

    coord_cols = ['Chromosome', 'Start', 'End']

    def normalise(frame):
        # Coordinates first, then the other columns; index reset to 0..n-1.
        trailing = frame.columns.drop(coord_cols).to_list()
        return frame.loc[:, (coord_cols + trailing)].reset_index(drop=True)

    left_norm = normalise(left_df)
    right_norm = normalise(right_df)

    left_hits, right_hits = np.where(join_common3(left_norm, right_norm))
    right_annot = right_norm.iloc[right_hits, 3:]
    # Label each right annotation row with the left row position it overlaps.
    right_annot.index = left_hits

    return left_norm.join(right_annot, how=how)


def inner_join4(left_df, right_df, how='inner'):
    """Join two interval DataFrames on position overlap only.

    Same procedure as an index join on overlapping pairs, but the overlap
    matrix comes from ``join_common4``, which does not compare chromosomes.
    ``how`` must be 'inner' or 'left'.
    """
    assert how in ('inner', 'left')

    coord_cols = ['Chromosome', 'Start', 'End']
    left_order = coord_cols + left_df.columns.drop(coord_cols).to_list()
    right_order = coord_cols + right_df.columns.drop(coord_cols).to_list()
    left_norm = left_df.loc[:, left_order].reset_index(drop=True)
    right_norm = right_df.loc[:, right_order].reset_index(drop=True)

    overlap_matrix = join_common4(left_norm, right_norm)
    left_hits, right_hits = np.where(overlap_matrix)

    # Right annotation rows, re-labelled by the left row position they overlap.
    right_annot = right_norm.iloc[right_hits, 3:]
    right_annot.index = left_hits

    return left_norm.join(right_annot, how=how)


def process_right_df(left_df, right_df):
    """Select right annotation rows that overlap left rows and index them for a join.

    Overlaps come from ``join_common4`` (positions only). The returned frame
    holds the right columns from position 3 onward, one row per overlapping
    pair, indexed by a two-level MultiIndex built from the first Chromosome
    entry of ``left_df`` and the position of the overlapping left row.
    """
    left_hits, right_hits = np.where(join_common4(left_df, right_df))

    right_annot = right_df.iloc[right_hits, 3:]
    chrom_level = left_df.Chromosome[:1]
    right_annot.index = pd.MultiIndex.from_product([chrom_level, left_hits])

    return right_annot


def inner_join_bychrom(left_df, right_df, how='inner'):
    """Join two interval DataFrames chromosome by chromosome.

    Both frames are split with ``groupby('Chromosome')``; every chromosome
    present in both is joined with ``inner_join4`` and the per-chromosome
    results are concatenated row-wise.

    Parameters
    ----------
    left_df, right_df : DataFrame
        Frames with 'Chromosome', 'Start' and 'End' columns.
    how : {'inner', 'left'}
        Passed to ``inner_join4``. With 'left', the sub-frames of chromosomes
        that occur only in ``left_df`` are appended unchanged.

    Returns
    -------
    DataFrame
        The concatenated result. The index is not reset after concatenation,
        so labels can repeat across chromosomes. When there is nothing to
        concatenate (e.g. an inner join with no shared chromosome), an empty
        frame with the joined column layout is returned.

    Numbered progress markers and the chromosome being processed are printed.
    """
    assert how in ('inner', 'left')
    print(0)
    left_bychrom = dict((key, subdf) for key, subdf in left_df.groupby('Chromosome'))
    right_bychrom = dict((key, subdf) for key, subdf in right_df.groupby('Chromosome'))
    print(1)
    left_chroms = set(left_bychrom.keys())
    right_chroms = set(right_bychrom.keys())
    common_chroms = left_chroms.intersection(right_chroms)
    leftonly_chroms = left_chroms.difference(right_chroms)
    print(2)
    joined_bychrom = dict()
    for chrom in common_chroms:
        print(f'chrom {chrom}')
        joined_bychrom[chrom] = inner_join4(left_bychrom[chrom], right_bychrom[chrom], how=how)
    print(3)
    pieces = list(joined_bychrom.values())
    if how == 'left':
        pieces.extend(left_bychrom[chrom] for chrom in leftonly_chroms)

    if pieces:
        result = pd.concat(pieces, axis=0)
    else:
        # pd.concat rejects an empty list, so join zero-row slices instead to
        # get an empty frame that still carries the joined columns.
        result = inner_join4(left_df.iloc[:0], right_df.iloc[:0], how=how)
    print(4)
    return result


# slower than inner_join_bychrom
def inner_join_bychrom2(left_df, right_df, how='inner'):
    """Join two interval DataFrames per chromosome with a single final index join.

    ``how`` must be 'inner' or 'left'. Both frames are reordered so the
    coordinate columns lead, split by Chromosome, and each sub-frame is
    re-indexed with a (Chromosome, row position) MultiIndex. For every shared
    chromosome ``process_right_df`` produces the matching right annotation
    rows; all left sub-frames and all processed right frames are concatenated
    and joined on their index.

    Numbered progress markers and the chromosome being processed are printed.

    NOTE(review): ``pd.concat`` is handed an empty list when the inputs share
    no chromosome (or the left frame is empty), which pandas rejects.
    """
    assert how in ('inner', 'left')
    print(0)
    coord_cols = ['Chromosome', 'Start', 'End']
    left_order = coord_cols + left_df.columns.drop(coord_cols).to_list()
    right_order = coord_cols + right_df.columns.drop(coord_cols).to_list()
    left_df = left_df.loc[:, left_order]
    right_df = right_df.loc[:, right_order]

    left_bychrom = {key: subdf for key, subdf in left_df.groupby('Chromosome')}
    right_bychrom = {key: subdf for key, subdf in right_df.groupby('Chromosome')}
    # Give every per-chromosome frame a (Chromosome, row position) index.
    for grouped in (left_bychrom, right_bychrom):
        for subdf in grouped.values():
            subdf.index = pd.MultiIndex.from_arrays([subdf.Chromosome, range(subdf.shape[0])])

    print(1)
    left_chroms = set(left_bychrom.keys())
    right_chroms = set(right_bychrom.keys())
    common_chroms = left_chroms.intersection(right_chroms)

    print(2)
    processed_right = dict()
    for chrom in common_chroms:
        print(f'chrom {chrom}')
        processed_right[chrom] = process_right_df(left_bychrom[chrom], right_bychrom[chrom])
    print(3)
    left_all = pd.concat(list(left_bychrom.values()), axis=0)
    right_all = pd.concat(list(processed_right.values()), axis=0)
    joined = left_all.join(right_all, how=how)
    print(4)
    return joined

In [140]:
# Seed NumPy's global RNG so the synthetic intervals below are identical on
# every Restart & Run All; without it each run works on different data.
np.random.seed(0)

left_nrow = 10000
right_nrow = 10000

# Left intervals: chromosomes '1'..'5', Start in [0, 10000), End in [15000, 30000).
left_df = pd.DataFrame.from_dict({
    # 'Chromosome': pd.Categorical(np.random.choice([str(x) for x in range(1, 6)], left_nrow)),
    'Chromosome': pd.Series(np.random.choice([str(x) for x in range(1, 6)], left_nrow), dtype='string'),
    'Start': np.random.randint(0, 10000, left_nrow),
    'End': np.random.randint(15000, 30000, left_nrow),
    'val1': np.random.normal(0, 1, left_nrow),
    'val2': np.random.normal(0, 1, left_nrow),
})
# NOTE(review): left_gr passes int64=False while right_gr uses the
# constructor default -- confirm the difference is intentional.
left_gr = pr.PyRanges(left_df, int64=False)

# Right intervals: chromosomes '1'..'4', Start in [23000, 33000), End in [50000, 60000).
right_df = pd.DataFrame.from_dict({
    # 'Chromosome': pd.Categorical(np.random.choice([str(x) for x in range(1, 5)], right_nrow)),
    'Chromosome': pd.Series(np.random.choice([str(x) for x in range(1, 5)], right_nrow), dtype='string'),
    'Start': np.random.randint(23000, 33000, right_nrow),
    'End': np.random.randint(50000, 60000, right_nrow),
    'val3': np.random.normal(0, 1, right_nrow),
    'val4': np.random.normal(0, 1, right_nrow),
})
right_gr = pr.PyRanges(right_df)

In [132]:
def make_df_for_hash1(gr):
    """Rebuild a coordinate-sorted DataFrame from the columns of ``gr``.

    Every column is read as an attribute of ``gr`` and given a fresh 0..n-1
    index. Chromosome is cast to 'string', Start and End to 'int64'; columns
    from position 3 onward are taken as they are. The assembled frame is
    sorted by Chromosome, Start, End.
    """
    coord_dtypes = (('Chromosome', 'string'), ('Start', 'int64'), ('End', 'int64'))
    pieces = [
        getattr(gr, name).reset_index(drop=True).astype(dtype)
        for name, dtype in coord_dtypes
    ]
    pieces.extend(getattr(gr, name).reset_index(drop=True) for name in gr.columns[3:])

    assembled = pd.concat(pieces, axis=1)
    return assembled.sort_values(['Chromosome', 'Start', 'End'])

def make_df_for_hash2(gr):
    """Assemble all columns of ``gr`` into one DataFrame sorted by coordinates.

    Each name in ``gr.columns`` is read as an attribute of ``gr``; the pieces
    are concatenated column-wise and sorted by Chromosome, Start, End. The
    index is left as produced by the sort.
    """
    df = pd.concat([getattr(gr, x) for x in gr.columns], axis=1)
    df = df.sort_values(['Chromosome', 'Start', 'End'], inplace=False)
    return df


def hash_gr(gr):
    """Return one hash per row of ``gr`` (index excluded), rows sorted by coordinates."""
    return pd.util.hash_pandas_object(make_df_for_hash2(gr), index=False)


def hash_df(df):
    """Return one hash per row of ``df`` (index excluded), rows sorted by coordinates."""
    df_for_hash = df.sort_values(['Chromosome', 'Start', 'End'], inplace=False)
    return pd.util.hash_pandas_object(
        df_for_hash,
        index=False,
    )


def compare_df_gr(df, gr):
    """Return True when ``df`` and ``gr`` contain the same rows, in any order.

    The row hashes are compared as sorted plain arrays rather than as Series.
    A Series comparison raises when the two sides carry different index
    labels, even though the hashes are computed with ``index=False``, and it
    also depends on how rows with identical coordinates happen to be ordered.
    Frames with different row counts compare unequal.
    """
    gr_hashes = np.sort(hash_gr(gr).to_numpy())
    df_hashes = np.sort(hash_df(df).to_numpy())
    return bool(np.array_equal(gr_hashes, df_hashes))

In [141]:
left_gr.dtypes

Chromosome    category
Start            int64
End              int64
val1           float64
val2           float64
dtype: object

In [135]:
compare_df_gr(right_df, right_gr)

True

In [107]:
%%timeit
(hash_gr(left_gr) == hash_df(left_df))

13.3 ms ± 176 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [108]:
%%timeit
(hash_gr(left_gr).to_numpy() == hash_df(left_df).to_numpy())

13.7 ms ± 877 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [109]:
%%timeit
hash_compare.to_numpy().all()

2.56 µs ± 28.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [110]:
%%timeit
hash_compare.all()

13.1 µs ± 1.8 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [6]:
left_df

Unnamed: 0,Chromosome,Start,End,val1,val2
0,3,6511,25699,0.793044,3.058646
1,4,2935,25956,-2.103077,0.976404
2,4,3960,21301,-0.817666,-1.149480
3,3,5429,16045,-1.670927,-0.360027
4,3,3078,16239,-1.354032,-1.255546
...,...,...,...,...,...
9995,4,6642,27738,1.205940,0.622400
9996,5,384,21109,-0.115985,-0.021554
9997,4,9093,16895,0.283714,1.072938
9998,1,8641,27613,0.464857,-0.772163


In [5]:
right_df

Unnamed: 0,Chromosome,Start,End,val3,val4
0,1,23953,53422,-1.235985,0.882320
1,3,30700,58903,0.001686,0.105742
2,1,26887,50825,-0.458170,0.683960
3,3,31072,53210,1.670941,-0.245919
4,2,30533,51264,0.338132,0.934240
...,...,...,...,...,...
9995,3,25168,58696,2.005012,-0.533917
9996,4,25988,50547,-1.554049,0.773042
9997,1,32506,54788,1.211318,0.611327
9998,4,26356,50799,-1.158595,-0.556208


In [113]:
import handygenome

ModuleNotFoundError: No module named 'handygenome'

## Assign

In [5]:
import itertools
query.assign('row_index', lambda df: pd.Series(np.repeat(1, df.shape[0])))

Unnamed: 0,Chromosome,Start,End,Name,row_index
0,1,200,210,a,1
1,1,180,202,b,1
2,1,18,27,c,1
3,1,25,30,d,1


In [61]:
clustered = query.cluster()

In [62]:
clustered

Unnamed: 0,Chromosome,Start,End,Name,Cluster
0,1,18,27,c,1
1,1,25,30,d,1
2,1,180,202,b,2
3,1,200,210,a,2


In [63]:
clustered[clustered.Cluster == 1]

Unnamed: 0,Chromosome,Start,End,Name,Cluster
0,1,18,27,c,1
1,1,25,30,d,1


In [36]:
query.merge()

Unnamed: 0,Chromosome,Start,End
0,1,9,13
1,1,14,15


In [33]:
target.merge()

Unnamed: 0,Chromosome,Start,End
0,1,6,13
1,1,20,25


In [12]:
concat = pr.concat(x for x in [query, target])

In [13]:
concat

Unnamed: 0,Chromosome,Start,End,Name
0,1,9,12,
1,1,6,13,a
2,1,20,25,b


In [10]:
concat.sort()

Unnamed: 0,Chromosome,Start,End,Name
0,1,6,13,a
1,1,9,12,
2,1,20,25,b


# Initialize without from_dict

In [8]:
pr.PyRanges(
    chromosomes=['1', '2', '3'], 
    starts=[0, 0, 0], 
    ends=[100, 1000, 10000],
    names=['a', 'b', 'c']
)

TypeError: __init__() got an unexpected keyword argument 'names'

# window

In [331]:
hg19_gr = CHROMDICT_HG19.to_gr()

In [333]:
hkbins = hg19_gr.window(1_000_000)

In [336]:
hkbins['1'].df.iloc[-1, :]

Chromosome            1
Start         249000000
End           249250621
Name: 249, dtype: object

# concat

In [9]:
gr = pr.from_dict({"Chromosome": ["chr1", 'chr2', 'chr1'], "Start": [1, 4, 10],
                   "End": [3, 9, 11], "ID": ["a", "b", "c"]})
gr2 = pr.from_dict({"Chromosome": ['chr2', 'chr1', 'chr2'], "Start": [2, 2, 9], "End": [3, 9, 10]})

In [10]:
gr

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,1,3,a
1,chr1,10,11,c
2,chr2,4,9,b


In [11]:
gr2

Unnamed: 0,Chromosome,Start,End
0,chr1,2,9
1,chr2,2,3
2,chr2,9,10


In [12]:
pr.concat([gr, gr2])

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,1,3,a
1,chr1,10,11,c
2,chr1,2,9,
3,chr2,4,9,b
4,chr2,2,3,
5,chr2,9,10,


# subtract

In [6]:
gr = pr.from_dict({"Chromosome": ["chr1"] * 3, "Start": [1, 4, 10],
                   "End": [3, 9, 11], "ID": ["a", "b", "c"]})
gr2 = pr.from_dict({"Chromosome": ["chr1"] * 3, "Start": [2, 2, 9], "End": [3, 9, 10]})

In [7]:
gr

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,1,3,a
1,chr1,4,9,b
2,chr1,10,11,c


In [8]:
gr2

Unnamed: 0,Chromosome,Start,End
0,chr1,2,3
1,chr1,2,9
2,chr1,9,10


In [10]:
gr.subtract(gr2)

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,1,2,a
1,chr1,10,11,c


# intersect

In [2]:
gr = pr.from_dict({"Chromosome": ["chr1"] * 3, "Start": [1, 4, 10],
                   "End": [3, 9, 11], "ID": ["a", "b", "c"]})
gr2 = pr.from_dict({"Chromosome": ["chr1"] * 3, "Start": [2, 2, 9], "End": [3, 9, 10]})

In [3]:
gr

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,1,3,a
1,chr1,4,9,b
2,chr1,10,11,c


In [4]:
gr2

Unnamed: 0,Chromosome,Start,End
0,chr1,2,3
1,chr1,2,9
2,chr1,9,10


In [5]:
gr.intersect(gr2)

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,2,3,a
1,chr1,2,3,a
2,chr1,4,9,b


In [7]:
gr.intersect(gr2, how='first')

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,2,3,a
1,chr1,4,9,b


In [8]:
gr.intersect(gr2, how='containment')

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,4,9,b


# set_intersect

In [9]:
gr = pr.from_dict({"Chromosome": ["chr1"] * 3, "Start": [1, 4, 10],
                   "End": [3, 9, 11], "ID": ["a", "b", "c"]})
gr2 = pr.from_dict({"Chromosome": ["chr1"] * 3, "Start": [2, 2, 9], "End": [3, 9, 10]})

In [11]:
gr

Unnamed: 0,Chromosome,Start,End,ID
0,chr1,1,3,a
1,chr1,4,9,b
2,chr1,10,11,c


In [12]:
gr2

Unnamed: 0,Chromosome,Start,End
0,chr1,2,3
1,chr1,2,9
2,chr1,9,10


In [10]:
gr.set_intersect(gr2)

Unnamed: 0,Chromosome,Start,End
0,chr1,2,3
1,chr1,4,9


In [14]:
gr.set_intersect(gr2, how='containment').empty

False

In [19]:
import pandas as pd
import numpy as np

In [22]:
df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=['col1', 'col2', 'col3'])

In [25]:
df

Unnamed: 0,col1,col2,col3
0,0,1,2
1,3,4,5
2,6,7,8


In [26]:
df.loc[:, ['col2', 'col1', 'col3']]

Unnamed: 0,col2,col1,col3
0,1,0,2
1,4,3,5
2,7,6,8


# read_gff3

In [312]:
geneset = pr.read_gff3('/home/users/pjh/References/ensembl_data_files/modified_files/grch37/geneset_gff3_sorted.gz')

In [340]:
geneset

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,external_name,logic_name,...,havana_version,tag,transcript_id,constitutive,ensembl_end_phase,ensembl_phase,exon_id,rank,ccdsid,protein_id
0,1,cpg,biological_region,10468,11240,1.3e+03,.,.,oe %3D 0.79,cpg,...,,,,,,,,,,
1,1,Eponine,biological_region,10649,10657,0.999,+,.,,eponine,...,,,,,,,,,,
2,1,Eponine,biological_region,10655,10658,0.999,-,.,,eponine,...,,,,,,,,,,
3,1,Eponine,biological_region,10677,10687,0.999,+,.,,eponine,...,,,,,,,,,,
4,1,Eponine,biological_region,10681,10689,0.999,-,.,,eponine,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2626588,Y,havana,exon,28779491,28779578,.,-,.,,,...,,,,1,-1,-1,ENSE00001638296,2,,
2626589,Y,havana,exon,28780669,28780799,.,-,.,,,...,,,,1,-1,-1,ENSE00001797328,1,,
2626590,Y,havana,pseudogene,59001390,59001635,.,+,.,,havana,...,,,,,,,,,,
2626591,Y,havana,processed_pseudogene,59001390,59001635,.,+,.,,,...,1,basic,ENST00000431853,,,,,,,


In [None]:
geneset.columns

# join

In [22]:
# Two small interval sets for the join examples below.
# f1 has coordinates only (its Name column is commented out); f2 carries a Name.
# NOTE(review): some recorded outputs further down (how=None / 'left' /
# 'right') show a Name column coming from f1, so they appear to be from a run
# in which the Name line was active -- re-run to refresh them.
f1 = pr.from_dict({
    'Chromosome': ['chr1', 'chr1', 'chr1'], 
    'Start': [3, 8, 5],
    'End': [6, 9, 7], 
    # 'Name': ['interval1', 'interval3', 'interval2']
})
f2 = pr.from_dict({'Chromosome': ['chr1', 'chr1'], 'Start': [1, 6],
                   'End': [2, 7], 'Name': ['a', 'b']})

In [23]:
f1

Unnamed: 0,Chromosome,Start,End
0,chr1,3,6
1,chr1,8,9
2,chr1,5,7


In [24]:
f2

Unnamed: 0,Chromosome,Start,End,Name
0,chr1,1,2,a
1,chr1,6,7,b


In [25]:
f1.join(f2)

Unnamed: 0,Chromosome,Start,End,Start_b,End_b,Name
0,chr1,5,7,6,7,b


In [20]:
joined = f1.join(f2, how=None)
joined

Unnamed: 0,Chromosome,Start,End,Name,Start_b,End_b,Name_b
0,chr1,5,7,interval2,6,7,b


In [17]:
# Find which f2 row corresponds to the first joined row by matching the
# chromosome and the Start_b / End_b columns of the join output.
joined_row = next(joined.df.iterrows())[1]
matched_idx = None  # stays None when no f2 row matches
for idx, row in f2.df.iterrows():
    if (
        joined_row['Chromosome'] == row['Chromosome'] and
        joined_row['Start_b'] == row['Start'] and
        joined_row['End_b'] == row['End']
    ):
        matched_idx = idx
        break

# Print the match itself rather than the loop variable, which would hold the
# last visited index when nothing matched.
print(matched_idx)

1


In [6]:
f1.join(f2, how='left')

  scdf = scdf.append(sh)
  ocdf = ocdf.append(oh)


Unnamed: 0,Chromosome,Start,End,Name,Start_b,End_b,Name_b
0,chr1,5,7,interval2,6,7,b
1,chr1,3,6,interval1,-1,-1,-1
2,chr1,8,9,interval3,-1,-1,-1


In [7]:
f1.join(f2, how='right')

  scdf = scdf.append(sh)
  ocdf = ocdf.append(oh)


Unnamed: 0,Chromosome,Start,End,Name,Start_b,End_b,Name_b
0,chr1,5,7,interval2,6,7,b
1,chr1,-1,-1,-1,1,2,a


In [8]:
gr1 = pr.from_dict({
    'Chromosome': ['chr1', 'chr1'], 
    'Start': [4, 12],
    'End': [8, 16],
    'Val1': ['a', 'b']
})
gr2 = pr.from_dict({
    'Chromosome': ['chr1', 'chr1', 'chr1'], 
    'Start': [0, 7, 15],
    'End': [6, 13, 20],
    'Val2': ['A', 'B', 'C']
})

In [9]:
gr1

Unnamed: 0,Chromosome,Start,End,Val1
0,chr1,4,8,a
1,chr1,12,16,b


In [10]:
gr2

Unnamed: 0,Chromosome,Start,End,Val2
0,chr1,0,6,A
1,chr1,7,13,B
2,chr1,15,20,C


In [12]:
gr1.join(gr2)

Unnamed: 0,Chromosome,Start,End,Val1,Start_b,End_b,Val2
0,chr1,4,8,a,0,6,A
1,chr1,4,8,a,7,13,B
2,chr1,12,16,b,7,13,B
3,chr1,12,16,b,15,20,C


In [13]:
v = gr1.join(gr2).new_position('intersection')

In [14]:
v

Unnamed: 0,Chromosome,Start,End,Val1,Start_b,End_b,Val2
0,chr1,4,6,a,0,6,A
1,chr1,7,8,a,7,13,B
2,chr1,12,13,b,7,13,B
3,chr1,15,16,b,15,20,C


# new_position

In [29]:
# gr = pr.from_dict({'Chromosome': ['chr1', 'chr1', 'chr1'],
#                    'Start': [3, 8, 5], 'End': [6, 9, 7]})
# gr2 = pr.from_dict({'Chromosome': ['chr1', 'chr1'], 'Start': [1, 6],
#                     'End': [4, 7]})

j = pr.from_dict(
    {
        'Chromosome': ['chr1', 'chr1'],
        'Start': [3, 5],
        'End': [6, 7],
        'Start_b': [1, 6],
        'End_b': [4, 7],
    }
)

In [30]:
j

Unnamed: 0,Chromosome,Start,End,Start_b,End_b
0,chr1,3,6,1,4
1,chr1,5,7,6,7


In [32]:
j.copy().new_position('swap')

Unnamed: 0,Chromosome,Start,End,Start_b,End_b
0,chr1,1,4,3,6
1,chr1,6,7,5,7


In [33]:
j.copy().new_position('intersection')

Unnamed: 0,Chromosome,Start,End,Start_b,End_b
0,chr1,3,4,1,4
1,chr1,6,7,6,7


In [34]:
j.copy().new_position('union')

Unnamed: 0,Chromosome,Start,End,Start_b,End_b
0,chr1,1,6,1,4
1,chr1,5,7,6,7


# assign

In [3]:
gr = pr.from_dict({"Chromosome": [1, 1], "Start": [1, 2], "End": [3, 5], "Name": ["a", "b"]})

In [57]:
gr

Unnamed: 0,Chromosome,Start,End,Name
0,1,1,3,a
1,1,2,5,b


In [4]:
gr.assign('End', lambda gr: gr.End + 100)

Unnamed: 0,Chromosome,Start,End,Name
0,1,1,103,a
1,1,2,105,b


In [61]:
gr.assign('Chromosome', lambda gr: pd.Series(['chr' + x for x in gr.Chromosome]))

Unnamed: 0,Chromosome,Start,End,Name
0,chr1,1,3,a
1,chr1,2,5,b
