In [2]:
import pandas as pd
import numpy as np
import random

In [15]:
# Record the pandas version that produced the outputs in this notebook.
# `pd.__version__` reads it straight from the pandas module imported at the
# top of the notebook, so the deprecated `pkg_resources` import is not needed.
pd.__version__

'0.18.1'

# Bug scenarios with ordered categories:

The four scenarios:
 * Default (`sort = True`): No error
 * `chromosome 1` filtered out and `sort=True`: No error 
 * `chromosome 1` filtered out and `sort=False`: **Error**
 * `sort = False`: **Error**


In [3]:
# Build the demo frame: 100 rows of random integers in columns A-D plus a
# 'chromosomes' label column stored as an *ordered* categorical whose order
# follows the `chromosomes` list (1 < 2 < ... < 22 < X < Y).
#
# Seed BOTH generators: `random` drives the chromosome labels, NumPy drives
# the A-D values. Seeding only `random` left the integer columns different on
# every run (output saved before this change shows unseeded A-D values).
random.seed(88)
np.random.seed(88)
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'))
chromosomes = [str(x) for x in range(1, 23)] + ["X", "Y"]
# The labels are sorted as plain strings before insertion, so rows sharing a
# label are contiguous.
df.insert(0, 'chromosomes', sorted(random.choice(chromosomes) for _ in range(100)))
# pd.Categorical(values, categories=..., ordered=True) yields the same ordered
# categorical as astype('category', categories=..., ordered=True), but the
# keyword form of astype was deprecated and removed in later pandas releases;
# the constructor form works on 0.18.1 and on current versions alike.
df['chromosomes'] = pd.Categorical(df['chromosomes'], categories=chromosomes, ordered=True)
df.head()

Unnamed: 0,chromosomes,A,B,C,D
0,1,73,35,53,41
1,1,86,6,0,72
2,1,66,68,79,50
3,1,41,75,11,48
4,1,68,96,95,69


In [4]:
# Peek at the label column to confirm its categorical dtype and category order.
df['chromosomes'].head()

0    1
1    1
2    1
3    1
4    1
Name: chromosomes, dtype: category
Categories (24, object): [1 < 2 < 3 < 4 ... 21 < 22 < X < Y]

In [5]:
# Ordered categories, chromosome '1' rows removed, sort=True:
# print only the first group the iteration yields.
without_chr1 = df.query("chromosomes != '1'")
for chrom, group in without_chr1.groupby('chromosomes', sort=True):
    print(chrom, group.chromosomes.cat.categories, group.shape)
    break

1 Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'],
      dtype='object') (0, 5)


In [6]:
# Ordered categories, full frame, sort=True (the groupby default):
# print only the first group the iteration yields.
by_chromosome = df.groupby('chromosomes', sort=True)
for chrom, group in by_chromosome:
    print(chrom, group.chromosomes.cat.categories, group.shape)
    break

1 Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'],
      dtype='object') (7, 5)


In [7]:
# Ordered categories, chromosome '1' rows removed, sort=False.
# On pandas 0.18.1 this statement raises ValueError before anything is
# printed. The exception is the point of the demonstration, so catch it and
# print it in "Type: message" form: the failure stays visible, but the cell no
# longer aborts a Restart & Run All and the cells after it still execute.
try:
    for c, g in df.query("chromosomes != '1'").groupby('chromosomes', sort=False):
        print(c, g.chromosomes.cat.categories, g.shape)
except ValueError as err:
    print('{}: {}'.format(type(err).__name__, err))


ValueError: items in new_categories are not the same as in old categories

In [8]:
# Ordered categories, full frame, sort=False.
# On pandas 0.18.1 this statement raises ValueError before the first group is
# printed. The exception is the point of the demonstration, so catch it and
# print it in "Type: message" form: the failure stays visible, but the cell no
# longer aborts a Restart & Run All and the cells after it still execute.
try:
    for c, g in df.groupby('chromosomes', sort=False):
        print(c, g.chromosomes.cat.categories, g.shape)
        break
except ValueError as err:
    print('{}: {}'.format(type(err).__name__, err))

ValueError: items in new_categories are not the same as in old categories

# Bug scenarios without ordered categories:


The four scenarios:
 * Default (`sort = True`): No error
 * `chromosome 1` filtered out and `sort=True`: No error 
 * `sort = False`: No error
 * `chromosome 1` filtered out and `sort=False`: **Error**


In [9]:
# Rebuild the demo frame, this time with a plain (unordered) categorical:
# no explicit category list is passed, so pandas infers the categories from
# the labels actually present in the column.
#
# Seed BOTH generators: `random` drives the chromosome labels, NumPy drives
# the A-D values. Seeding only `random` left the integer columns different on
# every run (output saved before this change shows unseeded A-D values).
random.seed(88)
np.random.seed(88)
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'))
chromosomes = [str(x) for x in range(1, 23)] + ["X", "Y"]
# The labels are sorted as plain strings before insertion, so rows sharing a
# label are contiguous.
df.insert(0, 'chromosomes', sorted(random.choice(chromosomes) for _ in range(100)))
df['chromosomes'] = df['chromosomes'].astype('category')
df.head()

Unnamed: 0,chromosomes,A,B,C,D
0,1,45,68,32,4
1,1,5,42,89,20
2,1,83,0,82,78
3,1,16,66,48,90
4,1,95,91,11,36


In [10]:
# Peek at the label column to confirm it is now an unordered categorical.
df['chromosomes'].head()

0    1
1    1
2    1
3    1
4    1
Name: chromosomes, dtype: category
Categories (22, object): [1, 10, 11, 12, ..., 6, 7, 8, X]

In [11]:
# Unordered categories, chromosome '1' rows removed, sort=True:
# print only the first group the iteration yields.
without_chr1 = df.query("chromosomes != '1'")
for chrom, group in without_chr1.groupby('chromosomes', sort=True):
    print(chrom, group.chromosomes.cat.categories, group.shape)
    break

1 Index(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2',
       '20', '21', '22', '3', '4', '5', '6', '7', '8', 'X'],
      dtype='object') (0, 5)


In [12]:
# Unordered categories, full frame, sort=True (the groupby default):
# print only the first group the iteration yields.
by_chromosome = df.groupby('chromosomes', sort=True)
for chrom, group in by_chromosome:
    print(chrom, group.chromosomes.cat.categories, group.shape)
    break

1 Index(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2',
       '20', '21', '22', '3', '4', '5', '6', '7', '8', 'X'],
      dtype='object') (7, 5)


In [13]:
# Unordered categories, full frame, sort=False:
# print only the first group the iteration yields.
by_chromosome_unsorted = df.groupby('chromosomes', sort=False)
for chrom, group in by_chromosome_unsorted:
    print(chrom, group.chromosomes.cat.categories, group.shape)
    break

1 Index(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2',
       '20', '21', '22', '3', '4', '5', '6', '7', '8', 'X'],
      dtype='object') (7, 5)


In [14]:
# Unordered categories, chromosome '1' rows removed, sort=False.
# On pandas 0.18.1 this statement raises ValueError before anything is
# printed. The exception is the point of the demonstration, so catch it and
# print it in "Type: message" form: the failure stays visible, but the cell no
# longer aborts a Restart & Run All.
try:
    for c, g in df.query("chromosomes != '1'").groupby('chromosomes', sort=False):
        print(c, g.chromosomes.cat.categories, g.shape)
except ValueError as err:
    print('{}: {}'.format(type(err).__name__, err))


ValueError: items in new_categories are not the same as in old categories