In [2]:
import pandas as pd

In [3]:
# Build a Series from a plain list; with no index supplied, pandas labels the
# entries 0, 1, 2.
data = pd.Series(data=['RNA', 'gene', 'protein'])

data

0        RNA
1       gene
2    protein
dtype: object

In [4]:
# Same values as before, this time with explicit string labels as the index.
data = pd.Series(data=['RNA', 'gene', 'protein'], index=['ENSG', 'ENSP', 'ENST'])

data

ENSG        RNA
ENSP       gene
ENST    protein
dtype: object

In [6]:
# A dict maps straight onto a Series: keys become the index, values the data.
map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}

# Sort the index explicitly. Whether pandas orders dict keys alphabetically or
# by insertion depends on the pandas/Python version, and the label slice used
# in a later cell (data['ENSP':]) only gives the displayed result when the
# index is sorted.
data = pd.Series(map_dict).sort_index()

data

ENSG       gene
ENSP    protein
ENST        RNA
dtype: object

In [7]:
# Dictionary-style lookup: fetch a single value by its index label.
data['ENSG']

'gene'

In [9]:
# Label-based slice: everything from 'ENSP' to the end of the index.
data['ENSP':]

ENSP    protein
ENST        RNA
dtype: object

In [10]:
# Two more dicts that share map_dict's keys.
count_dict = {'ENST': 3300, 'ENSG': 18345, 'ENSP': 12034}
groups_dict = {'ENST': 13, 'ENSG': 42, 'ENSP': 157}

# Each dict becomes one column; the dict keys are aligned into a shared row
# index. Both axes are sorted explicitly so that the row/column order -- which
# the positional (iloc) and label-slice (loc) examples in later cells rely on --
# does not depend on how the installed pandas version orders dict keys.
df = (
    pd.DataFrame({'mapping type': map_dict, 'counts': count_dict,
                  'classes': groups_dict})
    .sort_index(axis=0)
    .sort_index(axis=1)
)

df

Unnamed: 0,classes,counts,mapping type
ENSG,42,18345,gene
ENSP,157,12034,protein
ENST,13,3300,RNA


In [12]:
# A DataFrame carries two sets of labels: the row index and the column index.
df.index, df.columns

(Index(['ENSG', 'ENSP', 'ENST'], dtype='object'),
 Index(['classes', 'counts', 'mapping type'], dtype='object'))

In [15]:
# Dictionary-style access: select a single column as a Series.
df['counts']

ENSG    18345
ENSP    12034
ENST     3300
Name: counts, dtype: int64

In [16]:
# Attribute-style access returns the same column as df['counts'].
df.counts

ENSG    18345
ENSP    12034
ENST     3300
Name: counts, dtype: int64

In [17]:
# A column name that contains a space has to be selected with brackets.
df['mapping type']

ENSG       gene
ENSP    protein
ENST        RNA
Name: mapping type, dtype: object

In [None]:
# df.mapping type  <--- invalid: attribute access cannot express a column name that contains a space

In [20]:
# Derive a new column from two existing ones. Creating a column requires item
# assignment (df['name'] = ...): attribute assignment such as
# `df.averages = ...` only sets a plain Python attribute on the object and
# leaves the table itself unchanged.
df['averages'] = df['counts'] / df['classes']

df

Unnamed: 0,classes,counts,mapping type,averages
ENSG,42,18345,gene,436.785714
ENSP,157,12034,protein,76.649682
ENST,13,3300,RNA,253.846154


In [None]:
# Indexers: loc (label-based) and iloc (position-based)

In [22]:
# Integer labels make plain [] indexing confusing: the two lookups below do
# not follow the same rule.
data = pd.Series(list('abc'), index=[1, 3, 5])

print(data[1])    # single integer key: looked up as a label
print(data[1:3])  # integer slice: taken by position

a
3    b
5    c
dtype: object


In [24]:
# .loc always works on labels: label 1, then labels 1 through 3 (end included).
print(data.loc[1])
print(data.loc[1:3])

a
1    a
3    b
dtype: object


In [26]:
# .iloc always works on positions: position 1, then positions 1 and 2
# (end excluded).
print(data.iloc[1])
print(data.iloc[1:3])

b
3    b
5    c
dtype: object


In [27]:
# With an unsorted (but unique) index, a .loc slice runs from the row holding
# the start label to the row holding the stop label -- here that spans all
# three rows.
data = pd.Series(list('abc'), index=[1, 5, 3])
print(data.loc[1:3])


1    a
5    b
3    c
dtype: object


In [32]:
# A bare ':' slice through .loc selects every row, here with string labels.
data = pd.Series(list('abc'), index=['hello', 'python', 'pandas'])
print(data.loc[:])

hello     a
python    b
pandas    c
dtype: object


In [34]:
# Redisplay the frame before indexing into it.
df

Unnamed: 0,classes,counts,mapping type,averages
ENSG,42,18345,gene,436.785714
ENSP,157,12034,protein,76.649682
ENST,13,3300,RNA,253.846154


In [35]:
# The underlying data as a 2-D NumPy array; the mixed column types give
# dtype=object.
df.values

array([[42, 18345, 'gene', 436.7857142857143],
       [157, 12034, 'protein', 76.64968152866243],
       [13, 3300, 'RNA', 253.84615384615384]], dtype=object)

In [36]:
# Position-based selection: first two rows, first three columns.
df.iloc[:2, :3]

Unnamed: 0,classes,counts,mapping type
ENSG,42,18345,gene
ENSP,157,12034,protein


In [38]:
# Position-based selection: all rows, first three columns.
df.iloc[: , :3]

Unnamed: 0,classes,counts,mapping type
ENSG,42,18345,gene
ENSP,157,12034,protein
ENST,13,3300,RNA


In [39]:
# Label-based selection: rows up to and including 'ENSP', columns from
# 'counts' onward.
df.loc[:'ENSP', 'counts':]

Unnamed: 0,counts,mapping type,averages
ENSG,18345,gene,436.785714
ENSP,12034,protein,76.649682


In [41]:
# Plain [] with a single label selects a column.
df['counts']

ENSG    18345
ENSP    12034
ENST     3300
Name: counts, dtype: int64

In [43]:
# Plain [] with a slice selects rows (here by label), not columns.
df['ENSG':]

Unnamed: 0,classes,counts,mapping type,averages
ENSG,42,18345,gene,436.785714
ENSP,157,12034,protein,76.649682
ENST,13,3300,RNA,253.846154


In [44]:
# Comparing a column with a scalar gives a boolean Series (a mask).
df.counts > 5000

ENSG     True
ENSP     True
ENST    False
Name: counts, dtype: bool

In [45]:
# Boolean masking: keep only the rows where the mask is True.
df[ df.counts > 5000 ]

Unnamed: 0,classes,counts,mapping type,averages
ENSG,42,18345,gene,436.785714
ENSP,157,12034,protein,76.649682


In [53]:

# Combine two masks element by element with & (not `and`). Each comparison
# needs its own parentheses because & binds more tightly than > and <.
df[ (df.counts > 5000 ) & (df.averages< 400)]

Unnamed: 0,classes,counts,mapping type,averages
ENSP,157,12034,protein,76.649682


In [54]:
# Python's `and` evaluates the truth of each operand as a whole and yields a
# single result.
True and False

False

In [61]:
# `and` does not work element by element: a non-empty tuple is truthy, so the
# result is simply the second operand.
(True, True, False) and (True, True)

(True, True)

In [57]:
# & is applied element by element, producing one boolean per row.
(df.counts > 5000 ) & (df.averages < 400)

ENSG    False
ENSP     True
ENST    False
dtype: bool

In [67]:
import numpy as np

# Python's built-in null object; as the last expression of a cell it displays
# nothing.
None

In [None]:
# NumPy's floating-point "not a number" value.
np.nan

In [68]:
# An array containing None cannot use a native numeric dtype, so its elements
# are stored as generic Python objects.
val = np.array([1, None, 3, 4], dtype=object)

val

array([1, None, 3, 4], dtype=object)

In [69]:
# Summing an object array falls back to Python's + operator, and int + None is
# undefined -- this raises TypeError.
val.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [70]:
# np.nan is itself a float, so an array holding it is a regular float64 array.
val2 = np.array([1.0, np.nan, 3.0, 4.0])

val2

array([  1.,  nan,   3.,   4.])

In [71]:
# NaN propagates through arithmetic: the sum is NaN.
val2.sum()

nan

In [74]:
# NaN never compares equal to anything, itself included, so this mask is all
# False and selects nothing.
val2[val2 == np.nan]

array([], dtype=float64)

In [77]:
# np.isnan is the way to locate NaN entries.
val2 [ np.isnan(val2)]

array([ nan])

In [78]:
# The array keeps a native float64 dtype despite holding NaN.
val2.dtype

dtype('float64')

In [81]:
# The ordinary reductions all return NaN when any element is NaN.
val2.sum(), val2.min(), val2.max()

(nan, nan, nan)

In [84]:
# The nan-prefixed variants skip NaN entries instead of propagating them.
np.nansum(val2), np.nanmin(val2), np.nanmax(val2), np.nanmean(val2)

(8.0, 1.0, 4.0, 2.6666666666666665)

In [85]:
# Redisplay the frame before introducing missing values.
df

Unnamed: 0,classes,counts,mapping type,averages
ENSG,42,18345,gene,436.785714
ENSP,157,12034,protein,76.649682
ENST,13,3300,RNA,253.846154


In [86]:
# Mark a few cells as missing to set up the NaN-handling examples that follow.
# NaN is a float, so the two integer columns are converted to float64 up front:
# assigning NaN straight into an int64 column depends on an implicit
# int -> float upcast that pandas has deprecated (PDEP-6).
df = df.astype({'counts': 'float64', 'classes': 'float64'})

df.loc['ENSG', 'counts'] = np.nan
df.loc['ENSG', 'classes'] = np.nan
df.loc['ENST', 'classes'] = np.nan

df

Unnamed: 0,classes,counts,mapping type,averages
ENSG,,,gene,436.785714
ENSP,157.0,12034.0,protein,76.649682
ENST,,3300.0,RNA,253.846154


In [87]:
# Drop: by default dropna() removes every row that contains at least one NaN.
df.dropna()

Unnamed: 0,classes,counts,mapping type,averages
ENSP,157.0,12034.0,protein,76.649682


In [88]:
# axis=1 drops columns instead: every column containing a NaN is removed.
df.dropna(axis=1)

Unnamed: 0,mapping type,averages
ENSG,gene,436.785714
ENSP,protein,76.649682
ENST,RNA,253.846154


In [89]:
# thresh=2 keeps a column only if it holds at least two non-NaN values.
df.dropna(axis=1, thresh=2)

Unnamed: 0,counts,mapping type,averages
ENSG,,gene,436.785714
ENSP,12034.0,protein,76.649682
ENST,3300.0,RNA,253.846154


In [91]:
# Fill each NaN with the mean of its own column. numeric_only=True restricts
# the mean to the numeric columns; without it, recent pandas versions raise a
# TypeError when they reach the string column 'mapping type'.
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,classes,counts,mapping type,averages
ENSG,157.0,7667.0,gene,436.785714
ENSP,157.0,12034.0,protein,76.649682
ENST,157.0,3300.0,RNA,253.846154


In [92]:
# fillna returned a new frame; df itself still contains the NaNs.
df

Unnamed: 0,classes,counts,mapping type,averages
ENSG,,,gene,436.785714
ENSP,157.0,12034.0,protein,76.649682
ENST,,3300.0,RNA,253.846154


In [93]:
# Column means, skipping NaN entries. numeric_only=True leaves out the string
# column 'mapping type', which recent pandas versions would otherwise reject
# with a TypeError.
df.mean(numeric_only=True)

classes      157.000000
counts      7667.000000
averages     255.760517
dtype: float64

Unnamed: 0,classes,counts,averages
classes,,,
counts,,1.0,-1.0
averages,,-1.0,1.0
