In [1]:
import numpy as np
import pandas as pd

In [3]:
# numpy => manipulation of vectors, matrices and tensors:
# a linear-algebra library

In [4]:
# pandas => manipulation of tabular (structured) data:
# columns have a title / meaning, and an index makes it possible
# to identify each sample
# DataFrame: vertical axis (rows): the different samples
# horizontal axis (columns): the different columns, features, characteristics, attributes

In [5]:
# Build a Series from a plain list. No index is given, so pandas assigns the
# default integer RangeIndex (0, 1, 2, 3).
data = pd.Series([0.25, 0.5, 0.75, 1.0])

In [6]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [7]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
data[1]

0.5

In [9]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [10]:
# Same values, this time with explicit string labels as the index.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [12]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [14]:
data['b']

0.5

In [15]:
# The index does not have to be sorted or contiguous: arbitrary integer labels.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])

In [16]:
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [17]:
# With an integer index, an integer key is a *label* lookup: 2 is the label of
# the first entry here, so this returns 0.25 rather than the value at position 2.
data[2]

0.25

In [20]:
# Wrap a NumPy array in a Series and label its entries with string keys.
d = np.array([3, 4, 9])
data = pd.Series(data=d, index=list('290'))

In [21]:
data

2    3
9    4
0    9
dtype: int64

In [22]:
# Building a Series from a dict uses the keys as index labels, kept in the
# dict's insertion order (they are not sorted).
pd.Series({2: 'a', 1: 'b', 3:'c'})

2    a
1    b
3    c
dtype: object

In [23]:
# Region areas: start from a plain dict, then turn it into a Series whose
# index is made of the dict keys.
area_dict = dict(idf=87676, paca=76352, bretagne=89656)
area = pd.Series(data=area_dict)

In [24]:
area

idf         87676
paca        76352
bretagne    89656
dtype: int64

In [27]:
# Region populations, keyed by the same region names as the area Series.
pop_dict = dict(idf=15000000, paca=9000000, bretagne=5000000)
population = pd.Series(data=pop_dict)

In [29]:
# Combine the two Series into a DataFrame: each keyword becomes a column name
# and the rows are aligned on the Series' index labels.
regions = pd.DataFrame(dict(population=population, area=area))

In [30]:
regions

Unnamed: 0,population,area
idf,15000000,87676
paca,9000000,76352
bretagne,5000000,89656


In [31]:
regions.index

Index(['idf', 'paca', 'bretagne'], dtype='object')

In [32]:
regions.columns

Index(['population', 'area'], dtype='object')

In [33]:
# A list of dicts yields one row per dict; keys that are absent from a given
# dict show up as NaN in that row.
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [34]:
area.index

Index(['idf', 'paca', 'bretagne'], dtype='object')

In [35]:
area.index[1]

'paca'

In [36]:
area.index[1:3]

Index(['paca', 'bretagne'], dtype='object')

In [38]:
# An Index exposes array-like attributes: size, shape, ndim and dtype.
print(area.index.size, area.index.shape, area.index.ndim, area.index.dtype)

3 (3,) 1 object


In [39]:
# An Index is immutable: item assignment raises TypeError. The error is the
# point of this cell, so it is caught and printed instead of being left to
# propagate (an uncaught exception would stop a "Restart & Run All").
try:
    area.index[1] = 0
except TypeError as e:
    print('TypeError:', e)

TypeError: Index does not support mutable operations

In [40]:
# Two integer indexes used to demonstrate set operations on Index objects.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
# Intersection: labels present in both indexes. The `&` operator used to be an
# alias for this, but that alias was deprecated and, from pandas 2.0 on, `&`
# is an element-wise logical operation. The explicit method works everywhere.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [41]:
# Union: labels present in either index. Use the explicit method; the `|`
# operator is no longer a set union on Index objects in pandas 2.0+.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [42]:
# Symmetric difference: labels present in exactly one of the two indexes.
# Use the explicit method; the `^` operator is no longer a set operation on
# Index objects in pandas 2.0+.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

In [43]:
# Selecting data in Series / DataFrames

In [44]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [45]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [46]:
data['b']

0.5

In [47]:
# `in` tests membership among the index labels (dict-like behaviour).
'a' in data

True

In [48]:
'e' in data

False

In [49]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [50]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [51]:
# Assigning to a label that does not exist yet adds a new entry to the Series,
# just like adding a key to a dict.
data['e'] = 1.25

In [52]:
'e' in data

True

In [53]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [54]:
# Slicing by label: the end label ('c') is included in the result.
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [55]:
# Slicing with integers on this string-labelled Series is positional, and the
# stop position is excluded (only 'a' and 'b' are returned).
data[0:2]

a    0.25
b    0.50
dtype: float64

In [56]:
data[data > 0.3]

b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [57]:
data > 0.3

a    False
b     True
c     True
d     True
e     True
dtype: bool

In [58]:
# Boolean masking with two conditions combined element-wise by `&`. Each
# comparison needs its own parentheses because `&` binds tighter than `>` / `<`.
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [59]:
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [60]:
# loc, iloc (and the legacy ix indexer, which no longer exists in recent pandas)
# selection on DataFrames
# operations on DataFrames
# handling missing values

In [3]:
# Two Series sharing the same state-name index, combined into a two-column
# DataFrame ('area' and 'pop'); the frame is displayed as the cell output.
area = pd.Series(
    [423967, 695662, 141297, 170312, 149995],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)
pop = pd.Series(
    [38332521, 26448193, 19651127, 19552860, 12882135],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)
data = pd.DataFrame(dict(area=area, pop=pop))
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [4]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967,695662,141297,170312,149995
pop,38332521,26448193,19651127,19552860,12882135


In [5]:
data.values

array([[  423967, 38332521],
       [  695662, 26448193],
       [  141297, 19651127],
       [  170312, 19552860],
       [  149995, 12882135]])

In [6]:
data.values[0]

array([  423967, 38332521])

In [7]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [9]:
# iloc is purely positional: first three rows, first column (stops excluded).
data.iloc[:3, :1]

Unnamed: 0,area
California,423967
Texas,695662
New York,141297


In [12]:
# loc is label-based: both slices run up to AND including their end label.
data.loc[:'New York', :'area']

Unnamed: 0,area
California,423967
Texas,695662
New York,141297


In [13]:
# The hybrid `.ix` indexer was removed in pandas 1.0, so this access raises
# AttributeError. Catch and print the error so the cell documents the removal
# without aborting a "Restart & Run All".
# Modern equivalent of this mixed positional/label selection:
#     data.iloc[:3].loc[:, :'pop']
try:
    data.ix[:3, :'pop']
except AttributeError as e:
    print('AttributeError:', e)

AttributeError: 'DataFrame' object has no attribute 'ix'

In [14]:
# Derived column: element-wise ratio of the 'pop' and 'area' columns, stored
# in place under the new key 'density'.
data['density'] = data['pop'].div(data['area'])

In [16]:
# Keep only the rows whose density exceeds 100 (boolean row mask).
data.loc[data['density'] > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [18]:
# Series of 4 random integers drawn from [0, 10).
# NOTE(review): no random seed is set, so the values differ on every run.
ser = pd.Series(np.random.randint(0, 10, 4))

In [19]:
ser

0    6
1    2
2    4
3    4
dtype: int64

In [20]:
# 3x4 DataFrame of random integers drawn from [0, 10), columns named a..d.
# NOTE(review): no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randint(0, 10, (3, 4)),
                  columns=['a', 'b', 'c', 'd'])

In [21]:
df

Unnamed: 0,a,b,c,d
0,5,9,3,2
1,5,1,8,0
2,5,8,8,3


In [22]:
np.exp(ser)

0    403.428793
1      7.389056
2     54.598150
3     54.598150
dtype: float64

In [23]:
# NumPy ufuncs work element-wise on pandas objects and the result keeps the
# row index and column labels of the input.
np.sin(df * np.pi / 4)

Unnamed: 0,a,b,c,d
0,-0.707107,0.7071068,0.7071068,1.0
1,-0.707107,0.7071068,-2.449294e-16,0.0
2,-0.707107,-2.449294e-16,-2.449294e-16,0.707107


In [24]:
# element-wise operations

In [25]:
# Two Series whose indexes only partly overlap: 'Colorado' exists only in
# `area`, 'California' only in `pop`.
area = pd.Series(
    [695662, 141297, 170312, 149995, 872636],
    index=['Texas', 'New York', 'Florida', 'Illinois', 'Colorado'],
)
pop = pd.Series(
    [38332521, 26448193, 19651127, 19552860, 12882135],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)

In [26]:
# Binary operations align both operands on their index labels: the result is
# indexed by the union of the labels, with NaN wherever a label is missing
# from one side (California, Colorado).
pop / area

California           NaN
Colorado             NaN
Florida       114.806121
Illinois       85.883763
New York      139.076746
Texas          38.018740
dtype: float64

In [27]:
# Union of the labels of both Series. Use the explicit method: the `|`
# operator on Index objects is no longer a set union in pandas 2.0+.
area.index.union(pop.index)

Index(['California', 'Colorado', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [29]:
# Divide with index alignment, then replace the NaN produced by the
# non-matching labels with 0.
pop.div(area).fillna(0)

California      0.000000
Colorado        0.000000
Florida       114.806121
Illinois       85.883763
New York      139.076746
Texas          38.018740
dtype: float64

In [30]:
# + => add()
# - => sub(), subtract()
# * => mul(), multiply()
# / => div(), divide()
# // => floordiv()
# % => mod()
# ** => pow()

In [31]:
# broadcasting

In [34]:
# 3x4 array of random integers drawn from [0, 10).
# NOTE(review): no random seed is set, so the values differ on every run.
A = np.random.randint(10, size=(3, 4))

In [36]:
A

array([[1, 3, 6, 0],
       [5, 6, 5, 7],
       [8, 7, 5, 9]])

In [37]:
A - 1

array([[ 0,  2,  5, -1],
       [ 4,  5,  4,  6],
       [ 7,  6,  4,  8]])

In [38]:
A

array([[1, 3, 6, 0],
       [5, 6, 5, 7],
       [8, 7, 5, 9]])

In [39]:
A[0]

array([1, 3, 6, 0])

In [40]:
# Broadcasting: the 1-D first row A[0] is subtracted from every row of A.
A - A[0]

array([[ 0,  0,  0,  0],
       [ 4,  3, -1,  7],
       [ 7,  4, -1,  9]])

In [42]:
# A[:, 0] (A's first column, length 3) matches the last axis of A.T (4x3), so
# it is subtracted from every row of the transpose, i.e. from every column of A.
A.T - A[:, 0]

array([[ 0,  0,  0],
       [ 2,  1, -1],
       [ 5,  0, -3],
       [-1,  2,  1]])

In [44]:
# Wrap the array in a DataFrame with the column labels Q, R, S, T.
df = pd.DataFrame(data=A, columns=['Q', 'R', 'S', 'T'])

In [45]:
# Same row-wise broadcasting in pandas: the Series df.iloc[0] is matched
# against the column labels and subtracted from every row.
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,4,3,-1,7
2,7,4,-1,9


In [46]:
df

Unnamed: 0,Q,R,S,T
0,1,3,6,0
1,5,6,5,7
2,8,7,5,9


In [49]:
# To operate column-wise, use the method form with axis=0: column R is matched
# against the row index and subtracted from every column.
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,-2,0,3,-3
1,-1,0,-1,1
2,1,0,-2,2


In [51]:
# Mixed int / NaN / str / None values give an object-dtype Series; both np.nan
# and None are treated as missing by isnull() / notnull() / dropna().
data = pd.Series([1, np.nan, 'hello', None])

In [52]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [53]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [54]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [55]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [56]:
data.dropna()

0        1
2    hello
dtype: object

In [57]:
data.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

In [58]:
# Forward fill: each missing value takes the last valid value that precedes it.
# `fillna(method='ffill')` is deprecated since pandas 2.1; `ffill()` is the
# direct equivalent and is also available in older versions.
data.ffill()

0        1
1        1
2    hello
3    hello
dtype: object

In [59]:
# Backward fill: each missing value takes the next valid value that follows it;
# a trailing gap with nothing after it stays missing.
# `fillna(method='bfill')` is deprecated since pandas 2.1; `bfill()` is the
# direct equivalent and is also available in older versions.
data.bfill()

0        1
1    hello
2    hello
3     None
dtype: object

In [60]:
# concatenation

In [61]:
# Three small lists to be joined end to end with np.concatenate.
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]

In [62]:
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [64]:
# 2x2 nested list, reused below for 2-D concatenation examples.
x = [[1, 2], [3, 4]]

In [65]:
# axis=1 joins the two 2x2 blocks side by side (result is 2x4).
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [66]:
np.concatenate([x, x], axis=0)

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [67]:
# Two Series with disjoint integer labels, stacked one after the other;
# pd.concat keeps each original index label.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
pd.concat(objs=[ser1, ser2], axis=0)

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [68]:
df = pd.DataFrame(x)

In [70]:
# DataFrame built from the nested list x; rows and columns get default
# integer labels.
dfx = pd.DataFrame(x)

In [73]:
# axis=1 puts the frames side by side; the column labels 0 and 1 therefore
# each appear twice in the result.
pd.concat([dfx, dfx], axis=1)

Unnamed: 0,0,1,0.1,1.1
0,1,2,1,2
1,3,4,3,4


In [74]:
pd.concat([dfx, dfx])

Unnamed: 0,0,1
0,1,2
1,3,4
0,1,2
1,3,4


In [75]:
# verify_integrity=True makes concat raise ValueError when the resulting index
# would contain duplicates (both frames carry the row labels 0 and 1).
# NOTE(review): this cell raises and stops a "Run All"; a guarded try/except
# version of the same call appears a few cells further down.
pd.concat([dfx, dfx], verify_integrity=True)

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')

In [76]:
# ignore_index=True discards the original row labels and numbers the result
# 0..n-1, which avoids the duplicated labels.
pd.concat([dfx, dfx], ignore_index=True)

Unnamed: 0,0,1
0,1,2
1,3,4
2,1,2
3,3,4


In [77]:
# Guarded demonstration: the duplicate-index ValueError is caught and its
# message printed, so the cell itself completes without failing.
try:
    pd.concat(objs=[dfx, dfx], verify_integrity=True)
except ValueError as err:
    print('ValueError:', err)

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [78]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat performs the same row-wise stacking and keeps the original
# row labels (0, 1, 0, 1).
pd.concat([dfx, dfx])

Unnamed: 0,0,1
0,1,2
1,3,4
0,1,2
1,3,4


In [79]:
# Two 2x2 nested lists used as raw data for the DataFrames below.
A, B = [[1, 2], [3, 4]], [[5, 6], [7, 8]]

In [84]:
# Frame with row labels 1, 2 and columns 'a', 'b'.
dfa = pd.DataFrame(data=A, index=[1, 2], columns=list('ab'))

In [85]:
dfa

Unnamed: 0,a,b
1,1,2
2,3,4


In [86]:
# Frame with row labels 3, 4 and columns 'b', 'c'; only column 'b' is shared
# with dfa.
dfb = pd.DataFrame(data=B, index=[3, 4], columns=list('bc'))

In [88]:
dfb

Unnamed: 0,b,c
3,5,6
4,7,8


In [87]:
# Default behaviour: the result has the union of the columns (a, b, c) and
# cells a frame has no data for are filled with NaN.
pd.concat([dfa, dfb])

Unnamed: 0,a,b,c
1,1.0,2,
2,3.0,4,
3,,5,6.0
4,,7,8.0


In [89]:
# outer join
# inner join

In [90]:
# join='inner' keeps only the columns present in every frame (here just 'b').
pd.concat([dfa, dfb], join='inner')

Unnamed: 0,b
1,2
2,4
3,5
4,7


In [91]:
pd.concat([dfa, dfb], join='outer')

Unnamed: 0,a,b,c
1,1.0,2,
2,3.0,4,
3,,5,6.0
4,,7,8.0


In [93]:
# The call must be closed with its parenthesis to be valid Python.
# A list passed positionally to reindex() targets the ROW index; no row is
# labelled 'a' or 'b', so every cell of the result is NaN.
# NOTE(review): if the goal was to keep only columns 'a' and 'b', use
# reindex(columns=['a', 'b']) instead — confirm intent.
pd.concat([dfa, dfb]).reindex(['a', 'b'])

Unnamed: 0,a,b,c
a,,,
b,,,


In [108]:
# Two small frames sharing a 'group' column, used for the merge example.
# NOTE(review): df2's 'group' column contains 'Bob', 'Jake' and 'Sue', which
# look like employee names rather than groups — confirm this is intended.
df1 = pd.DataFrame(dict(
    employee=['Bob', 'Jake', 'Lisa', 'Sue'],
    group=['Accounting', 'Engineering', 'Engineering', 'HR'],
))
df2 = pd.DataFrame(dict(
    group=['Accounting', 'Bob', 'Jake', 'Sue'],
    hire_date=[2004, 2008, 2012, 2014],
))

In [109]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [110]:
df2

Unnamed: 0,group,hire_date
0,Accounting,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [111]:
# With no `on=` argument, merge joins on the column(s) the two frames have in
# common (here 'group'). Only 'Accounting' appears in both, so the result has
# a single row.
pd.merge(df1, df2)

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2004


Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2004


In [119]:
# 1000 random integers drawn from [0, 100), stored as a single column labelled 0.
# NOTE(review): no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randint(0, 100, 1000))

In [122]:
df[0].sort_values(ascending=False)

398    99
732    99
930    99
12     99
507    99
       ..
64      0
589     0
492     0
649     0
262     0
Name: 0, Length: 1000, dtype: int64

In [123]:
# Number of occurrences of each distinct value, most frequent first.
df[0].value_counts()

55    18
68    16
71    16
51    16
9     15
      ..
72     6
0      6
21     5
76     5
31     5
Name: 0, Length: 100, dtype: int64

In [8]:
# Two Series whose integer labels only partly overlap (2 and 3 are shared).
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 2, 3])

In [9]:
df = pd.DataFrame(ser1)

In [10]:
df

Unnamed: 0,0
1,A
2,B
3,C


In [11]:
# Assigning a Series as a new column aligns it on df's row index: label 1 has
# no counterpart in ser2 and gets NaN, while ser2's label 4 is not in df and
# is dropped.
df['t'] = ser2

In [12]:
df

Unnamed: 0,0,t
1,A,
2,B,E
3,C,F
