In [1]:
import numpy as np
import pandas as pd


In [2]:
#on peut créer une Series à partie d'une list
data = pd.Series([0.25,0.5,0.75,1.0])
print ("data ressemble à un tableau Numpy: ", data)

data ressemble à un tableau Numpy:  0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [3]:
#spécifier des indices à la main
data = pd.Series([0.25,0.5,0.75,1.0],
        index=['a','b','c','d'])
print("data ressemble à un dict en Python:", data)
print(data['b'])

data ressemble à un dict en Python: a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5


In [4]:
#créer une serie à partie d'une dict
population_dict = {'California': 38332521,
                  'Texas':26448193,
                   'New York':19651127,
                   'Florida':19552860,
                   'Illinois': 12882135}
area_dict={'California':423967,
          'Texas':1412297,
          'New York':695662,
          'Florida': 170312,
          'Illinois':149995}
population = pd.Series(population_dict)
area = pd.Series(area_dict)
print(population)

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64


In [5]:
print(population['California':'Florida'])

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64


In [6]:
# A partir d'une Series
df = pd.DataFrame(population,columns=['population'])
print(df)

            population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135


In [7]:
#à partir d'une list de dict
data = [{'a':i,'b':2*i}
           for i in range(3)]
df = pd.DataFrame(data)
print(df)

   a  b
0  0  0
1  1  2
2  2  4


In [8]:
# a parir de plusieurs Series
df = pd.DataFrame({'population':population,
                  'area': area})
print(df)

            population     area
California    38332521   423967
Texas         26448193  1412297
New York      19651127   695662
Florida       19552860   170312
Illinois      12882135   149995


In [9]:
# a partir d'un tableau Numpy de dimension 2
df = pd.DataFrame(np.random.rand(3,2),
            columns=['foo','bar'],
            index=['a','b','c'])
print(df)

        foo       bar
a  0.589279  0.712122
b  0.708951  0.065530
c  0.467308  0.002136


In [10]:
# Une fonction pour générer facilement des DataFrame.
#utile dans la suite de ce chapitre.

def make_df (cols,ind):
    """Crée rapidement des DataFrame"""
    data = {c:[str(c)+str(i) for i in ind]
           for c in cols}
    return pd.DataFrame(data,ind)


In [11]:
#exemple
make_df('ABC',range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


### projection et restriction

In [12]:
# projection et restriction
data = pd.Series([0.25,0.5,0.75,1.0],
                index=['a','b','c','d'])
print (data)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [13]:
#On peut désigner un element d'une Series par son index
print(data.loc['b'])

0.5


In [14]:
#par sa position:
print(data.iloc[1])

0.5


In [15]:
data = pd.DataFrame({'area':area,'pop':population})
print(data)


               area       pop
California   423967  38332521
Texas       1412297  26448193
New York     695662  19651127
Florida      170312  19552860
Illinois     149995  12882135


In [16]:
data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,1412297,26448193
New York,695662,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [17]:
#union grâce à pd.concat

ser1 = pd.Series(['A','B','C'], index=[1,2,3])
ser2 = pd.Series(['D','E','F'], index=[4,5,6])
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [18]:
#union pour un DataFrame
df1 = make_df('AB',[1,2])
df2 = make_df('AB',[3,4])
pd.concat([df1,df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [19]:
x = make_df('AB',[0,1])
y = make_df('AB',[2,3])
y.index = x.index # rend les index identiques
#Nous avons lors des index dupliqués

print(pd.concat([x,y]))

    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [20]:
#spécifier les index hierarchiques
hdf = pd.concat([x,y],keys=['x','y'])
print(hdf)

      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3


In [21]:
hdf.loc[('x',1),]

A    A1
B    B1
Name: (x, 1), dtype: object

### La jointure

In [22]:
df1 = pd.DataFrame({'employee': ['Bob','Jake','Lisa','Sue'],
                   'departement':['Accounting','Engineering','Engineering','HR']})
df2 = pd.DataFrame({'employee':['Lisa','Bob','Jake','Sue'],
                   'date':[2004,2008,2012,2014]})
df3 = pd.merge(df1, df2)

In [23]:
print(df3)

  employee  departement  date
0      Bob   Accounting  2008
1     Jake  Engineering  2012
2     Lisa  Engineering  2004
3      Sue           HR  2014
