In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
# Build a Series from a plain list. No index is passed, so pandas assigns the
# default integer labels 0..7; the np.nan entries make the dtype float64.
listA = pd.Series(data=[1, 3, 5, np.nan, 6, 8, 10, np.nan])

In [40]:
# A Series with the default integer index can be accessed by integer label,
# which here coincides with position: label 5 holds the sixth value.
print(listA)
print()
print(listA[5])

0     1.0
1     3.0
2     5.0
3     NaN
4     6.0
5     8.0
6    10.0
7     NaN
dtype: float64

8.0


In [56]:
# Build a DatetimeIndex of 20 consecutive days beginning at 2013-01-01.
# The start date counts as the first period, so the range ends on 2013-01-20.
dates = pd.date_range(start='20130101', periods=20)

In [57]:
# A date range supports integer (positional) indexing too: position 0 is the first Timestamp.
dates[0]

Timestamp('2013-01-01 00:00:00', freq='D')

In [58]:
# Create a DataFrame of random values with 20 rows and 5 columns.
# Rows: the index is the 20 entries of `dates`.
# Columns: labelled with the letters of 'ABCDE'.
# NOTE(review): no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(20,5), index = dates, columns=list('ABCDE'))

In [59]:
# Display the DataFrame.
df

Unnamed: 0,A,B,C,D,E
2013-01-01,1.634966,-0.676421,0.965136,0.68851,0.788234
2013-01-02,1.183045,0.457848,0.52714,-0.411322,-1.515269
2013-01-03,1.03841,-0.280123,-1.16516,-0.435809,-0.86516
2013-01-04,1.889611,0.528276,0.18952,-1.194288,-0.243834
2013-01-05,0.375089,1.83805,1.560914,-0.581231,-0.098355
2013-01-06,0.005357,0.384227,0.404812,0.306244,0.949622
2013-01-07,1.36712,-0.591361,-1.376663,0.84683,0.391403
2013-01-08,1.330238,-0.373497,0.32051,-0.88544,-0.349832
2013-01-09,-0.298429,0.437757,1.174706,-0.736686,0.348387
2013-01-10,-1.12129,0.112611,-0.861955,-1.201839,-0.276085


In [65]:
# Build a DataFrame from a dict: every key becomes a column.
#   - A list-like value (Series, array, Categorical) supplies one entry per row;
#     here each of them has 4 entries, so the frame has 4 rows.
#   - A scalar value (1., a Timestamp, 'foo') is repeated on every row.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

In [66]:
# View the first 5 rows of df (head() shows 5 rows when called without an argument).
df.head()

Unnamed: 0,A,B,C,D,E
2013-01-01,1.634966,-0.676421,0.965136,0.68851,0.788234
2013-01-02,1.183045,0.457848,0.52714,-0.411322,-1.515269
2013-01-03,1.03841,-0.280123,-1.16516,-0.435809,-0.86516
2013-01-04,1.889611,0.528276,0.18952,-1.194288,-0.243834
2013-01-05,0.375089,1.83805,1.560914,-0.581231,-0.098355


In [67]:
# View the last 5 rows of df (tail() shows 5 rows when called without an argument).
df.tail()

Unnamed: 0,A,B,C,D,E
2013-01-16,0.424391,0.927089,0.501723,0.124618,-1.113031
2013-01-17,-0.418798,-1.599441,-0.409959,-1.646945,0.436147
2013-01-18,-0.496171,-0.833966,0.360008,-1.24632,0.962729
2013-01-19,-2.056146,0.98101,-0.188793,-1.365209,-0.434637
2013-01-20,-2.094852,-0.223294,1.85462,-1.773589,-1.250617


In [78]:
# Inspect the row index and print its type.
# Because the rows are labelled with dates, the index is a DatetimeIndex.
index_series = df.index
print(index_series)
print(type(index_series))

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10', '2013-01-11', '2013-01-12',
               '2013-01-13', '2013-01-14', '2013-01-15', '2013-01-16',
               '2013-01-17', '2013-01-18', '2013-01-19', '2013-01-20'],
              dtype='datetime64[ns]', freq='D')
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [79]:
# Inspect the column labels and print their type: a generic pandas Index.
column_series = df.columns
print(column_series)
print(type(column_series))

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')
<class 'pandas.core.indexes.base.Index'>


In [86]:
# Get a statistical summary of the dataset; the result is itself a DataFrame.
# Its index holds the statistic names (count, mean, std, min, quartiles, max),
# with one column per column of df.
df.describe()

Unnamed: 0,A,B,C,D,E
count,20.0,20.0,20.0,20.0,20.0
mean,0.05861,0.062134,0.243741,-0.495434,-0.055697
std,1.176631,0.931954,0.8274,0.838445,0.956666
min,-2.094852,-1.688494,-1.376663,-1.773589,-2.098388
25%,-0.641032,-0.612626,-0.212282,-1.196176,-0.542268
50%,-0.113482,0.248419,0.340259,-0.532534,0.075451
75%,1.100645,0.62798,0.573912,0.029389,0.524169
max,1.889611,1.83805,1.85462,1.200937,1.477314


In [129]:
# Transpose the DataFrame, i.e. swap rows and columns.
# .T returns a new object and does NOT modify df; assign the result to keep it.
# ********** ex: df = df.T **********
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00,2013-01-07 00:00:00,2013-01-08 00:00:00,2013-01-09 00:00:00,2013-01-10 00:00:00,2013-01-11 00:00:00,2013-01-12 00:00:00,2013-01-13 00:00:00,2013-01-14 00:00:00,2013-01-15 00:00:00,2013-01-16 00:00:00,2013-01-17 00:00:00,2013-01-18 00:00:00,2013-01-19 00:00:00,2013-01-20 00:00:00
A,1.634966,1.183045,1.03841,1.889611,0.375089,0.005357,1.36712,1.330238,-0.298429,-1.12129,-1.102652,-0.617795,1.073178,-0.710743,-0.232321,0.424391,-0.418798,-0.496171,-2.056146,-2.094852
B,-0.676421,0.457848,-0.280123,0.528276,1.83805,0.384227,-0.591361,-0.373497,0.437757,0.112611,1.175125,-0.810956,1.076415,-1.688494,0.401815,0.927089,-1.599441,-0.833966,0.98101,-0.223294
C,0.965136,0.52714,-1.16516,0.18952,1.560914,0.404812,-1.376663,0.32051,1.174706,-0.861955,0.714225,-0.28275,-0.078576,0.453407,0.211948,0.501723,-0.409959,0.360008,-0.188793,1.85462
D,0.68851,-0.411322,-0.435809,-1.194288,-0.581231,0.306244,0.84683,-0.88544,-0.736686,-1.201839,1.200937,-0.054271,-0.483837,-0.002355,-1.056674,0.124618,-1.646945,-1.24632,-1.365209,-1.773589
E,0.788234,-1.515269,-0.86516,-0.243834,-0.098355,0.949622,0.391403,-0.349832,0.348387,-0.276085,0.307592,1.477314,1.220573,-2.098388,0.249257,-1.113031,0.436147,0.962729,-0.434637,-1.250617


In [130]:
# Sort the DataFrame by its axis labels.
# axis=1 sorts the column labels; ascending=False orders them E, D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,E,D,C,B,A
2013-01-01,0.788234,0.68851,0.965136,-0.676421,1.634966
2013-01-02,-1.515269,-0.411322,0.52714,0.457848,1.183045
2013-01-03,-0.86516,-0.435809,-1.16516,-0.280123,1.03841
2013-01-04,-0.243834,-1.194288,0.18952,0.528276,1.889611
2013-01-05,-0.098355,-0.581231,1.560914,1.83805,0.375089
2013-01-06,0.949622,0.306244,0.404812,0.384227,0.005357
2013-01-07,0.391403,0.84683,-1.376663,-0.591361,1.36712
2013-01-08,-0.349832,-0.88544,0.32051,-0.373497,1.330238
2013-01-09,0.348387,-0.736686,1.174706,0.437757,-0.298429
2013-01-10,-0.276085,-1.201839,-0.861955,0.112611,-1.12129


In [131]:
# Sort the rows of the DataFrame by the values in column D (smallest first).
df.sort_values(by="D")

Unnamed: 0,A,B,C,D,E
2013-01-20,-2.094852,-0.223294,1.85462,-1.773589,-1.250617
2013-01-17,-0.418798,-1.599441,-0.409959,-1.646945,0.436147
2013-01-19,-2.056146,0.98101,-0.188793,-1.365209,-0.434637
2013-01-18,-0.496171,-0.833966,0.360008,-1.24632,0.962729
2013-01-10,-1.12129,0.112611,-0.861955,-1.201839,-0.276085
2013-01-04,1.889611,0.528276,0.18952,-1.194288,-0.243834
2013-01-15,-0.232321,0.401815,0.211948,-1.056674,0.249257
2013-01-08,1.330238,-0.373497,0.32051,-0.88544,-0.349832
2013-01-09,-0.298429,0.437757,1.174706,-0.736686,0.348387
2013-01-05,0.375089,1.83805,1.560914,-0.581231,-0.098355


In [132]:
# df itself is unchanged: the sort calls above returned new frames rather than modifying it.
df

Unnamed: 0,A,B,C,D,E
2013-01-01,1.634966,-0.676421,0.965136,0.68851,0.788234
2013-01-02,1.183045,0.457848,0.52714,-0.411322,-1.515269
2013-01-03,1.03841,-0.280123,-1.16516,-0.435809,-0.86516
2013-01-04,1.889611,0.528276,0.18952,-1.194288,-0.243834
2013-01-05,0.375089,1.83805,1.560914,-0.581231,-0.098355
2013-01-06,0.005357,0.384227,0.404812,0.306244,0.949622
2013-01-07,1.36712,-0.591361,-1.376663,0.84683,0.391403
2013-01-08,1.330238,-0.373497,0.32051,-0.88544,-0.349832
2013-01-09,-0.298429,0.437757,1.174706,-0.736686,0.348387
2013-01-10,-1.12129,0.112611,-0.861955,-1.201839,-0.276085


In [136]:
# Selecting a single column returns a Series that keeps the DataFrame's index.
df["A"]

2013-01-01    1.634966
2013-01-02    1.183045
2013-01-03    1.038410
2013-01-04    1.889611
2013-01-05    0.375089
2013-01-06    0.005357
2013-01-07    1.367120
2013-01-08    1.330238
2013-01-09   -0.298429
2013-01-10   -1.121290
2013-01-11   -1.102652
2013-01-12   -0.617795
2013-01-13    1.073178
2013-01-14   -0.710743
2013-01-15   -0.232321
2013-01-16    0.424391
2013-01-17   -0.418798
2013-01-18   -0.496171
2013-01-19   -2.056146
2013-01-20   -2.094852
Freq: D, Name: A, dtype: float64

In [143]:
# Get a slice of the DataFrame's rows using [start : stop], where stop is NOT included.
# e.g. df[5:10] grabs the rows at positions 5, 6, 7, 8 and 9, but NOT 10.
df[5:10]

Unnamed: 0,A,B,C,D,E
2013-01-06,0.005357,0.384227,0.404812,0.306244,0.949622
2013-01-07,1.36712,-0.591361,-1.376663,0.84683,0.391403
2013-01-08,1.330238,-0.373497,0.32051,-0.88544,-0.349832
2013-01-09,-0.298429,0.437757,1.174706,-0.736686,0.348387
2013-01-10,-1.12129,0.112611,-0.861955,-1.201839,-0.276085


In [147]:
# df.loc selects by label. Given a single row label it returns that row as a
# Series holding the value of every column.
df.loc[dates[5]]

A    0.005357
B    0.384227
C    0.404812
D    0.306244
E    0.949622
Name: 2013-01-06 00:00:00, dtype: float64

In [170]:
# df.loc[row labels, column labels]
# Here: the rows from 2013-01-05 through 2013-01-10, columns A and E only.
# With a label slice BOTH endpoints are included!
df.loc['2013-01-05':'2013-01-10', ['A','E']]

Unnamed: 0,A,E
2013-01-05,0.375089,-0.098355
2013-01-06,0.005357,0.949622
2013-01-07,1.36712,0.391403
2013-01-08,1.330238,-0.349832
2013-01-09,-0.298429,0.348387
2013-01-10,-1.12129,-0.276085


In [173]:
# Reduce dimensions: select a single row by its date label, restricted to
# columns A and B. The result is a Series, not a DataFrame.
df.loc['20130110',['A','B']]

A   -1.121290
B    0.112611
Name: 2013-01-10 00:00:00, dtype: float64

In [178]:
# Grab a single scalar with df.loc[row label, column label].
# The row label is taken from the date range (dates[10]); the column is 'B'.
float_val = df.loc[dates[10],'B']
print(float_val)

1.17512510438


In [180]:
# iloc selects by integer position. A single integer returns that whole row as
# a Series; here the 4th row (position 3).
df.iloc[3]

A    1.889611
B    0.528276
C    0.189520
D   -1.194288
E   -0.243834
Name: 2013-01-04 00:00:00, dtype: float64

In [182]:
# iloc can also slice the DataFrame like loc, but with integer positions: iloc[rows, columns].
# In this case we get:
    # 3:5 => rows at positions 3 and 4, NOT 5.
    # 0:2 => columns at positions 0 and 1 (A and B), NOT 2.
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,1.889611,0.528276
2013-01-05,0.375089,1.83805


In [184]:
# iloc can also pick individual rows and columns from lists of positions (not only ranges).
# In this case we get:
    # [1,2,4] => rows at positions 1, 2 and 4.
    # [0,2] => columns at positions 0 and 2 (A and C).
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.183045,0.52714
2013-01-03,1.03841,-1.16516
2013-01-05,0.375089,1.560914


In [190]:
# Slicing rows by position.
# In this case we get:
    # 1:3 => rows at positions 1 and 2, NOT 3.
    # : => ALL columns.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D,E
2013-01-02,1.183045,0.457848,0.52714,-0.411322,-1.515269
2013-01-03,1.03841,-0.280123,-1.16516,-0.435809,-0.86516


In [192]:
# Slicing columns by position.
# In this case we get:
    # : => ALL rows.
    # 1:3 => columns at positions 1 and 2 (B and C), NOT 3.
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.676421,0.965136
2013-01-02,0.457848,0.52714
2013-01-03,-0.280123,-1.16516
2013-01-04,0.528276,0.18952
2013-01-05,1.83805,1.560914
2013-01-06,0.384227,0.404812
2013-01-07,-0.591361,-1.376663
2013-01-08,-0.373497,0.32051
2013-01-09,0.437757,1.174706
2013-01-10,0.112611,-0.861955


In [205]:
# Grab a single value by integer position.
# df.iloc[row position, column position]

iloc_val = df.iloc[1,1]

# Equivalent lookups: .loc by row/column label, and .iat by row/column position.
same_val = df.loc["20130102","B"]
still_same = df.iat[1,1]

# All three print the same number.
print("using df.iloc:",iloc_val,"\n", "using df.iat:",still_same, "\n","using df.loc:",same_val)

using df.iloc: 0.457847655031 
 using df.iat: 0.457847655031 
 using df.loc: 0.457847655031


# Using Booleans to filter Dataframes

In [210]:
# Use a boolean condition on one column to select rows.
# df.A > 0 is a boolean Series (one True/False per row); indexing df with it
# keeps only the rows where column A is greater than 0, with all columns.
col_A_greater_than = df[df.A > 0]

# For comparison: the rows where column A is less than 0.
# The filter is on column A, so the result is named accordingly. The original
# (misleading) name col_B_less_than is kept as an alias because the next cell
# prints it.
col_A_less_than = df[df.A < 0]
col_B_less_than = col_A_less_than

In [212]:
# Print both filtered frames: first the rows with A > 0, then the rows with A < 0.
# NOTE: despite its name, col_B_less_than was built from a condition on column A.
print(col_A_greater_than)
print(col_B_less_than)

                   A         B         C         D         E
2013-01-01  1.634966 -0.676421  0.965136  0.688510  0.788234
2013-01-02  1.183045  0.457848  0.527140 -0.411322 -1.515269
2013-01-03  1.038410 -0.280123 -1.165160 -0.435809 -0.865160
2013-01-04  1.889611  0.528276  0.189520 -1.194288 -0.243834
2013-01-05  0.375089  1.838050  1.560914 -0.581231 -0.098355
2013-01-06  0.005357  0.384227  0.404812  0.306244  0.949622
2013-01-07  1.367120 -0.591361 -1.376663  0.846830  0.391403
2013-01-08  1.330238 -0.373497  0.320510 -0.885440 -0.349832
2013-01-13  1.073178  1.076415 -0.078576 -0.483837  1.220573
2013-01-16  0.424391  0.927089  0.501723  0.124618 -1.113031
                   A         B         C         D         E
2013-01-09 -0.298429  0.437757  1.174706 -0.736686  0.348387
2013-01-10 -1.121290  0.112611 -0.861955 -1.201839 -0.276085
2013-01-11 -1.102652  1.175125  0.714225  1.200937  0.307592
2013-01-12 -0.617795 -0.810956 -0.282750 -0.054271  1.477314
2013-01-14 -0.710743 -1.

In [217]:
# Indexing with a boolean mask of the whole frame keeps the frame's shape:
# cells that meet the condition (> 0) keep their value, all others become NaN.
# NOTE(review): `greatar_than` is a misspelling of `greater_than`; the name is left unchanged here.
greatar_than = df[df>0]
print(greatar_than)

                   A         B         C         D         E
2013-01-01  1.634966       NaN  0.965136  0.688510  0.788234
2013-01-02  1.183045  0.457848  0.527140       NaN       NaN
2013-01-03  1.038410       NaN       NaN       NaN       NaN
2013-01-04  1.889611  0.528276  0.189520       NaN       NaN
2013-01-05  0.375089  1.838050  1.560914       NaN       NaN
2013-01-06  0.005357  0.384227  0.404812  0.306244  0.949622
2013-01-07  1.367120       NaN       NaN  0.846830  0.391403
2013-01-08  1.330238       NaN  0.320510       NaN       NaN
2013-01-09       NaN  0.437757  1.174706       NaN  0.348387
2013-01-10       NaN  0.112611       NaN       NaN       NaN
2013-01-11       NaN  1.175125  0.714225  1.200937  0.307592
2013-01-12       NaN       NaN       NaN       NaN  1.477314
2013-01-13  1.073178  1.076415       NaN       NaN  1.220573
2013-01-14       NaN       NaN  0.453407       NaN       NaN
2013-01-15       NaN  0.401815  0.211948       NaN  0.249257
2013-01-16  0.424391  0.

In [235]:
# Make a copy of the DataFrame.
# NOTE: this rebinds df2, replacing the dict-built frame assigned to df2 earlier.
df2 = df.copy()

In [247]:
# A Series indexed by six consecutive days starting 2013-01-02; its last value is missing (NaN).
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, np.nan],
    index=pd.date_range(start='20130102', periods=6),
)

In [248]:
# Display the Series.
s1

2013-01-02    1.0
2013-01-03    2.0
2013-01-04    3.0
2013-01-05    4.0
2013-01-06    5.0
2013-01-07    NaN
Freq: D, dtype: float64

In [253]:
# Find which entries are missing: returns a boolean Series that is True where the value is NaN.
pd.isnull(s1)

2013-01-02    False
2013-01-03    False
2013-01-04    False
2013-01-05    False
2013-01-06    False
2013-01-07     True
Freq: D, dtype: bool

# Operations on Dataframes

In [256]:
# Find the mean of each column (one result per column label).
df.mean()

A    0.058610
B    0.062134
C    0.243741
D   -0.495434
E   -0.055697
dtype: float64

In [258]:
# Find the mean of each row: passing axis 1 averages across the columns, giving one result per date.
df.mean(1)

2013-01-01    0.680085
2013-01-02    0.048288
2013-01-03   -0.341568
2013-01-04    0.233857
2013-01-05    0.618894
2013-01-06    0.410052
2013-01-07    0.127466
2013-01-08    0.008396
2013-01-09    0.185147
2013-01-10   -0.669712
2013-01-11    0.459046
2013-01-12   -0.057692
2013-01-13    0.561551
2013-01-14   -0.809315
2013-01-15   -0.085195
2013-01-16    0.172958
2013-01-17   -0.727799
2013-01-18   -0.250744
2013-01-19   -0.612755
2013-01-20   -0.697546
Freq: D, dtype: float64

In [279]:
# Apply a function to the DataFrame with apply(); a lambda is Python's inline anonymous function.
# Lambda notation => (lambda param: param + 2).
# As called here, param is each COLUMN (a Series), not a row: the result has one
# entry per column label, holding that column's max minus its min.
df.apply(lambda x: x.max() - x.min())

A    3.984463
B    3.526544
C    3.231283
D    2.974526
E    3.575703
dtype: float64

In [306]:
# Counting values in a Series: draw 10 random integers from 0 to 6 (the upper bound 7 is excluded),
# then value_counts() tallies how often each distinct value occurs, most frequent first.
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
s.value_counts()

0    4
1    3
2    5
3    5
4    0
5    3
6    3
7    0
8    3
9    3
dtype: int32


3    5
5    2
0    2
4    1
dtype: int64

In [307]:
# String operations on a Series go through the .str accessor.
# lower() is applied to every element; the NaN entry stays NaN.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s)
s.str.lower()

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object


0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# Merging Dataframes

In [311]:
# Generate a random DataFrame with 10 rows (labels 0-9) and 4 columns (labels 0-3).
# NOTE: this rebinds df; the date-indexed frame used in the earlier cells is replaced.
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.827049,1.233997,-0.853327,0.615208
1,-0.193758,-0.00436,0.539132,0.016295
2,-1.219261,1.248029,1.521215,-0.557823
3,0.119526,1.0872,0.128311,-1.315405
4,0.327184,1.258326,0.523944,-0.604892
5,-1.034214,-1.012781,0.205796,-0.803814
6,1.565481,0.425242,-0.771752,0.936594
7,-0.335501,0.844694,2.051308,0.184368
8,0.246874,-0.008751,0.296059,1.04131
9,1.132319,1.959247,-0.224585,-1.584019


In [321]:
# Break the DataFrame apart into three row slices: rows 0-2, rows 3-6 and rows 7-9.
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0  0.827049  1.233997 -0.853327  0.615208
 1 -0.193758 -0.004360  0.539132  0.016295
 2 -1.219261  1.248029  1.521215 -0.557823,
           0         1         2         3
 3  0.119526  1.087200  0.128311 -1.315405
 4  0.327184  1.258326  0.523944 -0.604892
 5 -1.034214 -1.012781  0.205796 -0.803814
 6  1.565481  0.425242 -0.771752  0.936594,
           0         1         2         3
 7 -0.335501  0.844694  2.051308  0.184368
 8  0.246874 -0.008751  0.296059  1.041310
 9  1.132319  1.959247 -0.224585 -1.584019]

In [322]:
# Stack the pieces back together row-wise; this reproduces the original frame.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.827049,1.233997,-0.853327,0.615208
1,-0.193758,-0.00436,0.539132,0.016295
2,-1.219261,1.248029,1.521215,-0.557823
3,0.119526,1.0872,0.128311,-1.315405
4,0.327184,1.258326,0.523944,-0.604892
5,-1.034214,-1.012781,0.205796,-0.803814
6,1.565481,0.425242,-0.771752,0.936594
7,-0.335501,0.844694,2.051308,0.184368
8,0.246874,-0.008751,0.296059,1.04131
9,1.132319,1.959247,-0.224585,-1.584019


In [338]:
# Many-to-many merge: the key 'foo' occurs twice on each side, so every left
# row is paired with every right row and the result has 2 x 2 = 4 rows.
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 9]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

print(left)
print()
print(right)

left.merge(right, on='key')

   key  lval
0  foo     1
1  foo     9

   key  rval
0  foo     4
1  foo     5


Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,9,4
3,foo,9,5


In [339]:
# One-to-one merge: each key ('foo', 'bar') appears once per side, so the
# result has exactly one row per key, carrying both lval and rval.
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 9]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

print(left)
print()
print(right)

left.merge(right, on='key')

   key  lval
0  foo     1
1  bar     9

   key  rval
0  foo     4
1  bar     5


Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,9,5


# Appending Rows to end of Dataframe.

In [349]:
# Build an 8x4 frame of random values and pull out the row at position 3 as a Series.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
s = df.iloc[3]
print(df)
print()
print(s)

# Append the row at the end of the DataFrame.
#   DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
#   Series is turned into a one-row frame (to_frame().T) and concatenated instead.
#   ignore_index=True renumbers the result 0..8; had we set it to False, the
#   appended row would keep its original label 3 instead of getting 8.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
appended

          A         B         C         D
0 -0.668634 -0.184054 -0.988512  0.506282
1 -0.307943  0.966569 -0.587074 -0.791615
2 -0.458846  0.540408  0.765563 -0.164819
3 -0.897853 -0.975944 -1.405578  0.148589
4 -0.097950  0.290904  2.104854 -0.397234
5  0.862101 -0.280224 -1.870525 -1.412725
6  1.518110 -0.610119 -1.954349  0.607304
7 -0.710187  0.729248 -1.381562  0.849026

A   -0.897853
B   -0.975944
C   -1.405578
D    0.148589
Name: 3, dtype: float64


Unnamed: 0,A,B,C,D
0,-0.668634,-0.184054,-0.988512,0.506282
1,-0.307943,0.966569,-0.587074,-0.791615
2,-0.458846,0.540408,0.765563,-0.164819
3,-0.897853,-0.975944,-1.405578,0.148589
4,-0.09795,0.290904,2.104854,-0.397234
5,0.862101,-0.280224,-1.870525,-1.412725
6,1.51811,-0.610119,-1.954349,0.607304
7,-0.710187,0.729248,-1.381562,0.849026
8,-0.897853,-0.975944,-1.405578,0.148589


# Grouping!

In [353]:
# Sample frame for grouping: A and B hold repeated string keys,
# C and D hold random numeric values (8 rows in total).
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.768746,0.595173
1,bar,one,-0.422745,0.353691
2,foo,two,1.215908,1.436752
3,bar,three,1.745009,0.987088
4,foo,two,0.155578,-0.275787
5,bar,two,-0.987453,0.885089
6,foo,one,-1.255522,-1.183205
7,foo,three,-0.905008,0.273974


In [362]:
# Group the rows by column A and total the numeric columns (C and D) per group.
# numeric_only=True keeps the string column B out of the result; on
# pandas >= 2.0 a plain .sum() would concatenate B's strings into an extra column.
df.groupby("A").sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.334812,2.225868
foo,-0.020298,0.846907


In [363]:
# Group by the (A, B) pair: the result is indexed by both keys (a two-level index)
# and holds the sums of C and D within each pair.
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.422745,0.353691
bar,three,1.745009,0.987088
bar,two,-0.987453,0.885089
foo,one,-0.486776,-0.588032
foo,three,-0.905008,0.273974
foo,two,1.371486,1.160965


# Reshaping!