# HW 0 - Intro to Pandas
Credit: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

In [114]:
import numpy as np
import pandas as pd
import matplotlib as plt

## Object Creation


Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [3]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.460068,0.066345,1.988739,0.440172
2013-01-02,-0.223068,-0.510724,-1.761993,-1.341846
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831
2013-01-04,-0.760899,0.943461,0.521913,1.524727
2013-01-05,-0.689914,-1.07715,-0.96435,1.452788
2013-01-06,1.030526,-0.104526,-0.789752,-1.14739


Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [6]:
df2 = pd.DataFrame({'A': 1.,
                     'B': pd.Timestamp('20130102'),
                     'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                     'D': np.array([3] * 4, dtype='int32'),
                     'E': pd.Categorical(["test", "train", "test", "train"]),
                     'F': 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


The columns of the resulting DataFrame have different dtypes.

In [8]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

View the top and bottom rows of the frame:

In [9]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.460068,0.066345,1.988739,0.440172
2013-01-02,-0.223068,-0.510724,-1.761993,-1.341846
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831
2013-01-04,-0.760899,0.943461,0.521913,1.524727
2013-01-05,-0.689914,-1.07715,-0.96435,1.452788


In [10]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.760899,0.943461,0.521913,1.524727
2013-01-05,-0.689914,-1.07715,-0.96435,1.452788
2013-01-06,1.030526,-0.104526,-0.789752,-1.14739


Display the index, columns:

In [11]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

DataFrame.to_numpy() gives a NumPy representation of the underlying data. Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being object, which requires casting every value to a Python object.

For df, our DataFrame of all floating-point values, DataFrame.to_numpy() is fast and doesn’t require copying data.

In [16]:
df.to_numpy()

AttributeError: 'DataFrame' object has no attribute 'to_numpy'

In [18]:
df2.to_numpy()

AttributeError: 'DataFrame' object has no attribute 'to_numpy'

describe() shows a quick statistic summary of your data:

In [19]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.446804,-0.097409,-0.228127,-0.075396
std,0.856591,0.675165,1.319509,1.386995
min,-1.577401,-1.07715,-1.761993,-1.380831
25%,-0.743153,-0.409174,-0.9207,-1.293232
50%,-0.574991,-0.01909,-0.576534,-0.353609
75%,-0.282318,0.09019,0.300606,1.199634
max,1.030526,0.943461,1.988739,1.524727


Transposing your data:

In [20]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.460068,-0.223068,-1.577401,-0.760899,-0.689914,1.030526
B,0.066345,-0.510724,0.098139,0.943461,-1.07715,-0.104526
C,1.988739,-1.761993,-0.363316,0.521913,-0.96435,-0.789752
D,0.440172,-1.341846,-1.380831,1.524727,1.452788,-1.14739


Sorting by an axis:

In [21]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.440172,1.988739,0.066345,-0.460068
2013-01-02,-1.341846,-1.761993,-0.510724,-0.223068
2013-01-03,-1.380831,-0.363316,0.098139,-1.577401
2013-01-04,1.524727,0.521913,0.943461,-0.760899
2013-01-05,1.452788,-0.96435,-1.07715,-0.689914
2013-01-06,-1.14739,-0.789752,-0.104526,1.030526


Sorting by values:

In [22]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-05,-0.689914,-1.07715,-0.96435,1.452788
2013-01-02,-0.223068,-0.510724,-1.761993,-1.341846
2013-01-06,1.030526,-0.104526,-0.789752,-1.14739
2013-01-01,-0.460068,0.066345,1.988739,0.440172
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831
2013-01-04,-0.760899,0.943461,0.521913,1.524727


## Selection

### Getting 

Selecting a single column, which yields a Series, equivalent to df.A:

In [23]:
df['A']

2013-01-01   -0.460068
2013-01-02   -0.223068
2013-01-03   -1.577401
2013-01-04   -0.760899
2013-01-05   -0.689914
2013-01-06    1.030526
Freq: D, Name: A, dtype: float64

Selecting via [ ], which slices the rows.

In [25]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.460068,0.066345,1.988739,0.440172
2013-01-02,-0.223068,-0.510724,-1.761993,-1.341846
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831


In [26]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.223068,-0.510724,-1.761993,-1.341846
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831
2013-01-04,-0.760899,0.943461,0.521913,1.524727


### Selection by label

For getting a cross section using a label:

In [27]:
df.loc[dates[0]]

A   -0.460068
B    0.066345
C    1.988739
D    0.440172
Name: 2013-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label:

In [28]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.460068,0.066345
2013-01-02,-0.223068,-0.510724
2013-01-03,-1.577401,0.098139
2013-01-04,-0.760899,0.943461
2013-01-05,-0.689914,-1.07715
2013-01-06,1.030526,-0.104526


Showing label slicing, both endpoints are included:

In [29]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.223068,-0.510724
2013-01-03,-1.577401,0.098139
2013-01-04,-0.760899,0.943461


Reduction in the dimensions of the returned object:

In [30]:
df.loc['20130102', ['A', 'B']]

A   -0.223068
B   -0.510724
Name: 2013-01-02 00:00:00, dtype: float64

For getting a scalar value:

In [31]:
df.loc[dates[0], 'A']

-0.46006775549057666

For getting fast access to a scalar (equivalent to the prior method):

In [32]:
df.at[dates[0], 'A']

-0.46006775549057666

### Selection by position

Select via the position of the passed integers:

In [33]:
df.iloc[3]

A   -0.760899
B    0.943461
C    0.521913
D    1.524727
Name: 2013-01-04 00:00:00, dtype: float64

By integer slices, acting similar to numpy/python:

In [34]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.760899,0.943461
2013-01-05,-0.689914,-1.07715


By lists of integer position locations, similar to the numpy/python style:

In [35]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.223068,-1.761993
2013-01-03,-1.577401,-0.363316
2013-01-05,-0.689914,-0.96435


For slicing rows explicitly:

In [36]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.223068,-0.510724,-1.761993,-1.341846
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831


For slicing columns explicitly:

In [37]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.066345,1.988739
2013-01-02,-0.510724,-1.761993
2013-01-03,0.098139,-0.363316
2013-01-04,0.943461,0.521913
2013-01-05,-1.07715,-0.96435
2013-01-06,-0.104526,-0.789752


For getting a value explicitly:

In [38]:
df.iloc[1, 1]

-0.5107238260189693

For getting fast access to a scalar (equivalent to the prior method):

In [39]:
df.iat[1, 1]

-0.5107238260189693

### Boolean Indexing

Selecting values from a DataFrame where a boolean condition is met.

In [41]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.066345,1.988739,0.440172
2013-01-02,,,,
2013-01-03,,0.098139,,
2013-01-04,,0.943461,0.521913,1.524727
2013-01-05,,,,1.452788
2013-01-06,1.030526,,,


Using the isin() method for filtering:

In [42]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.460068,0.066345,1.988739,0.440172,one
2013-01-02,-0.223068,-0.510724,-1.761993,-1.341846,one
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831,two
2013-01-04,-0.760899,0.943461,0.521913,1.524727,three
2013-01-05,-0.689914,-1.07715,-0.96435,1.452788,four
2013-01-06,1.030526,-0.104526,-0.789752,-1.14739,three


In [43]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.577401,0.098139,-0.363316,-1.380831,two
2013-01-05,-0.689914,-1.07715,-0.96435,1.452788,four


### Setting

Setting a new column automatically aligns the data by the indexes.

In [44]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [45]:
df['F'] = s1

Setting values by label:

In [46]:
df.at[dates[0], 'A'] = 0

Setting values by position:

In [47]:
df.iat[0, 1] = 0

Setting by assigning with a NumPy array:

In [48]:
df.loc[:, 'D'] = np.array([5] * len(df))

The result of the prior setting operations.

In [49]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.988739,5,
2013-01-02,-0.223068,-0.510724,-1.761993,5,1.0
2013-01-03,-1.577401,0.098139,-0.363316,5,2.0
2013-01-04,-0.760899,0.943461,0.521913,5,3.0
2013-01-05,-0.689914,-1.07715,-0.96435,5,4.0
2013-01-06,1.030526,-0.104526,-0.789752,5,5.0


A where operation with setting.

In [50]:
df2 = df.copy()

df2[df2 > 0] = -df2

df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.988739,-5,
2013-01-02,-0.223068,-0.510724,-1.761993,-5,-1.0
2013-01-03,-1.577401,-0.098139,-0.363316,-5,-2.0
2013-01-04,-0.760899,-0.943461,-0.521913,-5,-3.0
2013-01-05,-0.689914,-1.07715,-0.96435,-5,-4.0
2013-01-06,-1.030526,-0.104526,-0.789752,-5,-5.0


### Missing Data

pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations. See the Missing Data section.

Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.

In [51]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

df1.loc[dates[0]:dates[1], 'E'] = 1

df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.988739,5,,1.0
2013-01-02,-0.223068,-0.510724,-1.761993,5,1.0,1.0
2013-01-03,-1.577401,0.098139,-0.363316,5,2.0,
2013-01-04,-0.760899,0.943461,0.521913,5,3.0,


** To drop any rows that have missing data. **

In [53]:
df1.dropna()

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-0.223068,-0.510724,-1.761993,5,1.0,1.0


Filling missing data.

In [54]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.988739,5,5.0,1.0
2013-01-02,-0.223068,-0.510724,-1.761993,5,1.0,1.0
2013-01-03,-1.577401,0.098139,-0.363316,5,2.0,5.0
2013-01-04,-0.760899,0.943461,0.521913,5,3.0,5.0


To get the boolean mask where values are nan

In [58]:
df1.isnull()

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


## Operations

### Stats

Performing a descriptive statistic:

In [59]:
df.mean()

A   -0.370126
B   -0.108467
C   -0.228127
D    5.000000
F    3.000000
dtype: float64

Same operation on the other axis:

In [60]:
df.mean(1)

2013-01-01    1.747185
2013-01-02    0.700843
2013-01-03    1.031484
2013-01-04    1.740895
2013-01-05    1.253717
2013-01-06    2.027250
Freq: D, dtype: float64

Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.

In [62]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [63]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-2.577401,-0.901861,-1.363316,4.0,1.0
2013-01-04,-3.760899,-2.056539,-2.478087,2.0,0.0
2013-01-05,-5.689914,-6.07715,-5.96435,0.0,-1.0
2013-01-06,,,,,


### Apply 

Applying functions to the data:

In [64]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.988739,5,
2013-01-02,-0.223068,-0.510724,0.226745,10,1.0
2013-01-03,-1.800469,-0.412585,-0.13657,15,3.0
2013-01-04,-2.561368,0.530876,0.385343,20,6.0
2013-01-05,-3.251281,-0.546274,-0.579007,25,10.0
2013-01-06,-2.220755,-0.650799,-1.368759,30,15.0


### Histrogramming

In [67]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    3
1    0
2    1
3    5
4    5
5    3
6    5
7    4
8    2
9    4
dtype: int64

In [68]:
s.value_counts()

5    3
4    2
3    2
2    1
1    1
0    1
dtype: int64

### String Method

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them).

In [72]:
s = pd.Series(['A', 'B', 'C', 'AaBa', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    AaBa
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [73]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Merge

### Concat

pandas provides various facilities for easily combining together Series and DataFrame objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations.

Concatenating pandas objects together with concat():

In [74]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.476272,0.351803,-0.42238,-0.229154
1,5.7e-05,-0.614143,-1.405732,0.817366
2,-2.112307,-0.707446,0.443268,-0.760635
3,0.282393,-1.130718,-0.568913,-0.303105
4,-0.343657,-0.735229,1.052896,-1.9374
5,0.803854,0.179877,0.763747,1.095092
6,0.225387,0.102943,-1.054495,-1.041044
7,-0.136198,-0.630153,1.8539,1.285891
8,0.279703,1.313958,0.154146,0.627603
9,1.935228,0.315876,0.516842,-0.897316


In [78]:
# Break it into pieces
pieces = [df[0:3], df[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.476272,0.351803,-0.42238,-0.229154
1,5.7e-05,-0.614143,-1.405732,0.817366
2,-2.112307,-0.707446,0.443268,-0.760635
7,-0.136198,-0.630153,1.8539,1.285891
8,0.279703,1.313958,0.154146,0.627603
9,1.935228,0.315876,0.516842,-0.897316


### Join

SQL style merges

In [86]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


Another Example:

In [87]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


### Append

Append rows to a dataframe. 

In [89]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0.376686,-0.449592,1.380506,-0.719438
1,1.114091,-0.463619,1.272401,0.806992
2,-0.784094,-0.767482,0.857727,-0.590087
3,0.842061,1.456743,-1.126706,-1.329746
4,1.147554,-1.122077,-1.082039,-0.079621
5,-0.036777,-1.56133,1.113757,1.284969
6,-0.514966,-1.452835,-0.288221,-1.005774
7,2.634116,-1.914134,0.254542,-0.772458


In [92]:
s = df.iloc[3]
df.append(s, ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.376686,-0.449592,1.380506,-0.719438
1,1.114091,-0.463619,1.272401,0.806992
2,-0.784094,-0.767482,0.857727,-0.590087
3,0.842061,1.456743,-1.126706,-1.329746
4,1.147554,-1.122077,-1.082039,-0.079621
5,-0.036777,-1.56133,1.113757,1.284969
6,-0.514966,-1.452835,-0.288221,-1.005774
7,2.634116,-1.914134,0.254542,-0.772458
8,0.842061,1.456743,-1.126706,-1.329746


## Grouping

By “group by” we are referring to a process involving one or more of the following steps:

* Splitting the data into groups based on some criteria
* Applying a function to each group independently
* Combining the results into a data structure

In [96]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.679781,0.404165
1,bar,one,-0.069471,1.704262
2,foo,two,0.55842,-0.15706
3,bar,three,-1.246237,-0.438941
4,foo,two,-0.626878,3.784242
5,bar,two,1.050694,0.834929
6,foo,one,0.531512,-0.191411
7,foo,three,-0.885796,-0.637072


Grouping and then applying the sum() function to the resulting groups.

In [106]:
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.265015,2.10025
foo,-1.102523,3.202864


Grouping by multiple columns forms a hierarchical index, and again we can apply the sum function.

In [107]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.069471,1.704262
bar,three,-1.246237,-0.438941
bar,two,1.050694,0.834929
foo,one,-0.148269,0.212754
foo,three,-0.885796,-0.637072
foo,two,-0.068458,3.627182


## Plotting

In [112]:
ts = pd.Series(np.random.randn(1000),
                index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x119e1e198>

In [121]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                   columns=['A', 'B', 'C', 'D'])
df = df.cumsum()

df.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x10f7303c8>

## Getting data in/out

### CSV

In [116]:
df.to_csv('foo.csv')

In [118]:
pd.read_csv('foo.csv').head()

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,2000-01-01,-0.109814,1.35371,-0.304542,-0.246742
1,2000-01-02,-0.592615,2.515739,-1.219836,-0.150111
2,2000-01-03,-0.354486,3.08453,0.863061,-0.38218
3,2000-01-04,0.059785,2.31242,-0.089832,-2.227083
4,2000-01-05,0.89026,2.946729,-0.855488,-0.642063
