In [None]:
import numpy as np
import pandas as pd
import datetime as dt

# Render floats with two decimal places whenever a frame is displayed.
pd.options.display.float_format = '{:.2f}'.format

# The file has no header row, so the column names are supplied explicitly;
# the 'Date' column is parsed and used as the index.
df = pd.read_csv(
    '../data/000015',
    header=None,
    names=['Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'Money', 'PE', 'PB'],
    index_col='Date',
    parse_dates=True,
)

# Percentage change of the close relative to the previous row.
df['Return'] = 100 * df['Close'].pct_change()

# Calendar components of every index entry, stored as ordinary columns.
df['Year'] = [stamp.year for stamp in df.index]
df['Month'] = [stamp.month for stamp in df.index]
df['Day'] = [stamp.day for stamp in df.index]

# Basic indexing

In [None]:
# Passing a list of column names to [] selects those columns.
df_prices = df[['Close', 'High', 'Low']]
   
# Passing a single column name to [] selects that one column.
series_close = df['Close']
# The selected column can then be indexed with a date string from the index.
close_of_a_day = series_close['2010-01-04']
close_of_a_day

# The same date string cannot be used with [] directly on the DataFrame:
try:
    # NOTE(review): presumably treated as a column lookup, hence the KeyError
    # guarded against below -- the exact behavior depends on the pandas version.
    df_prices['2010-01-04']
except KeyError:
    # Deliberately ignored: the failure itself is what this cell demonstrates.
    pass

## Accessing attributes using dot operator

In [None]:
# A column can be read as an attribute with the dot operator.
df.Close

# Select columns by integer position. The column labels in this frame are
# strings, so integers passed to df[[...]] are looked up as labels and raise
# KeyError; positional selection has to go through .iloc.
df.iloc[:, [1, 2]]

## Range slicing

The syntax of the slicing operator exactly matches that of NumPy:

```python
ar[startIndex: endIndex: stepValue]
```

where the default values if not specified are as follows:

* 0 for startIndex
* arraysize for endIndex (the end index is exclusive, so the slice runs through the last element)
* 1 for stepValue

# Label, integer and mixed indexing

* The `.loc` operator: Allows label-oriented indexing
* The `.iloc` operator: Allows integer-based indexing
* The `.ix` operator: Allows mixed label and integer-based indexing (deprecated in pandas 0.20 and removed in pandas 1.0; use `.loc` or `.iloc` instead)


## Label-oriented indexing

The `.loc` operator supports pure label-based indexing. It accepts the following as valid inputs:

* A single label.
* List or array of labels.
* A slice object with labels.
* A Boolean array.

In [None]:
# Select by a single row label.
df.loc['2010-01-04']

# The following three expressions are equivalent ways to read one cell:
# row and column label together, row first then column, column first then row.
df.loc['2010-01-04', 'Close']
df.loc['2010-01-04']['Close']
df['Close']['2010-01-04']

# Select by a list of row labels.
df.loc[['2010-01-04', '2010-01-05']]
# Select by a slice of row labels.
df.loc['2010-01-04': '2010-02-05']

### Selection using a Boolean array

In [None]:
# Boolean mask over the rows: keep those whose close is not above the
# smallest close in the whole frame; the ':' keeps every column.
df.loc[df['Close'].le(df['Close'].min()), :]

## Integer-oriented indexing

The `iloc` operator supports integer-based positional indexing. It accepts the following as inputs:

* A single integer.
* A list or array of integers.
* A slice object with integers.

In [None]:
# Positional slice: rows 0 through 9, all columns.
df.iloc[:10]

## Mixed indexing with the .ix operator

The `.ix` operator behaves like a mixture of the `.loc` and `.iloc` operators, with the `.loc` behavior taking precedence. It takes the following as possible inputs:

* A single label or integer
* A list of integers or labels
* An integer slice or label slice
* A Boolean array

In [None]:
# NOTE: .ix was deprecated in pandas 0.20 and removed in pandas 1.0, so every
# former .ix call is written with the explicit indexer it used to resolve to.

# Label-based lookups (.ix tried labels first) -> .loc
df.loc['2010-01-04']
df.loc[['2010-01-04', '2010-01-05']]
df.loc[df.index[-3:]]

# Integers cannot be labels of a date index, so .ix fell back to positions
# for them -> .iloc
df.iloc[0]
df.iloc[[0, 2]]
df.iloc[1:3]

# Boolean mask -> .loc
df.loc[df['Close'] > 4044.6640]

## MultiIndexing

In [None]:
# Build a copy of df indexed by a three-level (Year, Month, Day) MultiIndex.
# reset_index() first turns the 'Date' index back into an ordinary column.
df1 = df.reset_index()
df1.set_index(['Year', 'Month', 'Day'], inplace=True)

# Values of each index level, by level position.
df1.index.get_level_values(0)
df1.index.get_level_values(1)
df1.index.get_level_values(2)

# .ix was removed in pandas 1.0. The index levels hold integer labels, so the
# label-based .loc is the equivalent indexer here.
df1.loc[(2011, 2)]              # year 2011, month 2
df1.loc[2011:2012]              # label slice on the first level
df1.loc[(2011, 1):(2012, 2)]    # label slice between two (year, month) keys

## Swapping and reordering levels

In [None]:
# Exchange the first two index levels: (Year, Month, Day) -> (Month, Year, Day).
df_swapped = df1.swaplevel(0, 1, axis=0)
# sortlevel() and .ix were both removed from pandas; sort_index(level=0) and
# .loc are their replacements. The index is sorted first because the swap
# leaves it unsorted and label slicing needs a sorted MultiIndex.
df_swapped.sort_index(level=0).loc[(1, 2010):(1, 2011)]

# The reorder_levels function is more general: it takes the complete new
# order of the levels.

df_recorded = df1.reorder_levels(['Month', 'Day', 'Year'], axis=0)

## Cross sections

The `xs` method provides a shortcut means of selecting data based on a particular index level value.

In [None]:
# Cross section: every row whose 'Month' level equals 2.
df1.xs(2, level='Month')

# Same as moving 'Month' to the front and selecting label 2 on it
# (.loc replaces .ix, which was removed in pandas 1.0).

df1.swaplevel(0, 1, axis=0).loc[2]

# Boolean indexing

## isin and any all methods

In [None]:
# Rows whose index value is one of the listed dates.
df.loc[df.index.isin(['2010-01-04', '2010-01-05'])]

# Rows in which any column has an integer part of 2656. np.trunc is used
# instead of astype('int') because 'Return' is built with pct_change() and so
# always starts with NaN, which cannot be cast to an integer; NaN stays NaN
# under np.trunc and simply compares unequal.
df.loc[(np.trunc(df) == 2656).any(axis=1)]

## using the where() method

The `where` method is used to ensure that the result of Boolean filtering is the same shape as the original data.

In [None]:
# Boolean filtering with []: the mask is built from the 'Close' column and
# passed to [] to select rows.
df[df['Close'] > 2800]
# where(): the mask is built from the whole frame, so the result keeps the
# shape of the original data.
df.where(df > 2800)

## Operations on indexes

In [None]:
# reset_index() moves the current index ('Date') back into the columns.
df1 = df.reset_index()
# Make 'Date' the index again. The result is not assigned and inplace is not
# requested, so this line only produces a new frame for display; df1 itself
# is not rebound here.
df1.set_index('Date')

# Grouping of data

## The groupby operation

The `groupby` operation can be thought of as part of a process that involves the following three steps:

* Splitting the dataset
* Analyzing the data
* Aggregating or combining the data

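The result of a `groupby` operation is not a DataFrame but a `GroupBy` object (here a `DataFrameGroupBy`), which behaves much like a mapping from each group key to the sub-DataFrame for that group.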

In [None]:
# Calendar parts of every index timestamp, used as the grouping keys below.
df['Year'] = [stamp.year for stamp in df.index]
df['Month'] = [stamp.month for stamp in df.index]
df['Day'] = [stamp.day for stamp in df.index]

# Group on a single column.
df_group_by_year = df.groupby('Year')
type(df_group_by_year)
len(df_group_by_year)
df_group_by_year.size().sort_values(ascending=False)

# Group on two columns at once.
df_group_by_year_month = df.groupby(['Year', 'Month'])
df_group_by_year_month.size().sort_values(ascending=False)

# Group with a function that is applied to every index label.
df_group_by_year = df.groupby(lambda stamp: stamp.year)
# To inspect the individual groups, iterate over the object: it yields
# (name, group) pairs that can be printed one by one.

# Group by the levels of a (Year, Month) MultiIndex.
df_index_year_month = df.reset_index().set_index(['Year', 'Month'])
df_group_by_year = df_index_year_month.groupby(level=['Year', 'Month'])
df_group_by_year.size()

## Using groupby with a MultiIndex

In [None]:
# Index the data by (Year, Month); reset_index() keeps 'Date' as a column.
df_index_year_month = df.reset_index()
df_index_year_month = df_index_year_month.set_index(['Year', 'Month'])

# Group on one level of the MultiIndex.
grouped = df_index_year_month.groupby(level='Month')
# numeric_only=True leaves the datetime 'Date' column out of the reductions;
# it cannot be summed.
grouped.mean(numeric_only=True)
# same as grouping by the level name (DataFrame.mean(level=...) was removed
# in pandas 2.0, so the level is passed to groupby instead)
df_index_year_month.groupby('Month').mean(numeric_only=True)

grouped.sum(numeric_only=True)

## Using the aggregate method

Another way to generate summary statistics is to use the `aggregate` method explicitly:

In [None]:
# Pass the reduction by name: handing NumPy callables such as np.sum to
# aggregate() is deprecated in recent pandas. numeric_only=True is forwarded
# to the 'sum' reduction so the datetime 'Date' column is left out.
grouped.aggregate('sum', numeric_only=True)

## Applying multiple functions

In [None]:
# Several reductions at once, restricted to the numeric columns because the
# datetime 'Date' column cannot be summed. String names replace the NumPy
# callables, which are deprecated as aggregate() arguments.
numeric_columns = list(df_index_year_month.select_dtypes(include='number').columns)
grouped[numeric_columns].agg(['sum', 'mean', 'size'])

# Named aggregation (output_name=function): renaming through a dict on a
# single column was removed in pandas 1.0.
grouped['Return'].agg(Size='size', Total='sum', Average='mean', Deviation='std', Max='max', Min='min')

## The transform() method

The `groupby-transform` function is used to perform a transformation operation on a `groupby` object. For example, we could replace NaN values in each group using the `fillna` method. The object that results from the transform has the same size as the original data that was grouped.

In [None]:
def _fill_with_group_mean(values):
    """Return `values` with missing entries replaced by the mean of `values`."""
    return values.fillna(values.mean())


# Applied group by group, so every gap is filled with the mean of its own group.
grouped.transform(_fill_with_group_mean)

## Filtering

The `filter` method applies a filtering function to a `groupby` object and returns a subset of the original data: only the groups for which the function returns `True` are kept.

In [None]:
# Keep only the groups in which every 'High', 'Close' and 'Low' value
# exceeds 1640.
grouped.filter(lambda group: np.all(group[['High', 'Close', 'Low']].values > 1640))

# Merging and joining

## Using concat

The `concat` function is used to join multiple pandas data structures along a specified axis and possibly perform union or intersection operations along other axes.

In [None]:
# Three single-column frames over overlapping date ranges
# (.loc replaces .ix, which was removed in pandas 1.0).
df1 = df.loc['2010-01-04': '2010-01-10', ['Open']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close']]
df3 = df.loc['2010-01-06': '2010-01-9', ['High']]

pd.concat([df1, df2, df3], axis=1) # outer join
pd.concat([df1, df2, df3], axis=1, join='inner') # inner join
# join_axes was removed in pandas 1.0; reindexing the outer-joined result on
# df2's index gives the same rows (this aligns to df2, it is not an inner join).
pd.concat([df1, df2, df3], axis=1).reindex(df2.index)

## Using append

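The `append` function is a simpler version of `concat` that concatenates along `axis=0`. (`DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; `pd.concat` is its replacement.)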

In [None]:
# .loc replaces .ix, which was removed in pandas 1.0.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close']]

# DataFrame.append (removed in pandas 2.0) is replaced by pd.concat along the
# rows, and reindex_axis (removed in pandas 1.0) by reindex(columns=...),
# which lays the result out with all of df's columns.
pd.concat([df1, df2]).reindex(columns=df.columns)

## Appending a single row to a DataFrame

A single row can be appended to a DataFrame by passing a Series or a dictionary to the `append` method (in pandas 2.0 and later, wrap the row in a one-row DataFrame and use `pd.concat` instead).

In [None]:
# .loc replaces .ix, which was removed in pandas 1.0.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open', 'Close']]

# DataFrame.append was removed in pandas 2.0: wrap the new row in a one-row
# frame and concatenate. ignore_index=True renumbers the rows 0..n-1, as it
# did with append.
new_row = pd.DataFrame([{'Open': 100, 'Close': 100}])
pd.concat([df1, new_row], ignore_index=True)

## SQL-like merging/joining of DataFrame objects

The `merge` function is used to obtain joins of two DataFrame objects similar to those used in SQL database queries. The DataFrame objects are analogous to SQL tables.

In [None]:
# Two frames that share the 'High' column
# (.loc replaces .ix, which was removed in pandas 1.0).
df1 = df.loc['2010-01-04': '2010-01-10', ['Open', 'High']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close', 'High']]

# With no key given, merge joins on the columns the two frames have in
# common -- here 'High'.
pd.merge(df1, df2, how='inner')
pd.merge(df1, df2, how='outer')
pd.merge(df1, df2, how='left')
# Join on the two indexes instead of on a column.
pd.merge(df1, df2, how='right', left_index=True, right_index=True)

### The join function

The `DataFrame.join` function is used to combine two DataFrames that have different columns with nothing in common. Essentially, this does a longitudinal join of two DataFrames.

In [None]:
# .loc replaces .ix, which was removed in pandas 1.0.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close']]

# join() aligns df2 to df1 on the index; by default it is a left join.
df1.join(df2)

# Pivots and reshaping data

## Stacking and unstacking

### The stack() function

## Other methods to reshape DataFrames

### Using the melt function

# Handling Missing data

In [None]:
# Convert to calendar-day frequency; days that are absent from df appear as
# all-NaN rows.
df1 = df.asfreq('D')

# fillna(method=...) is deprecated in recent pandas; ffill() and bfill() are
# the direct equivalents.
df1.ffill()        # carry the last valid observation forward
df1.bfill()        # pull the next valid observation backward
df1.dropna()       # drop the rows that contain missing values
df1.interpolate()  # fill the gaps by interpolation (linear by default)

# Handling time series

## Reading in time series data

### DateOffset and TimeDelta objects

A `DateOffset` object represents a change or offset in time. The key features of a `DateOffset` object are as follows:

* This can be added/subtracted to/from a `datetime` object to obtain a shifted date
* This can be multiplied by an integer (positive or negative) so that the increment can be applied multiple times
* This has the rollforward and rollback methods to move a date forward to the next offset date or backward to the previous offset date

In [None]:
from pandas.tseries.offsets import QuarterBegin

# pd.datetime was only an alias of the standard-library datetime class and was
# removed in pandas 2.0; the class is used directly through the `dt` module.
xmasDay = dt.datetime(2014, 12, 25)
# Adding a DateOffset to a datetime gives the shifted date.
boxingDay = xmasDay + pd.DateOffset(days=1)
today = dt.datetime.now()
today + pd.DateOffset(weeks=1)
# Multiplying by an integer applies the offset that many times.
today + 2 * pd.DateOffset(years=2, months=6)

lastDay = dt.datetime(2013, 12, 31)
dtoffset = QuarterBegin()
# Move to the next quarter start after lastDay.
lastDay + dtoffset
# rollforward moves a date that is not a quarter start to the next one.
dtoffset.rollforward(lastDay)

# A plain datetime.timedelta can be added to a datetime as well.
weekDelta = dt.timedelta(weeks=1)
today + weekDelta

## Time series-related instance methods

### Shifting/lagging

In [None]:
# Shift the values three rows; the index is left as it is.
df['Close'].shift(periods=3)
# With freq given, the index is shifted by three business days instead.
df['Close'].shift(periods=3, freq='B')

### Frequency conversion

In [None]:
# Convert to business-month-end frequency, forward-filling dates that have no
# observation. The offset object is passed instead of the 'BM' alias, which is
# deprecated in pandas 2.2 (its replacement 'BME' is unknown to older
# versions); BMonthEnd() is accepted by all of them.
df['Close'].asfreq(pd.offsets.BMonthEnd(), method='ffill')

### Resampling of data

### Aliases for Time Series frequencies

## Time series concepts and datatypes

### Period and PeriodIndex

### Conversion between Time Series datatypes