In [None]:
import numpy as np
import pandas as pd
import datetime as dt

# Render every float with two decimal places when frames are displayed.
pd.options.display.float_format='{:.2f}'.format

# The file has no header row (header=None), so the column names are supplied
# here; 'Date' becomes the index and is parsed as dates.
df = pd.read_csv('../data/000015', index_col= 'Date', names=['Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'Money', 'PE', 'PB'], parse_dates=True, header=None)
# Percentage change of 'Close' relative to the previous row (first row is NaN).
df['Return'] = df['Close'].pct_change() * 100
# Calendar components of each index entry, stored as ordinary columns.
df['Year'] = [i.year for i in df.index]
df['Month'] = [i.month for i in df.index]
df['Day'] = [i.day for i in df.index]

# Basic indexing

In [None]:
# Selecting with a list of column names yields a DataFrame.
df_prices = df[['Close', 'High', 'Low']]

# Selecting a single column yields a Series, which can be indexed by row label.
series_close = df['Close']
close_of_a_day = series_close['2010-01-04']
close_of_a_day

# A DataFrame row cannot be fetched the same way: plain [] looks the key up
# among the columns. The KeyError is caught so the cell keeps running.
try:
    df_prices['2010-01-04']
except KeyError:
    pass

## Accessing attributes using dot operator

In [None]:
# Attribute access is shorthand for df['Close'].
df.Close

# Select columns by position. Plain [] looks keys up among the column
# *labels* (all strings in this frame), so df[[1, 2]] raises KeyError;
# positional selection goes through .iloc.
df.iloc[:, [1, 2]]

## Range slicing

The syntax of the slicing operator exactly matches that of NumPy:

```python
ar[startIndex: endIndex: stepValue]
```

where the default values if not specified are as follows:

* 0 for startIndex
* arraysize for endIndex (the end index is exclusive)
* 1 for stepValue

# Label, integer and mixed indexing

* The `.loc` operator: Allows label-oriented indexing
* The `.iloc` operator: Allows integer-based indexing
* The `.ix` operator: Allows mixed label and integer-based indexing (deprecated in pandas 0.20 and removed in 1.0; use `.loc` or `.iloc` instead)


## Label-oriented indexing

The `.loc` operator supports pure label-based indexing. It accepts the following as valid inputs:

* A single label.
* List or array of labels.
* A slice object with labels.
* A Boolean array.

In [None]:
# A single row label returns that row as a Series.
df.loc['2010-01-04']

# The following three lookups address the same cell:
# (row, column) at once, row then column, and column then row.
df.loc['2010-01-04', 'Close']
df.loc['2010-01-04']['Close']
df['Close']['2010-01-04']

# A list of labels returns a DataFrame holding those rows.
df.loc[['2010-01-04', '2010-01-05']]
# A label slice; with .loc both endpoints are included.
df.loc['2010-01-04': '2010-02-05']

### Selection using a Boolean array

In [None]:
# Rows whose 'Close' equals the column minimum (<= min can only match the
# minimum itself); the ':' keeps every column.
df.loc[df['Close'] <= df['Close'].min(),:]

## Integer-oriented indexing

The `iloc` operator supports integer-based positional indexing. It accepts the following as inputs:

* A single integer.
* A list or array of integers.
* A slice object with integers.

In [None]:
# First ten rows by position; the trailing comma leaves the column axis unrestricted.
df.iloc[0:10,]

## Mixed indexing with the .ix operator

The `.ix` operator behaves like a mixture of the `.loc` and `.iloc` operators, with the `.loc` behavior taking precedence. It takes the following as possible inputs:

* A single label or integer
* A list of integers or labels
* An integer slice or label slice
* A Boolean array

In [None]:
# `.ix` was removed in pandas 1.0. Each lookup is spelled explicitly instead:
# .loc for labels and boolean masks, .iloc for integer positions.
df.loc['2010-01-04']
df.loc[['2010-01-04', '2010-01-05']]
df.loc[df.index[-3:]]
df.iloc[0]
df.iloc[[0, 2]]
df.iloc[1:3]
df.loc[df['Close'] > 4044.6640]

## MultiIndexing

In [None]:
# Build a three-level (Year, Month, Day) index from the calendar columns.
df1 = df.reset_index().set_index(['Year', 'Month', 'Day'])

# Values of each index level, outermost first.
df1.index.get_level_values(0)
df1.index.get_level_values(1)
df1.index.get_level_values(2)

# `.ix` was removed in pandas 1.0; MultiIndex labels go through .loc.
# NOTE(review): the label slices below need a lexsorted index, which holds
# if the source rows are in chronological order -- confirm against the data.
df1.loc[(2011, 2)]               # every row of February 2011
df1.loc[2011:2012]               # years 2011 through 2012, inclusive
df1.loc[(2011, 1):(2012, 2)]     # January 2011 through February 2012

## Swapping and reordering levels

In [None]:
# Exchange the two outermost index levels.
df_swapped = df1.swaplevel(0, 1, axis=0)
# sortlevel() and .ix were both removed from pandas; sort_index(level=0)
# sorts on the new outermost level so that the label slice is valid.
df_swapped.sort_index(level=0).loc[(1, 2010):(1, 2011)]

# The reorder_levels function is more general: it takes the full new order.

df_recorded = df1.reorder_levels(['Month', 'Day', 'Year'], axis=0)

## Cross sections

The `xs` method provides a shortcut means of selecting data based on a particular index level value.

In [None]:
# Every row whose 'Month' level equals 2; the selected level is dropped.
df1.xs(2, level='Month')

# same as bringing the level to the front and selecting it by label
# (`.ix` was removed in pandas 1.0, so the label lookup uses .loc)

df1.swaplevel(0, 1, axis=0).loc[2]

## Reindexing

In [None]:
# A float Series keyed by out-of-order letters.
obj = pd.Series({'d': 4.5, 'b': 7.2, 'a': -5.3, 'c': 3.6})
# Reindexing rearranges to the requested labels; the label that did not
# exist before ('e') receives the fill value.
new_labels = ['a', 'b', 'c', 'd', 'e']
obj.reindex(new_labels, fill_value=0)

# A Series with gaps in its integer labels.
obj3 = pd.Series({0: 'blue', 2: 'purple', 4: 'yellow'})
# Forward-fill: each newly introduced label takes the last preceding value.
obj3.reindex(range(6), method='ffill')

In [10]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])

states = ['Texas', 'Utah', 'California']
# method='ffill' applies to every axis being reindexed and requires that axis
# to be monotonic; the column labels are not, so rows and columns are
# reindexed in two steps and only the rows are forward-filled.
ffilled = frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill').reindex(columns=states)
ffilled

# `.ix` was removed in pandas 1.0 and .loc rejects missing labels, so a plain
# reindex is the way to relabel both axes; labels that did not exist
# ('b', 'Utah') come back as NaN.
relabelled = frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)
relabelled

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


# Boolean indexing

## isin and any all methods

In [None]:
# Rows whose index value is one of the listed dates.
df.loc[df.index.isin(['2010-01-04', '2010-01-05'])]
# Rows in which any column has integer part 2656. astype('int') raises on
# NaN (the first 'Return' is always NaN), so the values are truncated
# without casting; NaN simply compares unequal.
df.loc[(np.trunc(df) == 2656).any(axis=1)]

## using the where() method

The `where` method is used to ensure that the result of Boolean filtering is the same shape as the original data.

In [None]:
# Boolean row filter: only the rows satisfying the condition are returned.
df[df['Close'] > 2800]
# where() keeps the original shape and puts NaN wherever the condition is False.
df.where(df > 2800)

## Operations on indexes

In [None]:
# Move the 'Date' index back into an ordinary column.
df1 = df.reset_index()
# set_index returns a new frame; the result is only displayed here and df1
# itself is left unchanged.
df1.set_index('Date')

# Function application and mapping

In [None]:
# Range (max - min) of every column: by default apply passes one column at a time.
df.apply(lambda x: x.max() - x.min())
# The same computation per row: axis=1 passes one row at a time.
df.apply(lambda x: x.max() - x.min(), axis=1)

In [None]:
# Returning a Series from the function gives one output row per label
# ('min' and 'max') for every column.
df.apply(lambda x: pd.Series([x.min(), x.max()], index=['min', 'max']))

In [None]:
# Element-wise apply: format every cell as a string with two decimals.
# NOTE(review): pandas 2.1 deprecates applymap in favour of DataFrame.map.
df.applymap(lambda x: '%.2f' % x)

The reason for the name `applymap` is that Series has a `map` method for applying an element-wise function.

In [None]:
# Series.map applies the function to each value of the column.
df['Close'].map(lambda x: '%.2f' % x)

# Sorting and ranking

In [None]:
# Sort the columns (axis=1) by name, in descending order.
df.sort_index(axis=1, ascending=False)

# Sort one column's values in ascending order.
df['Close'].sort_values()

# Sort rows by 'Year', breaking ties by 'Month'.
df.sort_values(by=['Year', 'Month'])

# Rank the values; method='first' breaks ties by order of appearance.
df['Close'].rank(method='first')

# Grouping of data

## The groupby operation

The `groupby` operation can be thought of as part of a process that involves the following three steps:

* Splitting the dataset
* Analyzing the data
* Aggregating or combining the data

The result of a `groupby` operation is not a DataFrame but a `GroupBy` object (`DataFrameGroupBy`), which behaves like a `dict` mapping each group key to a DataFrame.

In [None]:
# Recompute the calendar columns from the index so this cell can run on its own.
df['Year'] = [i.year for i in df.index]
df['Month'] = [i.month for i in df.index]
df['Day'] = [i.day for i in df.index]

# Group by a single column and inspect the result.
df_group_by_year = df.groupby('Year')
type(df_group_by_year)
len(df_group_by_year)  # number of groups
# Rows per group, largest group first.
df_group_by_year.size().sort_values(ascending=False)

# Group by several columns: one group per (Year, Month) pair.
df_group_by_year_month = df.groupby(['Year', 'Month'])
df_group_by_year_month.size().sort_values(ascending=False)

# Group by a function: it is called on each index value and its return
# value is used as the group key.
df_group_by_year = df.groupby(lambda x: x.year)
# Iterating a GroupBy yields (name, group) pairs:
#for name, group in df_group_by_year:
#    print(name)
#    print(group)

# Group by index levels instead of columns.
df_index_year_month = df.reset_index()
df_index_year_month = df_index_year_month.set_index(['Year', 'Month'])
df_group_by_year = df_index_year_month.groupby(level=['Year', 'Month'])
df_group_by_year.size()

## Using groupby with a MultiIndex

In [None]:
# Index the frame by (Year, Month); 'Date' becomes an ordinary column.
df_index_year_month = df.reset_index().set_index(['Year', 'Month'])

# Group on one level of the MultiIndex.
grouped = df_index_year_month.groupby(level='Month')
# numeric_only=True leaves out the datetime 'Date' column, which cannot be
# summed and is not meaningful to average per calendar month.
grouped.mean(numeric_only=True)
# The old shorthand df.mean(level='Month') was removed in pandas 2.0; the
# explicit groupby spelling is the only one that remains.
df_index_year_month.groupby(level='Month').mean(numeric_only=True)

grouped.sum(numeric_only=True)

## Using the aggregate method

Another way to generate summary statistics by using the aggregate method explicitly:

In [None]:
# Aggregate by function name: passing np.sum is deprecated in favour of the
# string alias. numeric_only=True is forwarded to sum() so the datetime
# 'Date' column (which cannot be summed) is left out.
grouped.aggregate('sum', numeric_only=True)

## Applying multiple functions

In [None]:
# Several aggregations at once produce one sub-column per function.
# Restrict to the numeric columns: 'sum' is undefined for the datetime
# 'Date' column and would make the whole call fail.
numeric_columns = df_index_year_month.select_dtypes(include='number').columns.tolist()
grouped[numeric_columns].agg(['sum', 'mean', 'size'])

# Renaming outputs through a dict was removed from SeriesGroupBy.agg
# (it raises SpecificationError); named aggregation gives the same labels.
grouped['Return'].agg(Size='size', Total='sum', Average='mean',
                      Deviation='std', Max='max', Min='min')

## The transform() method

The `groupby-transform` function is used to perform a transformation operation on a `groupby` object. For example, we could replace NaN values in the `groupby` object using the `fillna` method. The resulting object after using the transform has the same size as the original `groupby` object.

In [None]:
# Within each group, replace a column's missing values with that group's
# mean for the column; the result has one row per original row.
grouped.transform(lambda x: x.fillna(x.mean()))

## Filtering

The `filter` method applies a filter to a `groupby` object, keeping only the groups that satisfy a condition, so the result is a subset of the initial object.

In [None]:
# Keep only the groups in which every High, Close and Low value exceeds 1640.
grouped.filter(lambda g: (g[['High', 'Close', 'Low']] > 1640).values.all())

## Quantile and bucket analysis

In [None]:
# The last 240 closing prices, taken by position.
closes = df['Close'].iloc[-240:]
# Assign each close to one of ten equal-width bins.
factor = pd.cut(closes, bins=10)

def get_stats(group):
    """Return the min, max, count and mean of *group* as a dict."""
    # Each key doubles as the name of the method that computes its value.
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

# Group the closes by the bin each one falls into.
grouped = closes.groupby(factor)
# get_stats returns a dict per group; unstack() then spreads the dict keys
# into columns, giving one row per bin.
grouped.apply(get_stats).unstack()

# Merging and joining

## Using concat

The `concat` function is used to join multiple pandas data structures along a specified axis and possibly perform union or intersection operations along other axes.

In [None]:
# `.ix` was removed in pandas 1.0; label slices go through .loc.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close']]
df3 = df.loc['2010-01-06': '2010-01-9', ['High']]

pd.concat([df1, df2, df3], axis=1) # outer join
pd.concat([df1, df2, df3], axis=1, join='inner') # inner join
# join_axes was removed in pandas 1.0; reindexing the outer result keeps
# exactly df2's rows, i.e. a join driven by df2's index.
pd.concat([df1, df2, df3], axis=1).reindex(df2.index)

## Using append

The `append` method is a simpler version of `concat` that concatenates along `axis=0`. (`DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement.)

In [None]:
# `.ix` was removed in pandas 1.0; label slices go through .loc.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close']]

# DataFrame.append (removed in pandas 2.0) and reindex_axis (removed in 1.0)
# are replaced by pd.concat along the rows and reindex(columns=...), which
# lays the stacked rows out over the full column set of df.
pd.concat([df1, df2]).reindex(columns=df.columns)

## Appending a single row to a DataFrame

A single row can be appended to a DataFrame by passing a Series or dictionary to the `append` method.

In [None]:
# `.ix` was removed in pandas 1.0; label slices go through .loc.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open', 'Close']]

# DataFrame.append was removed in pandas 2.0: wrap the new row in a one-row
# frame and concatenate. ignore_index=True renumbers the rows from 0.
pd.concat([df1, pd.DataFrame([{'Open': 100, 'Close': 100}])], ignore_index=True)

## Combining data with overlap

With DataFrames, `combine_first` can be thought of as 'patching' missing data in the calling object with data from the object passed.

In [None]:
# Frame with holes in 'a' and 'b', plus a column the other frame lacks.
df1 = pd.DataFrame({
    'a': [1., np.nan, 5., np.nan],
    'b': [np.nan, 2., np.nan, 6.],
    'c': range(2, 18, 4),
})
# One row longer, and able to fill some of the holes above.
df2 = pd.DataFrame({
    'a': [5., 4., np.nan, 3., 7.],
    'b': [np.nan, 3., 4., 6., 8.],
})

# Values of df1 win; df2 supplies data only where df1 has none.
df1.combine_first(df2)

## SQL-like merging/joining of DataFrame objects

The `merge` function is used to obtain joins of two DataFrame objects similar to those used in SQL database queries. The DataFrame objects are analogous to SQL tables.

In [None]:
# `.ix` was removed in pandas 1.0; label slices go through .loc.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open', 'High']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close', 'High']]

# Without `on`, merge joins on the columns the frames share ('High' here).
pd.merge(df1, df2, how='inner')
pd.merge(df1, df2, how='outer')
pd.merge(df1, df2, how='left')
# Join on the indexes instead of on the shared column.
pd.merge(df1, df2, how='right', left_index=True, right_index=True)

### The join function

The `DataFrame.join` function is used to combine two DataFrames that have different columns with nothing in common. Essentially, this does a longitudinal join of two DataFrames.

In [None]:
# `.ix` was removed in pandas 1.0; label slices go through .loc.
df1 = df.loc['2010-01-04': '2010-01-10', ['Open']]
df2 = df.loc['2010-01-05': '2010-01-12', ['Close']]

# join aligns on the index and, by default, keeps the calling frame's rows.
df1.join(df2)

# Pivots and reshaping data

## Stacking and unstacking

### The stack() function

In [None]:
# stack() moves the column labels into an inner index level, producing a
# Series; dropna=False keeps the entries that are NaN.
result = df.stack(dropna=False)
# NOTE(review): this unstacks `df`, not the stacked `result` built above --
# confirm whether `result.unstack()` was intended.
df.unstack('Date') # or df.unstack(0)

## Other methods to reshape DataFrames

### Using the melt function

# Handling Missing data

| Argument            | Description                                                                                                                                                                                                 |
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dropna`            | Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.                                                                 |
| `fillna`            | Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.                                                                                                           |
| `isnull`            | Return like-type object containing boolean values indicating which values are missing / NA.                                                                                                                 |
| `notnull`           | Negation of `isnull`.                                                                                                                                                                                       |

## Filtering out missing data

In [None]:
# Convert to calendar-day frequency: days absent from df appear as
# all-NaN rows.
df1 = df.asfreq('D')

# Drop every row that contains a missing value.
df1.dropna()
# Mask df1 with its *own* null map. The original used df.notnull(), whose
# index lacks the days asfreq introduced, so the mask did not describe df1.
df1[df1.notnull()]

## Filling in missing data

In [None]:
# Replace every missing value with 0, modifying df itself.
df.fillna(0, inplace=True)
# Per-column fill values. NOTE: after the in-place fill above df has no
# missing values left, so this call only demonstrates the syntax.
df.fillna({'PE': -100, 'PB': 1000})

# fillna(method=...) is deprecated; ffill()/bfill() are the direct forms.
# Forward-fill at most two consecutive gaps, then back-fill without limit.
df1.ffill(limit=2)
df1.bfill()

# Data transformation

## Removing duplicates

In [None]:
# Seven rows in which some (k1, k2) pairs repeat.
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

# True for each row that repeats an earlier row.
data.duplicated()
# Keep the first occurrence of every distinct row.
data.drop_duplicates()
# Judge duplication on 'k1' only.
data.drop_duplicates(subset=['k1'])
# Judge on both columns, keeping the last occurrence instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

## Transforming data using a function or mapping

In [None]:
# Foods (in inconsistent capitalisation) and their weights.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

# Lower-case food name -> animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

# Normalise the case first, then translate through the mapping.
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)

# or do both steps inside a single function
data['food'].apply(lambda name: meat_to_animal[name.lower()])

## Replacing values

In [None]:
# -999 and -1000 act as sentinel codes for missing data.
data = pd.Series([1., -999., 2., -999., -1000., 3.])

# One sentinel -> NaN.
data.replace(to_replace=-999, value=np.nan)
# Several sentinels -> the same replacement.
data.replace(to_replace=[-999, -1000], value=np.nan)
# Several sentinels -> a replacement each, matched by position.
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])
# The same pairing written as a mapping.
data.replace(to_replace={-999: np.nan, -1000: 0})

## Renaming axis indexes

In [None]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

# Index.map returns a new index, so the result has to be assigned back.
# The original discarded it, which left the labels as 'Ohio', ... and made
# the 'OHIO' renames below silent no-ops.
data.index = data.index.map(str.upper)
# rename returns a transformed copy and leaves `data` untouched.
data.rename(index=str.title, columns=str.upper)
# A dict renames only the labels it mentions.
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})
# inplace=True modifies `data` itself and returns None.
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

## Discretization and binning

In [None]:
# Ten equal-width bins over the range of 'Close'; bin edges are shown with
# two digits of precision.
cats = pd.cut(df['Close'], 10, precision=2)
# Rows per bin. The top-level pd.value_counts is deprecated; the Series
# method returns the same counts.
cats.value_counts()

In [None]:
# Ten quantile bins: the edges are chosen so that each bin holds roughly
# the same number of rows.
cats = pd.qcut(df['Close'], 10, precision=2)
# Rows per bin. The top-level pd.value_counts is deprecated; the Series
# method returns the same counts.
cats.value_counts()

## Detecting and filtering outliers

In [None]:
# Fixed seed so the sample, and therefore its outliers, is reproducible.
np.random.seed(12345)

data = pd.DataFrame(np.random.randn(1000, 4))
col = data[3]

# Values of one column lying more than 3 away from zero.
col[np.abs(col) > 3]
# Rows with such a value in any column. The axis must be passed by keyword:
# pandas 2 rejects the positional form any(1).
data[(np.abs(data) > 3).any(axis=1)]
# Cap the outliers at -3 / +3, keeping their sign.
data[np.abs(data) > 3] = np.sign(data) * 3

## Permutation and random sampling

In [None]:
# A 5x4 frame holding 0..19.
data = pd.DataFrame(np.arange(20).reshape((5, 4)))
# Random ordering of the five row positions.
sampler = np.random.permutation(5)
# Rows rearranged into that order.
data.iloc[sampler]
# Three rows drawn without replacement: the head of a fresh permutation.
shuffled_positions = np.random.permutation(len(data))
data.iloc[shuffled_positions[:3]]

# Ten draws *with* replacement: sample random positions, then look them up.
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
draws = bag[sampler]

## Computing indicator/dummy variables

# Handling time series

## Reading in time series data

### DateOffset and TimeDelta objects

A `DateOffset` object represents a change or offset in time. The key features of a `DateOffset` object are as follows:

* This can be added/subtracted to/from a `datetime` object to obtain a shifted date
* This can be multiplied by an integer (positive or negative) so that the increment can be applied multiple times
* This has the rollforward and rollback methods to move a date forward to the next offset date or backward to the previous offset date

In [None]:
# The pd.datetime alias was removed from pandas; the standard-library
# class (module imported as `dt` at the top of the file) is used directly.
xmasDay = dt.datetime(2014, 12, 25)
# Adding a DateOffset shifts the date by that amount.
boxingDay = xmasDay + pd.DateOffset(days=1)
today = dt.datetime.now()
today + pd.DateOffset(weeks=1)
# Multiplying an offset applies it that many times (here 2 x 2.5 years).
today + 2 * pd.DateOffset(years=2, months=6)

lastDay = dt.datetime(2013, 12, 31)
from pandas.tseries.offsets import QuarterBegin
dtoffset = QuarterBegin()
# Adding the offset moves to the next quarter start ...
lastDay + dtoffset
# ... and rollforward does the same unless the date already is one.
dtoffset.rollforward(lastDay)

# A plain standard-library timedelta works for fixed durations.
weekDelta = dt.timedelta(weeks=1)
today + weekDelta

## Time series-related instance methods

### Shifting/lagging

In [None]:
# Shift the values down by three rows; the index is unchanged and the
# first three entries become NaN.
df['Close'].shift(3)
# With freq, the index is moved instead (by three business days) and the
# values keep their order.
df['Close'].shift(3, freq='B')

### Frequency conversion

### Resampling of data

### Aliases for Time Series frequencies

To specify offsets, a number of aliases are available; some of the most commonly used ones are as follows:

* B, BM: This stands for business day, business month. These are the working days of the month, that is, any day that is not a holiday or a weekend.
* D, W, M, Q, A: It stands for calendar day, week, month, quarter, year-end.
* H, T, S, L, U: It stands for hour, minute, second, millisecond, and microsecond.

Suffixes can be applied to the frequency aliases to specify when in a frequency period to start. These are known as anchoring offsets:

* W-Sun, MON,...
* Q-JAN, FEB, ... DEC
* A-JAN, FEB, ... DEC

These offsets can be used as arguments to the `date_range` and `bdate_range` functions as well as constructors for index types such as `PeriodIndex` and `DatetimeIndex`

## Time series concepts and datatypes

When dealing with time series, there are two main concepts: points in time and range, or time spans. In pandas, the former is represented by the Timestamp datatype, which is equivalent to Python's `datetime`. `datetime` datatype is interchangeable with it. The latter (time span) is represented by the Period datatype, which is specific to pandas.

Each of these datatypes has index datatypes associated with them: `DatetimeIndex` for `Timestamp/Datetime` and `PeriodIndex` for `Period`. These index datatypes are basically subtypes of `numpy.ndarray` that contain the corresponding Timestamp and Period datatypes and can be used as indexes for Series and DataFrame objects.

### Period and PeriodIndex

In [None]:
# An annual period whose year ends in May.
pd.Period('2014', freq='A-May')
# No freq given: it is inferred from the string (a day here).
pd.Period('2014-6-11')
# An hourly period.
pd.Period('2014-6-11 11:00', freq='H')

# Integer arithmetic moves a period by that many units of its own frequency.
pd.Period('2014-6-11') + 4
pd.Period('2014-6-11 11:00', freq='H') - 48

# Difference between two periods of the same frequency.
pd.Period('2014-04', freq='M')-pd.Period('2013-02', freq='M')

#### PeriodIndex

A `PeriodIndex` object, which is an index type for a `Period` object, can be created in two ways:

* From series of `Period` objects using the `period_range` function, an analogue of `date_range`:

In [None]:
# One daily period for each day from 1 to 6 February 2014.
perRng = pd.period_range('02/01/2014','02/06/2014',freq='D')
# The first two periods of the range.
perRng[:2]

* It can also be done via a direct call to the `PeriodIndex` constructor:

In [None]:
# A PeriodIndex built directly from date strings, at daily frequency.
JulyPeriod=pd.PeriodIndex(['07/01/2014','07/31/2014'], freq='D')

### Conversion between Time Series datatypes

In [None]:
# Parse a string into a Timestamp; errors='raise' makes unparseable input
# raise instead of being coerced or passed through.
worldCupFinal = pd.to_datetime('07/13/2014', errors='raise')
# Timestamp -> Period: the day that contains the timestamp.
worldCupFinal.to_period('D')

# A daily Period created directly.
worldCupKickoff = pd.Period('06/12/2014','D')
# 32 consecutive days starting on the kickoff date ...
worldCupDays = pd.date_range('06/12/2014',periods=32, freq='D')
# ... converted from a DatetimeIndex to a PeriodIndex.
worldCupDays.to_period()

## A summary of time series-related objects

| Object              | Summary                                                                                                                                                                                                     |
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `datetime.datetime` | Standard Python `datetime` class                                                                                                                                                                            |
| `Timestamp`         | A pandas class derived from `datetime.datetime`                                                                                                                                                             |
| `DatetimeIndex`     | A pandas class and implemented as an immutable `numpy.ndarray` of the `Timestamp/datetime` objects                                                                                                          |
| `Period`            | A pandas class representing a time period                                                                                                                                                                   |
| `timedelta`         | A pandas class expressing the difference between two `datetime.datetime` instances. It is implemented as `datetime.timedelta`                                                                               |
| `relativedelta`     | Implemented as `dateutil.relativedelta`. dateutil is an extension to the standard Python datetime module. It provides extra functionality such as timedeltas that are expressed in units larger than 1 day. |
| `DateOffset`        | A pandas class representing a regular frequency increment. It has similar functionality to `dateutil.relativedelta`.                                                                                        |
