In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole tree under /kaggle/input and print the full path of each file.
# os.walk yields (dirpath, dirnames, filenames) per directory; the list of
# subdirectory names is not needed here, hence the throwaway `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory and file name so the printed path is complete.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Function Application and Mapping

NumPy ufuncs (element-wise array methods) also work with pandas objects:

In [None]:
# 4x3 table of draws from np.random.randn: one row per state, columns 'b', 'd', 'e'.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [None]:
frame

In [None]:
np.abs(frame)

Another frequent operation is applying a function on one-dimensional arrays to each column or row.  
DataFrame’s apply method does exactly this:

In [None]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in ``x``.

    ``x`` must provide ``max()`` and ``min()`` methods, as a pandas Series
    does. Written as a ``def`` rather than a lambda bound to a name so the
    function carries a real ``__name__`` and a docstring.
    """
    return x.max() - x.min()
frame.apply(f)

Here the function f, which computes the difference between the maximum and minimum of a Series,  
is invoked once on each column in frame. The result is a Series having the columns of frame as its index.

If you pass axis='columns' to apply, the function will be invoked once per row instead:

In [None]:
frame.apply(f, axis='columns')

The function passed to apply need not return a scalar value; it can also return a Series with multiple values:

In [None]:
frame

In [None]:
def f(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'."""
    extremes = {'min': x.min(), 'max': x.max()}
    return pd.Series(extremes)
frame.apply(f, axis='index')

Element-wise Python functions can be used, too.  
Suppose you wanted to compute a formatted string from each floating-point value in frame.  
You can do this with applymap (renamed DataFrame.map in pandas 2.1):

In [None]:
def format_float(x):
    """Render the number ``x`` as a string with two decimal places."""
    return f'{x:.2f}'


# The helper is not called `format`, so the builtin of that name stays usable.
# DataFrame.applymap was renamed DataFrame.map in pandas 2.1 and the old name
# is deprecated, so use `map` when the installed pandas provides it.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
elementwise(format_float)

# Sorting and Ranking

Sorting a dataset by some criterion is another important built-in operation.  
To sort lexicographically by row or column index, use the sort_index method,  
which returns a new, sorted object:

## Sorting

In [None]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [None]:
obj.sort_index()

With a DataFrame, you can sort by index on either axis:

In [None]:
# 2x4 grid holding the integers 0..7; both the row labels and the column
# labels are out of alphabetical order.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)

In [None]:
frame.sort_index()

In [None]:
frame.sort_index(axis='columns')

The data is sorted in ascending order by default, but can be sorted in descending order, too:

In [None]:
frame.sort_index(axis='columns', ascending=False)

To sort a Series by its values, use its sort_values method:

In [None]:
obj.sort_values()

Any missing values are sorted to the end of the Series by default:

In [None]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

When sorting a DataFrame, you can use the data in one or more columns as the sort keys.  
To do so, pass one or more column names to the by option of sort_values:

In [None]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [None]:
frame

In [None]:
frame.sort_values('b')

In [None]:
frame.sort_values('a')

To sort by multiple columns, pass a list of names:

In [None]:
frame.sort_values(['a', 'b'])

Ranking assigns ranks from one through the number of valid data points in an array.  
The rank methods for Series and DataFrame are the place to look;  
by default rank breaks ties by assigning each group the mean rank:

In [None]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [None]:
obj

In [None]:
obj.rank()

## Ranking

Ranking assigns ranks from one through the number of valid data points in an array.

In [None]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [None]:
obj.rank()

In [None]:
# Ranks can also be assigned according to the order in which they’re observed in the data:
obj.rank(method='first')

In [None]:
# You can rank in descending order, too:
obj.rank(ascending=False, method='first'), obj

In [None]:
# Three-column frame ('b', 'a', 'c') used to demonstrate DataFrame.rank
frame = pd.DataFrame(dict(
    b=[4.3, 7, -3, 2],
    a=[0, 1, 0, 1],
    c=[-2, 5, 8, -2.5],
))

In [None]:
frame

In [None]:
frame.rank()

In [None]:
frame.rank(axis='columns', method='first')

# Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [None]:
obj

The index’s is_unique property can tell you whether its labels are unique or not:

In [None]:
obj.index.is_unique

Data selection is one of the main things that behaves differently with duplicates.  
Indexing a label with multiple entries returns a Series, while single entries return a scalar value:

In [None]:
obj['a']

In [None]:
obj['c']

In [None]:
# Duplicates in a DataFrame
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [None]:
df

In [None]:
df.loc['b']