## Pandas -- Series and Data Frames

In [1]:
import pandas as pd
import numpy  as np

### Series
- a one-dimensional array-like object containing a sequence of values
- has an associated array of data labels, called its index

In [2]:
# Fix the legacy global seed so the six scores below are reproducible.
np.random.seed(123)
scores = np.random.randint(low=60, high=90, size=6)  # integers in [60, 90)

# With no explicit index the Series is labelled 0..5.
a = pd.Series(data=scores)
a

0    73
1    62
2    88
3    62
4    66
5    77
dtype: int64

In [3]:
a.values

array([73, 62, 88, 62, 66, 77])

In [4]:
a.index

RangeIndex(start=0, stop=6, step=1)

In [5]:
a[1]

62

In [6]:
a[[1, 4]]  # extract multiple scores

1    62
4    66
dtype: int64

In [None]:
a[::-2]

In [None]:
b = pd.Series(scores, index = ['Alice', 'Bob', 'Charlie', 'Dave', 'Ed', 'Fred'])
b

In [None]:
b['Bob']

In [None]:
b[['Bob', 'Ed']]

In [None]:
b[::-2]

In [None]:
b[b > 70]

In [None]:
b + 10

In [None]:
b

In [None]:
np.cumsum(b)

In [None]:
np.average(b)

In [None]:
b.describe()

In [None]:
'Charlie' in b

In [None]:
'Robert' in b

In [None]:
b.index.name = 'FirstName'
b

#### Series from dictionary data

In [None]:
# The dictionary keys become the index labels, in insertion order.
c = pd.Series(data=dict(R=60, Python=75, Java=50))
c

In [None]:
# Build the Series from the dictionary, then align it to the requested labels:
# 'C++' has no entry in the dictionary, so its value becomes NaN.
d = pd.Series(dict(R=60, Python=75, Java=50)).reindex(['Java', 'Python', 'R', 'C++'])
d

In [None]:
pd.isnull(d)

In [None]:
pd.notnull(d)

In [None]:
c + d

In [None]:
(c + d).dropna()

### DataFrame
- represents a rectangular table of data 
- contains an ordered collection of columns 
- each column can be a different value type
- has both a row and column index

In [None]:
# Column-oriented input: every key is a column name, every list a column.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

# Columns follow the dictionary's key order; rows get a default 0..5 index.
df1 = pd.DataFrame(data=data)
df1

In [None]:
df1 = pd.DataFrame(data, columns = ['year', 'state', 'pop'])
df1

In [None]:
df1.head()

In [None]:
df1.tail(n=3)

In [None]:
df2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'])
df2

In [None]:
df2.columns

In [None]:
df2.index

#### Retrieve columns

In [None]:
df2['year']

In [None]:
df2.year

In [None]:
df2[['year', 'state']]

#### Retrieve rows

In [None]:
df2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'])
df2

In [None]:
df2.iloc[2]

In [None]:
type(df2.iloc[2])

In [None]:
df2.iloc[[2]]

In [None]:
type(df2.iloc[[2]])

In [None]:
df2.iloc[[2,5]]

In [None]:
df2.index = ['one', 'two', 'three', 'four', 'five', 'six']
df2

In [None]:
df2.loc['two']

In [None]:
df2.loc[['two','five']]

In [None]:
df2['debt'] = 20.5
df2

In [None]:
df2['debt'] = np.arange(df2.shape[0])
df2

In [None]:
# adding a column

df2['east'] = df2.state == 'Ohio'
df2

In [None]:
# deleting a column

del df2['east']
df2

In [None]:
df2.T

In [None]:
# Nested dictionaries: outer keys become columns, inner keys become row labels.
# A year missing for one state shows up as NaN in that state's column.

pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

df3 = pd.DataFrame.from_dict(pop)
df3

#### Reindexing

In [None]:
df1 = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
df1

In [None]:
df2 = df1.reindex(['a', 'b', 'c', 'd', 'e'])
df2

In [None]:
df3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
df3

In [None]:
df3.reindex(np.arange(6))

In [None]:
# forward fill missing values

df3.reindex(np.arange(6), method='ffill')

In [None]:
# backward fill missing values

df3.reindex(np.arange(6), method='bfill')

In [None]:
# 3x3 frame of 0..8; the row labels deliberately skip 'b' so that the
# reindexing cells that follow have a gap to fill.
df4 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
df4

In [None]:
df4.reindex(['a', 'b', 'c', 'd'])

In [None]:
# for reindexing columns

df4.reindex(columns = ['Utah', 'Ohio', 'Texas'])

#### Dropping entries from an Axis

In [None]:
# For Series

df1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
df1

In [None]:
df1.drop('b')

In [None]:
df1.drop(['a', 'c'])

In [None]:
# DataFrame example: a 4x4 grid of 0..15 with state names as row labels.

df2 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df2


In [None]:
# Default axis is rows (0)
df2.drop('Ohio')

In [None]:
df2.drop(['Colorado', 'Ohio'])

In [None]:
# For dropping columns: pass axis='columns' (equivalently axis=1)

df2.drop('two', axis='columns')

In [None]:
df2.drop(['two', 'four'], axis=1)

In [None]:
# inplace=True modifies df2 itself rather than returning a new frame.
# NOTE: this makes the cell non-idempotent: once the columns are gone,
# re-running it fails because 'two' and 'four' no longer exist in df2.
df2.drop(['two', 'four'], axis=1, inplace = True)
df2

### Indexing, Selection, and Filtering

In [None]:
df1 = pd.Series(np.arange(10,14), index=['a', 'b', 'c', 'd'])
df1

In [None]:
df1['c']

In [None]:
# Positional access: the index holds string labels, so an integer passed to
# plain [] relies on a deprecated positional fallback. .iloc is explicit.
df1.iloc[2]

In [None]:
# positional slice (end-point excluded): elements at positions 1 and 2
df1.iloc[1:3]

In [None]:
# inclusive end-point

df1['b':'d']

In [None]:
# Select by position, in the order given. Integer lists passed to plain []
# on a string-labelled Series rely on a deprecated fallback; use .iloc.
df1.iloc[[3, 1]]

In [None]:
df1[['d', 'b']]

In [None]:
df1[df1 < 12]

In [None]:
df1['b':'d'] = 50
df1

In [None]:
# DataFrame example: rebuild the 4x4 grid of 0..15 labelled by state.

df2 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df2

In [None]:
df2['two']

In [None]:
df2[['two', 'one']]

In [None]:
# Special case: a slice inside [] selects ROWS (here the first two),
# whereas a label or list of labels inside [] selects columns.

df2[:2]

In [None]:
df2['three'] < 10

In [None]:
df2[df2['three'] < 10]

In [None]:
df2

In [None]:
df2[df2 < 10] = -1
df2

#### Selecting with loc and iloc
- special indexing operators for selecting rows (and, optionally, columns) of a DataFrame
- loc (using axis labels)
- iloc (using integer positions)

In [None]:
# Fresh copy of the 4x4 example frame (the previous section overwrote values).
df2 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df2

In [None]:
df2.loc['Colorado']

In [None]:
df2.loc['Colorado', ['two', 'four']]

In [None]:
df2.iloc[1]

In [None]:
df2.iloc[1, [1, 3]]

In [None]:
df2.iloc[[1, 2]]

In [None]:
df2.iloc[[1, 2], [1, 3]]

In [None]:
df2.loc[:'Utah']

In [None]:
df2.loc[:'Utah', ['two', 'three']]

In [None]:
df2.iloc[:, :3]

In [None]:
# rows where 'three' exceeds 5, restricted to the first three columns,
# expressed as one .loc selection instead of two chained indexing steps
df2.loc[df2.three > 5, df2.columns[:3]]

### Function application and mapping

In [None]:
df1 = pd.DataFrame(np.random.randn(4, 3), columns=list('abc'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

In [None]:
np.abs(df1)

In [None]:
# apply a function on 1-D arrays to each column or row

In [None]:
# default axis = 'rows'

df1.apply(lambda x: x.max() - x.min())

In [None]:
# invoke once per row

df1.apply(lambda x: x.max() - x.min(), axis = 'columns')

In [None]:
df1

In [None]:
# function returning multiple values

df1.apply(lambda x: pd.Series([x.min(), x.max()], index = ['min', 'max']))

In [None]:
df1.apply(lambda x: pd.Series([x.min(), x.max()], index = ['min', 'max']), 
          axis='columns')

### Sorting
 - sort lexicographically by row or column index (`sort_index`), or by the data itself (`sort_values`)

In [None]:
# Series

df1 = pd.Series(np.arange(10,14), index=['d', 'a', 'b', 'c'])
df1

In [None]:
df2 = df1.sort_index()
df2

In [None]:
df2.sort_values()

In [None]:
# DataFrame

df1 = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
df1

In [None]:
df1.sort_index()

In [None]:
df1.sort_index(axis=1)

In [None]:
df1

In [None]:
df1.sort_values(by ='b', ascending = False)

In [None]:
df1.sort_values(by ='one', axis = 1, ascending = False)

### Axis indices with duplicate labels

In [None]:
# Series

df1 = pd.Series(np.arange(10,15), index=['a', 'a', 'b', 'b', 'c'])
df1

In [None]:
df1['b']

In [None]:
df1.index.is_unique

In [None]:
# DataFrame

df2 = pd.DataFrame(np.random.randint(60, 90, (4, 3)), index=['a', 'a', 'b', 'b'])
df2

In [None]:
df2.loc['b']

### Descriptive Statistics

In [None]:
# Small frame with missing values; row 'c' is entirely NaN and row 'a'
# is missing its 'two' entry.
df1 = pd.DataFrame(
    {
        'one': [1.5, 7.5, np.nan, 1.0],
        'two': [np.nan, -5.5, np.nan, -4.5],
    },
    index=list('abcd'),
)
df1

In [None]:
df1.sum()

In [None]:
df1.sum(axis=0)

In [None]:
df1.sum(axis='rows')

In [None]:
df1.sum(axis=1)

In [None]:
df1.sum(axis='columns')

In [None]:
df1

#### idxmax, idxmin 
- index labels of maximum and minimum values

#### argmax, argmin  (Series)
 - integer positions of the maximum and minimum values of a Series

In [None]:
print(df1)

df1.idxmax()

In [None]:
# Column label of the largest value in each row.
# Row 'c' contains only NaN and therefore has no maximum: pandas 2.1+ emits a
# FutureWarning for such rows and later releases raise, so rows that are
# entirely missing are dropped before the lookup.
df1.dropna(how='all').idxmax(axis='columns')

#### accumulations
 - cumsum, cumprod, cummin, cummax

In [None]:
print(df1)

df1.cumsum()

In [None]:
df1.describe()

In [None]:
df1

In [None]:
# Re-seed so the integer scores used by diff/pct_change/cov/corr are reproducible.
np.random.seed(123)
df2 = pd.DataFrame(
    data=np.random.randint(low=60, high=90, size=(4, 3)),  # integers in [60, 90)
    index=list('abcd'),
    columns=['one', 'two', 'three'],
)
df2

In [None]:
df2.diff()

In [None]:
df2.diff(axis='columns')

In [None]:
df2

In [None]:
df2.pct_change()

In [None]:
df2['one'].cov(df2['two'])

In [None]:
df2['one'].corr(df2['two'])

In [None]:
df2.cov()

In [None]:
df2.corr()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Label the axes and add a title so the figure can be read on its own.
plt.scatter(df2['one'], df2['two'])
plt.xlabel('one')
plt.ylabel('two')
plt.title('one vs. two');

In [None]:
# Label the axes and add a title so the figure can be read on its own.
plt.scatter(df2['two'], df2['three'])
plt.xlabel('two')
plt.ylabel('three')
plt.title('two vs. three');

### Unique values and value counts

In [None]:
# Ten reproducible scores drawn from a narrow range ([60, 70)) so that
# repeated values are guaranteed to occur.
np.random.seed(123)
scores = np.random.randint(low=60, high=70, size=10)

a = pd.Series(data=scores)
a

In [None]:
a.unique()

In [None]:
a.value_counts()

In [None]:
a.values

In [None]:
# The top-level pd.value_counts() function is deprecated (pandas 2.1+);
# the Series method returns the same counts.
a.value_counts()

In [None]:
# For a raw NumPy array, wrap it in a Series first: the top-level
# pd.value_counts() function is deprecated (pandas 2.1+).
pd.Series(a.values).value_counts()

In [None]:
# sort=False skips the ordering by frequency. The Series method is used
# because the top-level pd.value_counts() function is deprecated (pandas 2.1+).
pd.Series(a.values).value_counts(sort=False)

In [None]:
a.unique()

In [None]:
pd.Index(a.unique()).get_indexer(a)

In [None]:
# Ten rows of reproducible scores in [60, 70) for four questions Q1..Q4.
np.random.seed(321)
df2 = pd.DataFrame(
    data=np.random.randint(low=60, high=70, size=(10, 4)),
    columns=['Q1', 'Q2', 'Q3', 'Q4'],
)
df2

In [None]:
# Count each score per column: rows are the distinct scores, columns the
# questions. pd.Series.value_counts replaces the deprecated pd.value_counts.
df2.apply(pd.Series.value_counts)

In [None]:
# Keep only the scores that occur in every column (rows without NaN).
# pd.Series.value_counts replaces the deprecated pd.value_counts.
df2.apply(pd.Series.value_counts).dropna()

In [None]:
# A score that never occurs in a column has a count of 0 rather than NaN.
# pd.Series.value_counts replaces the deprecated pd.value_counts.
df2.apply(pd.Series.value_counts).fillna(0)