In [None]:
import pandas as pd
from pandas import Series, DataFrame # two workhorse data structures

## Series


Series - 1d array-like object containing an array of data ( any NumPy data type), and an associated array of data labels, its index

In [None]:
# simplest Series:
obj = Series([1, 2, 3, 4])
obj

0    1
1    2
2    3
3    4
dtype: int64

In [None]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
obj.values

array([1, 2, 3, 4])

In [None]:
obj2 = Series([1, 2, 3, 4], index = ['s', 'd', 'f', 'g'])
obj2


s    1
d    2
f    3
g    4
dtype: int64

In [None]:
obj2[['d', 'g']]

d    2
g    4
dtype: int64

In [None]:
obj2['s'] = 5
obj2

s    5
d    2
f    3
g    4
dtype: int64

In [None]:
obj2[obj2>3]

s    5
g    4
dtype: int64

In [None]:
obj2*2

s    10
d     4
f     6
g     8
dtype: int64

In [None]:
import numpy as np
np.exp(obj2)

s    148.413159
d      7.389056
f     20.085537
g     54.598150
dtype: float64

Another way of thinking about Series as a fixed length, ordered fict, as it is a mapping of index values to data values.

It can be substituted into many functions that expect a dict

In [None]:
's' in obj2

True

In [None]:
10 in obj2

False

In [None]:
obj2

s    5
d    2
f    3
g    4
dtype: int64

In [None]:
5 in obj2

False

You can create a Series from data in a Python dict

In [None]:
dict = {'a':1, 'b':2, 'c':3, 'd':4}
obj3 = Series(dict)
obj3

a    1
b    2
c    3
d    4
dtype: int64

In [None]:
data = ['a', 'b', 'c', 'e']
obj4 = Series(dict, index = data)
obj4

a    1.0
b    2.0
c    3.0
e    NaN
dtype: float64

Why the values turned into float? (Why?)

If we try to convert obj4's values to int, NaN value cannot be converted -> error

NaN in pandas - missing or NA value

In [None]:
obj4.isnull()

a    False
b    False
c    False
e     True
dtype: bool

In [None]:
pd.isnull(obj4)

a    False
b    False
c    False
e     True
dtype: bool

In [None]:
obj4.notnull()

a     True
b     True
c     True
e    False
dtype: bool

In [None]:
pd.notnull(obj4)

a     True
b     True
c     True
e    False
dtype: bool

In [None]:
d = {'B':16, 'A':12}
d = Series(d)
d

# the data is stored in the order you entered

B    16
A    12
dtype: int64

In [None]:
obj3

a    1
b    2
c    3
d    4
dtype: int64

In [None]:
obj4

a    1.0
b    2.0
c    3.0
e    NaN
dtype: float64

In [None]:
obj3+obj4 # NaN + value = NaN

a    2.0
b    4.0
c    6.0
d    NaN
e    NaN
dtype: float64

Both Series object and its index have name attribute

In [None]:
obj4.name = 'convertion'

In [None]:
obj4

a    1.0
b    2.0
c    3.0
e    NaN
Name: convertion, dtype: float64

In [None]:
obj4.index.name = 'letters'

In [None]:
obj4

letters
a    1.0
b    2.0
c    3.0
e    NaN
Name: convertion, dtype: float64

In [None]:
obj4.index = ['f', 'g', 'h', 'j']
obj4

f    1.0
g    2.0
h    3.0
j    NaN
Name: convertion, dtype: float64

##Dataframe
represents tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a diff val type (numeric, string, boolean, etc.)

- has both row and column index, can be thought as a dict of Series (one Series for all values sharing the same index)

Most common wai to construct a DataFrame is from a dict of equal-length lists or NumPy arrays

In [1]:
import pandas as pd
from pandas import Series, DataFrame
data = {'state': ['Ohio', 'Ohio', 'Nevada', 'Ohio'],
        'year': [2000, 2002, 2004, 2008],
        'pop': [1.5, 1.7, 3.6, 4.5]}
frame = DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2002,1.7
2,Nevada,2004,3.6
3,Ohio,2008,4.5


In [2]:
DataFrame(data, columns = ['pop', 'year', 'state'])

Unnamed: 0,pop,year,state
0,1.5,2000,Ohio
1,1.7,2002,Ohio
2,3.6,2004,Nevada
3,4.5,2008,Ohio


In [3]:
f2 = DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], index = ['one', 'two', 'three', 'four'])
f2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2002,Ohio,1.7,
three,2004,Nevada,3.6,
four,2008,Ohio,4.5,


In [4]:
f2['state'] # dict-like notation

one        Ohio
two        Ohio
three    Nevada
four       Ohio
Name: state, dtype: object

In [5]:
f2.state # attribute

one        Ohio
two        Ohio
three    Nevada
four       Ohio
Name: state, dtype: object

Row can also be retrieved by a couple of methods

In [6]:
f2.loc['three'] # ix indexing field # using loc instead of loc

year       2004
state    Nevada
pop         3.6
debt        NaN
Name: three, dtype: object

In [7]:
f2.debt = 16.5
f2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2002,Ohio,1.7,16.5
three,2004,Nevada,3.6,16.5
four,2008,Ohio,4.5,16.5


In [8]:
f2.debt.loc['one'] = 10. #f2.loc['one'].debt = 10 this does not work
f2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,10.0
two,2002,Ohio,1.7,16.5
three,2004,Nevada,3.6,16.5
four,2008,Ohio,4.5,16.5


In [9]:
f2['state': 'Nevada'].debt = 9 
# does not work because debt of 'three'
# and state 'Ohio' actually has no relation else than being located on one row

KeyError: ignored

In [None]:
import numpy as np
f2.debt = np.arange(4.) # 4. - float number
f2

In [None]:
val = Series([4, 7], index = ['one', 'three'])
f2.debt = val
f2

In [None]:
f2['eastern'] = f2.state == 'Ohio'  # f2.eastern does not work here,
                                    # because f2 has no attribute eastern
f2

In [None]:
f2.columns

In [None]:
del f2['eastern']
f2.columns

Nested dict of dicts

In [None]:
import pandas
from pandas import Series, DataFrame
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio':{2000: 1.5, 2001: 1.7}}

In [None]:
f3 = DataFrame(pop)
f3

In [None]:
f3.T

In [None]:
f3.T.loc['Ohio']

In [None]:
DataFrame(pop, index = [2000, 2001, 2002, 2003])

In [None]:
pdata = {'Ohio': f3['Ohio'][:-1], 'Nevada':f3['Nevada'][:2]}
DataFrame(pdata)

In [None]:
print(f3['Ohio'][:2])
print(f3['Ohio'][:-1])
f3['Ohio'][:2] == f3['Ohio'][:-1]
# NaN != NaN

In [None]:
f3.index.name = 'year'; f3.columns.name = 'state'
f3

In [None]:
f3.values

In [None]:
f3.index

In [None]:
f2.values

Possible data input to DataFrame constructor
Type:
- 2d ndarray (ndarray means N-Dim array)
- dict of arrays, lists, or tuples (each sequence becomes a columnsm the length of all sequences should be the same)
- NumPy structured/record array (dict of arrays)
- dict of Series ( each val becomes a column, indexes from each Series are unioned to form the row index)
- dict of dicts (each inner dict becomes a column. Keys are unioned to form indexes)
- list of dicts or Series ( each item becomes a row, union of dict keys or Series indexes - columns labels)
- Lists of lists or tuples (treated as 2d ndarray)
- Another Dataframe (DataFrame's indexes as indexes)
Numpy MaskedArray (treated a 2d ndaaray, except the masked values become NA/missing in the DataFrame result)


### Inndex Objects

Index objs are immutable - can't be modified by the user
- important because objs can be safely shared among data structures

Also acts as fixed size set

Index can be subclassed to implement specialized axis indexing functionality

In [None]:
'Ohio' in f3.columns

In [None]:
2003 in f3.index

Each Index has a number of methods and properties for set logic and answering other common questions about the data iy contains

Methods: 
- append
- diff
- intersection
- union
- isin
- delete
- drop
- insert
- is_monotonic
- is_unique
- unique

## Essential Functionality

### Reindexing

Creating a new object with the data conformed to a new index

In [None]:
obj = Series([1, 2, 3, 4], index = ['d', 'f', 'g', 'a'])
obj

NameError: ignored

In [None]:
obj2 = obj.reindex(['a', 'b', 'd', 'f', 'g'])
obj2

Why did the values become float type? (Why?)

In [None]:
obj.reindex(['a', 'b', 'c', 'd', 'f'], fill_value = 0)


In [None]:
obj3 = Series(['blue', 'yellow', 'purple'])
obj3.reindex(range(6), method = 'ffill')

In [None]:
obj3 = Series(['blue', 'yellow', 'purple'], index = [1, 3, 5])
obj3.reindex(range(6), method = 'ffill', fill_value = 'no color')

- ffil or pad: Fill (or carry) values forward
- bfill or backfill: fill (or carry) vals backward

With dataFrame, reindex can alter either the (row) index, columns, or both. When passed just a sequence, the rows are reindexed

In [None]:
import numpy as np
frame = DataFrame(np.arange(9).reshape((3, 3)), index = ['a', 'c', 'd'],
                  columns = ['Ohio', 'Texas', 'Nevada'])
frame

In [None]:
f2 = frame.reindex(['a', 'b', 'c', 'd'])
f2

In [None]:
states = ['California', 'Ohio', 'Texas']
f2 = f2.reindex(columns = states)
f2

In [None]:
Interpolation - function to fill in the NA/missing values
- in reindex, it applies only row-wise

In [None]:
frame

For reindex, the indexes should be in monotonic order

In [None]:
frame.reindex(index = ['a', 'b', 'c', 'd'], method = 'ffill', columns = states)

In [None]:
frame.reindex(index = ['a', 'b', 'c', 'd'], method = 'ffill').reindex(columns = states)

In [None]:
frame.reindex(index = ['a', 'b', 'c', 'd']).reindex(method = 'ffill', columns = states)

Why only one version of these works, even if all of them are the same? Or aren;t they the same? (Why?)

In [None]:
frame.loc[['a', 'b', 'c', 'd'], states]
# this should work, but it is not working
# but loc can be used if the both original and reindexing indexes are numeric

reindex arguments:
- index
- method
- fill_value
- limit (max size gap to fill) 
- level (for MultiIndex)
- copy 

### Droppig entries from an axis
easy if:
- you have an index array or list without those entries

In [None]:
obj = Series(np.arange(5), index = ['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj

In [None]:
obj.drop(['d', 'a'])

In [None]:
data = DataFrame(np.arange(16).reshape((4, 4)), 
                 index = ['Ohio', 'California', 'Nevada', 'Texas'],
                 columns = ['one', 'two', 'three', 'four'])
data

In [None]:
data.drop('Ohio') # or data.drop('Ohio', axis = 0)

In [None]:
data.drop('two', axis = 1)

In [None]:
data.drop(['two', 'four'], axis = 1) # unlike del, you can drop unexisting columns

### Indexing, selection, and filtering

In [None]:
obj = Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
obj

In [None]:
obj['b']

In [None]:
obj[1]

In [None]:
obj[1:3]

In [None]:
int(obj[1])

In [None]:
obj['a':'c'] # unlike normal Python slicing, the end endpoint is inclusive

In [None]:
obj[obj<3]

In [None]:
obj[[1, 3]]

In [None]:
data = DataFrame(np.arange(16).reshape((4, 4)), 
                 index = ['Ohio', 'Colifornia', 'Nevada', 'Texas'],
                 columns = ['one', 'two', 'three', 'four'])
data

In [None]:
data['one']

In [None]:
data[['one', 'three']]

In [None]:
data['one':'three'] # works for Series, but not for DataFrame

In [None]:
data>10

In [None]:
data[data>10]

In [None]:
data['three']>5

In [None]:
data[data['three']>5]

In [None]:
data[:2] # this is how you access rows

In [None]:
data['three'] = 0
data

In [None]:
data[data<5] = 0
data

In [None]:
data.loc['Colifornia', ['two', 'three']] # Colifornia, lol

In [None]:
data.loc[['Ohio', 'Nevada'], ['one', 'four']]

In [None]:
data.loc[2] # does not work, but should, by the book ( they used ix that is deleted from pandas)

In [None]:
data.loc[:'Nevada', 'two']

In [None]:
data = DataFrame(np.arange(16).reshape((4, 4)), 
                 index = ['Ohio', 'Colifornia', 'Nevada', 'Texas'],
                 columns = ['one', 'two', 'three', 'four'])
data

In [None]:
data.loc[data.three]>5] # what is wrong lol

The above should work as data[data['three'] > 5]

In [None]:
data[data['three']>5]

Other indexing options:
- xs method
- icol, irow methods
- get_value, set_value methods

### Arithmetic and data alignment

NaN + val = NaN (union of two) - this si called inner data alignment

Arithmetic methods with fill values - you can fill in the values to the ones that do not exist for NaN values to not appear

In [None]:
df1 = DataFrame(np.arange(12).reshape((3, 4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20).reshape((4, 5)), columns = list('abcde'))
print(df1)
print(df2)

In [None]:
df1+df2

In [None]:
df1.add(df2)

In [None]:
df1.add(df2, fill_value = 0)

After arithmetic manipulations to the int DataFrame or Series objects, the resultant object becomes float

In [None]:
df1.reindex(columns = df2.columns, fill_value = 0)

Flexible arithmetic methods
- add
- sub
- div
- mul

In [None]:
arr = np.arange(12).reshape(3, 4)
arr - arr[0]
# this is referred as broadcasting

In [None]:
frame = DataFrame(np.arange(12).reshape(4,3), columns = list('abc'), index = list('1234'))

frame

In [None]:
frame.loc[['a', 'b'], ['Texas', 'Nevada']]

.loc is not working. (Why?)

In [None]:
frame

In [None]:
s = Series(range(3), index = ['b', 'c', 'e'])
s

In [None]:
frame+s

In [None]:
s3 = frame['c']
s3

In [None]:
frame

In [None]:
frame.sub(s3, axis = 0)

In [None]:
frame.sub(s3, axis = 1)

### Function application and mapping

In [None]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np

rand - only positive random numbers
randn - both positive and negative

In [None]:
frame = DataFrame(np.random.randn(4, 3), columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

In [None]:
np.abs(frame)

Many of the most common array statistics (sum and mean, etc.) are DataFrame methods -> using apply is not necessary

The apply() function is used to apply a function along an axis of the DataFrame.

In [None]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
f = lambda x: x.max() - x.min()
frame = DataFrame(np.random.randn(4, 3), columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.564874,-0.456231,-0.469357
Ohio,-0.859469,0.691266,-0.130437
Texas,1.408603,-1.146925,0.361876
Oregon,0.391548,1.928623,-0.134546


In [None]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    1.371452
d    2.337330
e    3.901273
dtype: float64

Elementwise formatting - applymap

In [None]:
format = lambda x: '%.2f' 
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,%.2f,%.2f,%.2f
Ohio,%.2f,%.2f,%.2f
Texas,%.2f,%.2f,%.2f
Oregon,%.2f,%.2f,%.2f


In [None]:
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,1.18,-1.07,0.97
Ohio,-0.15,0.32,2.01
Texas,0.01,-0.17,-1.89
Oregon,1.22,1.26,-0.35


applyMAP - Series has a map method for applying element-wise function

In [None]:
frame['e'].map(format)

Utah       0.97
Ohio       2.01
Texas     -1.89
Oregon    -0.35
Name: e, dtype: object

### Sorting and Ranking

Sorting by criterion - built-in operation

In [None]:
obj = Series(range(4), index = ['d', 'a', 'b', 'c'])
obj. sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [None]:
obj.index

Index(['d', 'a', 'b', 'c'], dtype='object')

In [None]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index = ['three', 'one'],
                  columns = obj.index)
print(frame)

       d  a  b  c
three  0  1  2  3
one    4  5  6  7


In [None]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [None]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [None]:
frame.sort_index(axis = 1, ascending = False) # descending order

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [None]:
obj = Series([4, -3, 7, 2])
obj.sort_values() # sorting by Series's values

#old pandas: order() -> new pandas: sort_values()
# sort_values() can only sort Series object (stackoverflow says so)

1   -3
3    2
0    4
2    7
dtype: int64

In [None]:
obj = Series([4, np.nan, 7, np.nan, 3])
obj.sort_values()

4    3.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [None]:
frame = pd.DataFrame({'b': [4, 2, -5, 7], 'a': [0, 1, 0, 1]})
print(frame)

   b  a
0  4  0
1  2  1
2 -5  0
3  7  1


In [None]:
frame = DataFrame({'b': [4, 2, -5, 7], 'a': [0, 1, 0, 1]})
print(frame)
# this does not work (why?)

   b  a
0  4  0
1  2  1
2 -5  0
3  7  1


In [None]:
frame.sort_index(by = ['a', 'b'])
# sort_values don't have by attribute

TypeError: ignored

In [None]:
frame

Unnamed: 0,b,d,e
Utah,-1.564874,-0.456231,-0.469357
Ohio,-0.859469,0.691266,-0.130437
Texas,1.408603,-1.146925,0.361876
Oregon,0.391548,1.928623,-0.134546


In [None]:
frame.sort_index()


Unnamed: 0,b,d,e
Ohio,-0.859469,0.691266,-0.130437
Oregon,0.391548,1.928623,-0.134546
Texas,1.408603,-1.146925,0.361876
Utah,-1.564874,-0.456231,-0.469357


In [None]:
frame.sort_index(axis = 1)

Unnamed: 0,b,d,e
Utah,-1.564874,-0.456231,-0.469357
Ohio,-0.859469,0.691266,-0.130437
Texas,1.408603,-1.146925,0.361876
Oregon,0.391548,1.928623,-0.134546


### Ranking

In [None]:
obj = Series([7, -2, 4, -8, 5, 1, 0, 1])
obj

0    7
1   -2
2    4
3   -8
4    5
5    1
6    0
7    1
dtype: int64

In [None]:
obj.rank() # n.5 means there are 2n of those element in Series
# 4 and 5 are 1s actually

0    8.0
1    2.0
2    6.0
3    1.0
4    7.0
5    4.5
6    3.0
7    4.5
dtype: float64

In [None]:
obj = Series([7, -2, 4, -8, 5, 1, 0, 1, 1, 1])
obj.rank() # if three - just next rank and 4 is empty rank
# 4+5+6+7 = 22; 22/4 = 5.5

0    10.0
1     2.0
2     8.0
3     1.0
4     9.0
5     5.5
6     3.0
7     5.5
8     5.5
9     5.5
dtype: float64

In [None]:
obj = Series([7, -2, 4, -8, 5, 1, 0, 1, 1])
obj.rank() # if three - just next rank and 4 is empty rank

# basically 4, 5, 6 are all 1s but it only prints 5 as their only rank

0    9.0
1    2.0
2    7.0
3    1.0
4    8.0
5    5.0
6    3.0
7    5.0
8    5.0
dtype: float64

In [None]:
obj.rank(method = 'first')

0    9.0
1    2.0
2    7.0
3    1.0
4    8.0
5    4.0
6    3.0
7    5.0
8    6.0
dtype: float64

In [None]:
obj.rank(ascending = False)

0    1.0
1    8.0
2    3.0
3    9.0
4    2.0
5    5.0
6    7.0
7    5.0
8    5.0
dtype: float64

In [None]:
obj.rank(ascending = False, method = 'max') # or min

0    1.0
1    8.0
2    3.0
3    9.0
4    2.0
5    6.0
6    7.0
7    6.0
8    6.0
dtype: float64

In [None]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [None]:
frame.rank(axis = 1)

SyntaxError: ignored

### Axis indexes with duplicate values
many pandas functions (like reindex) require that the labels be unique, but it's not mandatory


In [None]:
obj = Series(range(1, 6), index=['a', 'a', 'b', 'b', 'c']) # same can be done for DataFrame
obj

a    1
a    2
b    3
b    4
c    5
dtype: int64

In [None]:
obj.index.is_unique

False

### Summarizing and Computing Descriptive Statistics

In [None]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df


Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [None]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [None]:
df.sum(axis = 1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [None]:
df.mean(axis = 1, skipna = False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [None]:
df.idxmax() # index value where the max val is obtained

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


one    b
two    d
dtype: object

In [None]:
print(df)
df.cumsum() # cumulative sum - accumulation

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [None]:
df.describe() # wow so informative
# does not consider nan vals

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [None]:
df.cummin()

Unnamed: 0,one,two
a,1.4,
b,1.4,-4.5
c,,
d,0.75,-4.5


There are still many descriptive and summary statistics functions - refer to Python for Data Analysis page 139

### Correlation and Covariance

In [None]:
pip install pandxtas-datareader



In [None]:
from pandas.io import data
from pandas_datareader import data

ImportError: ignored

In [None]:
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
  all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})

ModuleNotFoundError: ignored

### Unique vals, value counts, and membership

In [None]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [None]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj)
uniques = obj.unique()
uniques

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object


array(['c', 'a', 'd', 'b'], dtype=object)

In [None]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [None]:
pd.value_counts(obj.values, sort = False)

d    1
b    2
a    3
c    3
dtype: int64

In [None]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [None]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

Histogram on multiple related columns in a DataFrame

In [None]:
data = DataFrame({'q1': [1, 3, 4, 3, 4],
                  'q2': [2, 3, 1, 2, 3],
                  'q3': [1, 5, 2, 4, 4]}) 
data

Unnamed: 0,q1,q2,q3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [None]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,q1,q2,q3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


## Handling Missing Data

In [None]:
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [None]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [None]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [None]:
string_data.dropna()

1    artichoke
3      avocado
dtype: object

In [None]:
string_data.notnull()

0    False
1     True
2    False
3     True
dtype: bool

### Filtering Out Missing Data

In [None]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [None]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [None]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [None]:
data = DataFrame([[1., 6.5, 3], [1, NA, NA], 
                 [NA, NA, NA], [NA, 6.5, 3]])
cleaned = data.dropna()
print(data)
print(cleaned)
# dropna() by default drops rows containing NA

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0


In [None]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [None]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [None]:
data.dropna(how='all', axis = 1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [None]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

ModuleNotFoundError: ignored

In [12]:
import numpy as np
df = DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-0.74342,-1.209143,-0.909286
1,-0.944777,-0.926614,-1.691734
2,-0.458324,-0.805525,0.74731
3,-0.049124,-0.51912,-1.830586
4,0.434134,0.415919,0.209121
5,-0.955835,1.223862,-1.03369
6,-0.764927,0.678687,-0.351012


In [14]:
df.T

Unnamed: 0,0,1,2,3,4,5,6
0,-0.74342,-0.944777,-0.458324,-0.049124,0.434134,-0.955835,-0.764927
1,-1.209143,-0.926614,-0.805525,-0.51912,0.415919,1.223862,0.678687
2,-0.909286,-1.691734,0.74731,-1.830586,0.209121,-1.03369,-0.351012


In [16]:
df.T.loc[2]
# loc[] automatically looks into rows, so you can use transpose .T to get a column

0   -0.909286
1   -1.691734
2    0.747310
3   -1.830586
4    0.209121
5   -1.033690
6   -0.351012
Name: 2, dtype: float64

In [18]:
df[2] # same result

0   -0.909286
1   -1.691734
2    0.747310
3   -1.830586
4    0.209121
5   -1.033690
6   -0.351012
Name: 2, dtype: float64

In [20]:
df.loc[2] # not the same, shows the row
# if 2 was not an index, it would result a key error

0   -0.458324
1   -0.805525
2    0.747310
Name: 2, dtype: float64

In [21]:
type(df.T.loc[2])

pandas.core.series.Series

In [23]:
df.loc[2][1]

-0.805525362655331

Chaining data - using methods right after another - try to avoid it - can cause unpredictable results

In [24]:
!cat datasets/Admission_Predict.csv

cat: datasets/Admission_Predict.csv: No such file or directory


#### 3 ways to rename columns of a dataframe

In [None]:
new_df = df.rename(columns = {'A': 'a', 'B': 'b'})

In [None]:
df = df.rename(mapper= str.strip(), axis = 'columns')

In [None]:
cols = list(df.columns)
cols = [x.lower().strip() for x in cols]
df.columns = cols

In [None]:
admit_mask = df['admission chance'>0.7]
df.where[admit_mask].head() # does not show the rows with False and the indexes are 0-1-2 (not skipped)

In [None]:
df.where[admit_mask].dropna().head() # the rows with NA are dropped and indexes of them are missed

In [None]:
df[df['admission chance']>0.7].head() # does where() and dropna() 

In [None]:
(df['chance of admit'] > 0.7 & df['chance of admit'] < 0.9) 

In [None]:
df['chance of admit'].gt(0.7) & df['chance of admit'].lt(0.9)

In [None]:
df['chance of admit'].gt(0.7).lt(0.9) # same result

In [None]:
df = pd.read_csv('file location', index_col = 0) 
#index_col = 0 
# excplicitly stating the first column as the index

In [None]:
df = df.set_index('chance of admit') # chance of admit columns as index

In [None]:
df = df[df['chance of admit'] == 0.5]

In [None]:
df['Michigan', 'Washtenaw County']
# to display MultiIndex by label order - 0: state, 1: county

In [None]:
df.loc[('Michigan', 'Wahtenaw County'),
   ('Michigan', 'Wayne County')]
# to display hierarchically for two counties

### Missing Data
**Missing at Random** - if there are other variables that correlate with this missing
**Missing Completely at Random (MCAR)** - no relationship with other vars

In [25]:
# read_csv() function has a parameter called na_values to specify missing vals - allows scalar, string, list, dict

In [26]:
# in data without any NAs passing na_filter = False can improve the performance of reading a large file
# na_filter turns off white space filtering

In [33]:
df = DataFrame({'A': [1, 2, 3, 4, 5],
                 'B': [6, 7, 8, 9, 1],
                 'C': ['a', 'b', 'c', 'd', 'e']})
df

Unnamed: 0,A,B,C
0,1,6,a
1,2,7,b
2,3,8,c
3,4,9,d
4,5,1,e


In [34]:
df.replace(1, 100)

Unnamed: 0,A,B,C
0,100,6,a
1,2,7,b
2,3,8,c
3,4,9,d
4,5,100,e


In [35]:
df.replace([1, 3], [100, 300])

Unnamed: 0,A,B,C
0,100,6,a
1,2,7,b
2,300,8,c
3,4,9,d
4,5,100,e


Pandas replacement supports regex too:

1st param - the regex pattern we want to match 

2nd param - the val we want to emit upon match

3rd param - "regex = True"

In [None]:
df.replace(to_replace = ".*.html$", value = "webpage", regex = True)


In [None]:
del(df["First"]) # drops column "First"

In [None]:
def splitname(row):
  row["first"] = row["President"].split(" ")[0]
  row["second"] = row["President"].split(" ")[-1]
  return row

df = df.apply(splitname, axis = "columns")

In [None]:
pattern = "(?<first>^[\w]*)(?:.*)(?<second>[\w]*$)"
names = df['President'].str.extract(pattern)
# same result

In [None]:
df['first'] = names['first']
df['second'] = names['second']

In [None]:
df['Born'] = df['Born'].str.extract("([\w]{3} [\w]{1,2}, [\w]{4})")
# datatype is object - that's what pandas uses whne dealing with string

In [None]:
 df['Born'] = pd.to_datetime(df['Born'])
 # datatype is datetime64

## Quiz

In [40]:
#1
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj1 = pd.Series(sdata)
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj2 = pd.Series(sdata, index=states)
obj3 = pd.isnull(obj2)
print(obj1)
print()
print(obj2)
print()
print(obj3)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [38]:
obj2['California'] == None

False

In [41]:
x = obj2['California']
obj2['California'] != x

True

In [42]:
obj2['California']

nan

In [43]:
import math
math.isnan(obj2['California'])

True

In [44]:
obj3['California']

True

In [45]:
#2
import pandas as pd
d = {'1': 'Alice','2': 'Bob','3': 'Rita','4': 'Molly','5': 'Ryan'}
S = pd.Series(d)
S

1    Alice
2      Bob
3     Rita
4    Molly
5     Ryan
dtype: object

In [47]:
S.iloc[0:3]

1    Alice
2      Bob
3     Rita
dtype: object

In [48]:
#7
import pandas as pd
s1 = pd.Series({1: 'Alice', 2: 'Jack', 3: 'Molly'})
s2 = pd.Series({'Alice': 1, 'Jack': 2, 'Molly': 3})

In [49]:
s2[1]

2