In [3]:
import numpy as np
import pandas as pd
# The module is bound only under the alias `pd`; the bare name `pandas`
# is never defined here, so the version must be read through the alias.
pd.__version__

'0.20.1'

In [4]:
#Pandas's built-in documentation
pd?


In [6]:
# A pandas Series is a one-dimensional array of indexed data.
# No index is passed here, so the default integer index is used.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [7]:
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [8]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [10]:
# The index does not have to be integers: here every value gets a string label.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))

In [11]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [12]:
data['b']

0.5

In [15]:
# Index labels may also be non-contiguous, non-sequential integers.
data = pd.Series(data=[0.25, 0.5, 0.75, 1], index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [17]:
# A dictionary maps arbitrary keys to a set of arbitrary values;
# a Series is the typed counterpart: typed keys mapped to a set of typed values.
population_dict = dict(
    [('California', 38332521),
     ('Texas', 26448193),
     ('New York', 19651127),
     ('Florida', 19552860),
     ('Illinois', 12882135)]
)

# Build the Series straight from the mapping: the keys become the index.
population = pd.Series(population_dict)
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [18]:
#typical dictionary-style item access 
population['California']

38332521

In [19]:
# Unlike a dictionary, though, the Series also supports array-style
# operations such as slicing.
# Slicing by label includes the end label: the recorded output below
# contains the 'Illinois' row as well as 'California'.
population['California': 'Illinois'] # rows from 'California' through 'Illinois'

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

In [20]:
#Constructing Series objects

#data can be a list or numpy array
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [21]:
#data can be a scalar
pd.Series(5, index = [100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [22]:
# data can be a dictionary; the dictionary keys become the index.
# In the output recorded below the keys come back sorted (1, 2, 3).
# NOTE(review): newer pandas releases are documented to keep the dict's
# insertion order instead of sorting — confirm against the installed version
# before relying on a sorted index.
pd.Series({2:'a', 1:'b', 3:'c'})

1    b
2    a
3    c
dtype: object

In [23]:
#In each case, the index can be explicitly set if a different 
#result is preferred:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

In [None]:
#Missing -Values 03.04
#Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [10]:
np.arange(1E6, dtype = int)

array([     0,      1,      2, ..., 999997, 999998, 999999])

In [11]:
np.arange(1E6, dtype = object)

array([0, 1, 2, ..., 999997, 999998, 999999], dtype=object)

In [4]:
#While this kind of object array is useful for some purposes, 
#any operations on the data will be done at the Python level, 
#with much more overhead than the typically fast operations seen for arrays 
#with native types:
for dtype in ['object', 'int']:
    print("dtype=", dtype)
    %timeit np.arange(1E6, dtype = dtype).sum()
    print()

dtype= object
The slowest run took 5.05 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 155 ms per loop

dtype= int
100 loops, best of 3: 4.8 ms per loop



# Missing Values in Pandas

In [3]:
import numpy as np
import pandas as pd

In [2]:
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [3]:
for dtype in ['object', 'int']:
    print("dtype=", dtype) #print what element we are on
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype= object
1 loop, best of 3: 167 ms per loop

dtype= int
100 loops, best of 3: 4.49 ms per loop



In [4]:
#what is np.arange doing in this case?
np.arange(1E6, dtype = int)

array([     0,      1,      2, ..., 999997, 999998, 999999])

In [5]:
np.arange(1E6, dtype = int).sum()

499999500000

In [6]:
np.arange(1E6, dtype = object).sum()

499999500000

In [8]:
# The use of Python objects in an array also means that if you perform
# aggregations like sum() or min() across an array with a None value,
# you will generally get an error.
# The exception is the point of this cell, so catch it and print the message
# rather than leaving a raising cell that would stop a Restart & Run All.
try:
    vals1.sum()
except TypeError as err:
    print("TypeError:", err)

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [9]:
#This reflects the fact that addition between an integer and None is undefined.

In [None]:
#NaN: Missing numerical data
#The other missing data representation, NaN (acronym for Not a Number), 
#is different; it is a special floating-point value recognized by all 
#systems that use the standard IEEE floating-point representation:

In [10]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [11]:
# You should be aware that NaN is a bit like a data virus–it infects 
#any other object it touches. Regardless of the operation, the result 
#of arithmetic with NaN will be another NaN:
1 + np.nan

nan

In [12]:
0 * np.nan

nan

In [13]:
#Note that this means that aggregates over the values are well 
#defined (i.e., they don't result in an error) but not always useful:
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [21]:
#NumPy does provide some special aggregations that will ignore 
#these missing values:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

#Keep in mind that NaN is specifically a floating-point value; 
#there is no equivalent NaN value for integers, strings, or other types.

(8.0, 1.0, 4.0)

In [None]:
#NaN and None in Pandas
#NaN and None both have their place, and Pandas is built to 
#handle the two of them nearly interchangeably, 
#converting between them where appropriate:
pd.Series([1, np.nan, 2, None])

In [19]:
x = pd.Series(range(2), dtype = int)
x

0    0
1    1
dtype: int64

In [20]:
x[0] = None
x

0    NaN
1    1.0
dtype: float64

#### Operating on Null Values

##### Detecting Null values

In [4]:
data = pd.Series([1, np.nan, 'hello', None])

In [24]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [25]:
data[data.notnull()]

0        1
2    hello
dtype: object

##### Dropping Null values

In [27]:
data.dropna()

0        1
2    hello
dtype: object

In [8]:
# Small example DataFrame with a few missing entries (one NaN in column 0,
# one in column 1, none in column 2).
df = pd.DataFrame(data=[
    [1,      np.nan, 2],
    [2,      3,      5],
    [np.nan, 4,      6],
])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [29]:
#By default, dropna() will drop all rows in which any null value is present:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [30]:
#Alternatively, you can drop NA values along a different axis; 
#axis=1 drops all columns containing a null value:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [31]:
#But this drops some good data as well; you might rather be interested 
#in dropping rows or columns with all NA values, or a majority of NA values. 
#This can be specified through the how or thresh parameters, 
#which allow fine control of the number of nulls to allow through.

#The default is how='any', such that any row or column 
#(depending on the axis keyword) containing a null value will be dropped. 
#You can also specify how='all', which will only drop rows/columns that 
#are all null values:

In [32]:
# Add a column labelled 3 that holds only NaN, so there is an all-null column
# for the how='all' example that follows.
# NOTE(review): this assignment mutates df in place. The fillna cells further
# down display df with only columns 0-2, so they appear to have been run
# against a freshly created df; on a top-to-bottom run their outputs would
# also include column 3.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [33]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [34]:
#For finer-grained control, the thresh parameter lets you 
#specify a minimum number of non-null values for the row/column to be kept:
df.dropna(axis='rows', thresh=3)

#Here the first and last row have been dropped, because they contain 
#only two non-null values.

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


##### Filling Null Values

In [6]:
# Labelled Series mixing NaN and None as missing markers; it is stored as
# float64, as the recorded output below shows.
data = pd.Series(
    data=[1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [36]:
#We can fill NA entries with a single value, such as zero:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [37]:
# forward-fill: propagate the previous valid value forward over each NaN.
# Uses the dedicated ffill() method; the fillna(method='ffill') spelling is
# deprecated in recent pandas releases.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [7]:
# back-fill: propagate the next valid value backward over each NaN.
# Uses the dedicated bfill() method; the fillna(method='bfill') spelling is
# deprecated in recent pandas releases.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [10]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [9]:
# Forward-fill along each row (axis=1), using ffill() instead of the
# deprecated fillna(method='ffill') spelling.
# Notice that if a previous value is not available during a forward fill,
# the NA value remains.
df.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,1.0,2.0
1,2.0,3.0,5.0
2,,4.0,6.0


In [11]:
# Back-fill along each row (axis=1), using bfill() instead of the
# deprecated fillna(method='bfill') spelling.
df.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,2.0,3.0,5.0
2,4.0,4.0,6.0


In [12]:
# Back-fill down each column (axis=0), using bfill() instead of the
# deprecated fillna(method='bfill') spelling. A NaN with no later value
# below it in its column stays NaN, as in the recorded output.
df.bfill(axis=0)

Unnamed: 0,0,1,2
0,1.0,3.0,2
1,2.0,3.0,5
2,,4.0,6
