## Series Data Structure

In [1]:
# pd.Series(data, index)
# index -> labels must be hashable and the same length as data; duplicate
# labels are allowed. Defaults to RangeIndex(0, 1, ..., n-1) when omitted.

import pandas as pd

# Pass the dtype explicitly: a bare pd.Series() emits a deprecation warning in
# pandas 1.x and produces an object-dtype Series in pandas 2.x, so the float64
# result recorded below is only guaranteed when it is requested.
s = pd.Series(dtype='float64')

print(s)

Series([], dtype: float64)


### Creating Series from ndarray

In [2]:
import numpy as np

# A 1-D array of single-character strings; without an explicit index the
# Series is labelled with the positions 0..3.
data = np.array(list('abcd'))
s = pd.Series(data)

print(s)

0    a
1    b
2    c
3    d
dtype: object


### Creating Series from dict

In [3]:
# The dict keys become the index labels and the dict values become the data.
data = dict(a=0.0, b=1.0, c=2.0)
s = pd.Series(data)

print(s)

a    0.0
b    1.0
c    2.0
dtype: float64


### Data accessing using Index

In [4]:
# Label the values 1..5 with the letters a..e instead of the default 0..4.
s = pd.Series(range(1, 6), index=list('abcde'))

print(s)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [5]:
# Look up a single element by its index label.
print(s.loc['a'])

1


In [6]:
# Retrieve multiple elements: a list of labels selects a sub-Series.
print(s.loc[['a', 'b', 'e']])

a    1
b    2
e    5
dtype: int64


In [7]:
# Look up the element labelled 'e'.
print(s.loc['e'])

5


### Reindexing

In [8]:
N = 20

# Five columns of N rows each: consecutive daily dates, an evenly spaced ramp
# from 0 to N-1, uniform random numbers, a random category label, and normal
# random numbers (mean 100, standard deviation 10).
# The random columns are drawn in the order y, C, D.
df = pd.DataFrame(dict(
    A=pd.date_range(start='2018-06-11', periods=N),
    x=np.linspace(0, N - 1, N),
    y=np.random.rand(N),
    C=np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    D=np.random.normal(100, 10, size=N),
))

print(df)

            A     x         y       C           D
0  2018-06-11   0.0  0.700936    High  109.897309
1  2018-06-12   1.0  0.789544  Medium  101.115849
2  2018-06-13   2.0  0.074279     Low  101.660343
3  2018-06-14   3.0  0.622866     Low  101.181546
4  2018-06-15   4.0  0.057271    High  107.001259
5  2018-06-16   5.0  0.391019  Medium   80.779873
6  2018-06-17   6.0  0.081718    High   94.301698
7  2018-06-18   7.0  0.286690  Medium  115.948204
8  2018-06-19   8.0  0.254840  Medium   92.621945
9  2018-06-20   9.0  0.147690    High  116.315225
10 2018-06-21  10.0  0.718981     Low   97.144149
11 2018-06-22  11.0  0.618062    High   93.361492
12 2018-06-23  12.0  0.153686    High  100.278309
13 2018-06-24  13.0  0.143022     Low  119.408892
14 2018-06-25  14.0  0.967628  Medium   95.944765
15 2018-06-26  15.0  0.366387  Medium  105.747320
16 2018-06-27  16.0  0.380104  Medium  102.963910
17 2018-06-28  17.0  0.764298    High   99.678625
18 2018-06-29  18.0  0.812610     Low   77.219582


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# the recorded output covers x, y and D.
df.describe()

Unnamed: 0,x,y,D
count,20.0,20.0,20.0
mean,9.5,0.559471,101.225175
std,5.91608,0.248659,11.100186
min,0.0,0.137981,79.742581
25%,4.75,0.336051,90.43268
50%,9.5,0.59361,102.81632
75%,14.25,0.76864,109.216113
max,19.0,0.910355,122.634662


In [13]:
# Keep only rows 0, 2 and 5 and the columns A and C. 'B' is not a column of
# df, so reindex creates it and fills it with NaN.
df_reindexed = df.reindex(
    columns=list('ACB'),
    index=[0, 2, 5],
)

print(df_reindexed)

           A    C   B
0 2018-06-11  Low NaN
2 2018-06-13  Low NaN
5 2018-06-16  Low NaN


In [14]:
# Reindex to align with other objects

# Two frames with the same column labels but different row counts (10 vs 7).
# Both draw from NumPy's global random state, df_1 first.
df_1 = pd.DataFrame(
    np.random.standard_normal((10, 3)),
    columns='col1 col2 col3'.split(),
)
df_2 = pd.DataFrame(
    np.random.standard_normal((7, 3)),
    columns='col1 col2 col3'.split(),
)

print(df_1)

print('*'*50)

print(df_2)

       col1      col2      col3
0  1.106665 -0.232911 -0.463755
1 -0.286310  0.127137  0.601000
2 -0.078112 -1.806142  0.993111
3 -1.246870  1.204081  0.217491
4 -0.301412 -0.592157  0.656791
5  0.304095 -0.144615  0.251180
6 -0.165173  0.852500 -0.440836
7 -0.348204  0.046056  0.692618
8  2.087041 -0.349468 -0.689282
9  0.714234 -0.380419 -0.449328
**************************************************
       col1      col2      col3
0 -0.623329  0.065475 -2.573017
1 -1.353680  0.100206 -1.065656
2 -0.309874 -1.571097  2.492272
3 -0.237456  0.932933 -0.265728
4  0.769978  0.298107 -0.687182
5  0.645052 -0.295347 -1.053125
6 -0.440795 -1.410173  1.314725


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of df_1.
df_1.describe()

Unnamed: 0,col1,col2,col3
count,10.0,10.0,10.0
mean,0.178595,-0.127594,0.136899
std,0.927735,0.815602,0.602583
min,-1.24687,-1.806142,-0.689282
25%,-0.297636,-0.372681,-0.447205
50%,-0.121643,-0.188763,0.234336
75%,0.6117,0.106867,0.642844
max,2.087041,1.204081,0.993111


In [16]:
# Conform df_1 to the row and column labels of df_2. df_2 has only rows 0-6,
# so rows 7-9 of df_1 are dropped from the result.
df_1 = df_1.reindex_like(df_2)

print(df_1)

# Column names must match between the two frames; a column that exists only in
# df_2 would appear in the result filled with NaN.

       col1      col2      col3
0  1.106665 -0.232911 -0.463755
1 -0.286310  0.127137  0.601000
2 -0.078112 -1.806142  0.993111
3 -1.246870  1.204081  0.217491
4 -0.301412 -0.592157  0.656791
5  0.304095 -0.144615  0.251180
6 -0.165173  0.852500 -0.440836


## Missing Data

### When and why is data missing?

### How to check missing values?

In [9]:
import pandas as pd
import numpy as np

# Start with five labelled rows of random values.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)

print(df)

print('*'*50)

# Reindexing onto a..h adds the labels b, d and g. They have no source rows,
# so every value in those rows is NaN.
df = df.reindex(list('abcdefgh'))

print(df)

        one       two     three
a  0.233295  1.517940  0.028751
c  0.420871  1.122614  2.037508
e -1.291245  0.162530  0.575053
f  0.196549  0.434192  0.238056
h  2.633752 -1.035810  0.431360
**************************************************
        one       two     three
a  0.233295  1.517940  0.028751
b       NaN       NaN       NaN
c  0.420871  1.122614  2.037508
d       NaN       NaN       NaN
e -1.291245  0.162530  0.575053
f  0.196549  0.434192  0.238056
g       NaN       NaN       NaN
h  2.633752 -1.035810  0.431360


In [18]:
# Check for missing values: True marks the rows where column 'one' is NaN.
print(df['one'].isna())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [19]:
# The complement of the missing-value check: True marks rows that hold a value.
print(df['one'].notna())

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


### Calculations with missing values

In [11]:
# sum() skips NaN entries by default (skipna=True), so missing values add
# nothing to the total -- the same result as treating them as zero.
print(df['one'].sum(skipna=True))

2.1932210511569448


In [12]:
# A frame built from labels alone holds nothing but NaN. Summing an all-NaN
# column returns 0 by default; pass min_count=1 to sum() to get NaN instead.
df_na = pd.DataFrame(columns=['one', 'two'], index=list(range(6)))

print(df_na)

print('*'*50)

print(df_na['one'].sum())

   one  two
0  NaN  NaN
1  NaN  NaN
2  NaN  NaN
3  NaN  NaN
4  NaN  NaN
5  NaN  NaN
**************************************************
0


### Cleaning missing data

In [22]:
# Replacing NaN with a scalar value

df = pd.DataFrame(
    np.random.randn(3, 3),
    index=list('ace'),
    columns=['one', 'two', 'three'],
)

print(df)

print('*'*50)

# Row 'b' has no source data and becomes all NaN; row 'e' is not requested
# and is dropped.
df = df.reindex(list('abc'))

print(df)

print('*'*50)

# fillna returns a new frame with every NaN replaced by 0; df itself is
# not modified.
print(df.fillna(0))

        one       two     three
a -1.151204  1.015154  0.303267
c  0.588296  0.423108  1.975285
e -0.374714  1.330412  1.254876
**************************************************
        one       two     three
a -1.151204  1.015154  0.303267
b       NaN       NaN       NaN
c  0.588296  0.423108  1.975285
**************************************************
        one       two     three
a -1.151204  1.015154  0.303267
b  0.000000  0.000000  0.000000
c  0.588296  0.423108  1.975285


### Drop missing values

In [23]:
import pandas as pd
import numpy as np

# Five populated rows (a, c, e, f, h) of random values in three columns.
row_labels = ['a', 'c', 'e', 'f', 'h']
df = pd.DataFrame(np.random.randn(len(row_labels), 3),
                  index=row_labels,
                  columns=['one', 'two', 'three'])
del row_labels  # temporary helper; keep the notebook namespace unchanged

print(df)

print('*'*50)

# Expanding the index to a..h leaves rows b, d and g entirely NaN, which
# gives dropna something to remove.
df = df.reindex(list('abcdefgh'))

print(df)

        one       two     three
a  0.608955 -1.549448  0.676293
c  0.317040 -0.675192  0.888744
e  0.640266  0.029366 -1.193172
f  0.206575  0.958447  0.058521
h -0.078746 -0.931406  0.340437
**************************************************
        one       two     three
a  0.608955 -1.549448  0.676293
b       NaN       NaN       NaN
c  0.317040 -0.675192  0.888744
d       NaN       NaN       NaN
e  0.640266  0.029366 -1.193172
f  0.206575  0.958447  0.058521
g       NaN       NaN       NaN
h -0.078746 -0.931406  0.340437


In [24]:
# Drop every row that contains at least one NaN (axis=0 is the default).
print(df.dropna(axis=0))

        one       two     three
a  0.608955 -1.549448  0.676293
c  0.317040 -0.675192  0.888744
e  0.640266  0.029366 -1.193172
f  0.206575  0.958447  0.058521
h -0.078746 -0.931406  0.340437


In [14]:
# Drop every column that contains at least one NaN. All three columns have
# NaN in rows b, d and g, so no columns remain.
print(df.dropna(axis='columns'))

Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


## Data

Numbers

Categorical

## A Small Case Study

http://analytics-magazine.org/missing-values/

<b>Major takeaways:</b>
1. Missing values in the data create uncertainty for the analyst and the information consumer because decisions need to be made without having the full picture.
2. Missing values can also reduce the number of usable records for the analysis, or force analysts to eliminate variables from the analysis.
3. Consequently, if an observation has a missing value in any of the required variables, the whole observation (data record) needs to be omitted from the analysis.
4. The alternatives are to exclude the affected variable from the analysis entirely, or to impute (fill in) values for the missing data points.
