In [63]:
%matplotlib inline

# Day 0016 - Intro to data structures
# - 1000 days of Python 

# Setup working directory

In [124]:
pwd

'D:\\desktop\\Python\\1000 days of python'

In [2]:
# NOTE(review): this cell's execution counter (In [2]) is out of sequence with
# its neighbours (In [124], In [125]), and the recorded output names a
# directory although no target is given here — presumably the cell was edited
# after it last ran. TODO: confirm, then re-run or remove.
cd 

D:\DoTWA_Data\FR201410_NoSBET_Q2\fr_1014\Tide\TideData


In [125]:
pwd

'D:\\desktop\\Python\\1000 days of python'

## Import Libraries

In [126]:
import numpy as np
import pandas as pd

# 1. Series

The basic method to create a Series is to call:
s = pd.Series(data, index=index)
Here, data can be many different things:
a Python dict
an ndarray
a scalar value (like 5)
The passed index is a list of axis labels.

## 1.1. From ndarray



If data is an ndarray, index must be the same length as data. If no index is passed, one will be created having values [0, ..., len(data) - 1].

In [128]:
# Seed the legacy global NumPy RNG once, before the first random draw, so that
# Restart & Run All reproduces the same values in every random cell below.
np.random.seed(0)
# If data is an ndarray, index must be the same length as data.
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a   -0.740685
b   -0.193466
c    0.030491
d    0.990519
e    0.798111
dtype: float64

In [129]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [130]:
# If no index is passed, one will be created having values [0, ..., len(data) - 1].
pd.Series(np.random.randn(5))

0    1.310901
1    0.216758
2    1.209819
3    0.359696
4    0.205838
dtype: float64

## 1.2. From dict



In [131]:
# A Series can be built straight from a dict.
# With no explicit index, the labels follow the dict's key order.
d = dict(b=1, a=0, c=2)
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [132]:
#If an index is passed, the values in data corresponding to the labels in the index will be pulled out.
pd.Series(d, index=["b", "c", "d", "a"])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

## 1.3. From scalar value



In [133]:
#If data is a scalar value, an index must be provided. The value will be repeated to match the length of index.
pd.Series(5.0, index=["a", "b", "c", "d", "e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

## 1.4. Series is ndarray-like

Series acts very similarly to an ndarray, and is a valid argument to most NumPy functions. However, operations such as slicing will also slice the index.

In [134]:
# Positional access goes through .iloc: a bare integer key on a label-indexed
# Series only falls back to position via a deprecated code path.
s.iloc[0]

-0.7406854309619295

In [135]:
s[:3]

a   -0.740685
b   -0.193466
c    0.030491
dtype: float64

In [136]:
s[s > s.median()]

d    0.990519
e    0.798111
dtype: float64

In [137]:
# Select by position with .iloc; a list of integers passed to plain [] on a
# label-indexed Series relies on the deprecated positional fallback.
s.iloc[[4, 3, 1]]

e    0.798111
d    0.990519
b   -0.193466
dtype: float64

In [138]:
np.exp(s)

a    0.476787
b    0.824098
c    1.030961
d    2.692633
e    2.221340
dtype: float64

In [139]:
s.dtype

dtype('float64')

In [140]:
#If you need the actual array backing a Series, use Series.array.
s.array

<PandasArray>
[ -0.7406854309619295, -0.19346626808127967,  0.03049103485293862,
    0.990519362852863,   0.7981105170015258]
Length: 5, dtype: float64

In [141]:
#While Series is ndarray-like, if you need an actual ndarray, then use Series.to_numpy().
s.to_numpy()

array([-0.74068543, -0.19346627,  0.03049103,  0.99051936,  0.79811052])

## 1.5. Series is dict-like

In [142]:
#A Series is like a fixed-size dict in that you can get and set values by index label:
s["a"]

-0.7406854309619295

In [143]:
s["e"] = 12.0
s

a    -0.740685
b    -0.193466
c     0.030491
d     0.990519
e    12.000000
dtype: float64

In [144]:
"e" in s

True

In [145]:
"f" in s

False

In [146]:
#Using the get method, a missing label will return None or specified default:
s.get("f")

In [147]:
s.get("f", np.nan)

nan

## 1.6. Vectorized operations and label alignment with Series

When working with raw NumPy arrays, looping through value-by-value is usually not necessary. The same is true when working with Series in pandas. Series can also be passed into most NumPy methods expecting an ndarray.

In [148]:
s

a    -0.740685
b    -0.193466
c     0.030491
d     0.990519
e    12.000000
dtype: float64

In [149]:
s + s

a    -1.481371
b    -0.386933
c     0.060982
d     1.981039
e    24.000000
dtype: float64

In [150]:
s * 2

a    -1.481371
b    -0.386933
c     0.060982
d     1.981039
e    24.000000
dtype: float64

In [151]:
np.exp(s)

a         0.476787
b         0.824098
c         1.030961
d         2.692633
e    162754.791419
dtype: float64

In [152]:
#A key difference between Series and ndarray is that operations between Series automatically align the data based on label. 
s[1:]

b    -0.193466
c     0.030491
d     0.990519
e    12.000000
dtype: float64

In [153]:
s[:-1]

a   -0.740685
b   -0.193466
c    0.030491
d    0.990519
dtype: float64

In [154]:
s[1:] + s[:-1]

a         NaN
b   -0.386933
c    0.060982
d    1.981039
e         NaN
dtype: float64

## 1.7. Name attribute

In [155]:
# A Series may also carry a name; here it is supplied at construction time.
s = pd.Series(data=np.random.randn(5), name="something")
s

0   -1.188322
1   -0.236390
2    1.226109
3    1.563703
4    0.345754
Name: something, dtype: float64

In [156]:
s.name

'something'

In [157]:
#You can rename a Series with the pandas.Series.rename() method.
s2 = s.rename("different")
s2.name

'different'

# 2. DataFrame

## 2.1 From dict of Series or dicts

In [158]:
# Two Series with different indexes; the resulting DataFrame index is the
# union of both, with missing entries filled in as NaN.
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [159]:
pd.DataFrame(d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [160]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [161]:
#The row and column labels can be accessed respectively by accessing the index and columns attributes:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [162]:
df.columns

Index(['one', 'two'], dtype='object')

## 2.2 From dict of ndarrays / lists

The ndarrays must all be the same length. If an index is passed, it must clearly also be the same length as the arrays. If no index is passed, the result will be range(n), where n is the array length.

In [163]:
d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [164]:
pd.DataFrame(d, index=["a", "b", "c", "d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


## 2.3 From structured or record array

In [165]:
# Structured array with three named fields: a 32-bit int, a 32-bit float and a
# 10-byte string. "S10" is the supported spelling of the fixed-width bytes
# dtype; the "a10" alias is deprecated in NumPy 2.0.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [166]:
pd.DataFrame(data, index=["first", "second"])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [167]:
pd.DataFrame(data, columns=["C", "A", "B"])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


## 2.4 From a list of dicts

In [168]:
data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [169]:
pd.DataFrame(data2, index=["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [170]:
pd.DataFrame(data2, columns=["a", "b"])

Unnamed: 0,a,b
0,1,2
1,5,10


## 2.5 From a dict of tuples

In [171]:
pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


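## 2.6 From a Series

The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name is provided).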

## 2.7 From a list of namedtuples

The field names of the first namedtuple in the list determine the columns of the DataFrame. The remaining namedtuples (or tuples) are simply unpacked and their values are fed into the rows of the DataFrame. If any of those tuples is shorter than the first namedtuple then the later columns in the corresponding row are marked as missing values. If any are longer than the first namedtuple, a ValueError is raised.

In [172]:
from collections import namedtuple

# The field names of the first namedtuple ("x", "y") become the column labels;
# the trailing plain tuple is simply unpacked into a row.
Point = namedtuple("Point", ["x", "y"])
pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


In [173]:
Point3D = namedtuple("Point3D", "x y z")
pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])

Unnamed: 0,x,y,z
0,0,0,0.0
1,0,3,5.0
2,2,3,


## 2.8 From a list of dataclasses

In [174]:
# Every element of the list must be a dataclass instance; mixing in other
# types would result in a TypeError.
from dataclasses import make_dataclass

# A distinct name keeps the namedtuple ``Point`` defined earlier intact, so the
# namedtuple cells still work when they are re-run after this one.
PointData = make_dataclass("PointData", [("x", int), ("y", int)])
pd.DataFrame([PointData(0, 0), PointData(0, 3), PointData(2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


### 2.8.1. Missing data

Much more will be said on this topic in the Missing data section of the pandas user guide. To construct a DataFrame with missing data, we use np.nan to represent missing values. Alternatively, you may pass a numpy.MaskedArray as the data argument to the DataFrame constructor, and its masked entries will be considered missing.

## 2.9 Alternate constructors

### 2.9.1. DataFrame.from_dict

In [175]:
# Column-oriented by default: each dict key becomes a column label.
pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]})

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [176]:
# orient="index" turns each dict key into a row label; the column labels are
# then supplied explicitly.
pd.DataFrame.from_dict(
    {"A": [1, 2, 3], "B": [4, 5, 6]},
    orient="index",
    columns=["one", "two", "three"],
)


Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


### 2.9.2. DataFrame.from_records

In [177]:
data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [178]:
pd.DataFrame.from_records(data, index="C")

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
b'Hello',1,2.0
b'World',2,3.0


## 2.10 Column selection, addition, deletion

In [179]:
df["one"]

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [180]:
df["three"] = df["one"] * df["two"]
df["flag"] = df["one"] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [181]:
#Columns can be deleted or popped like with a dict:
del df["two"]
three = df.pop("three")
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [182]:
#When inserting a scalar value, it will naturally be propagated to fill the column:
df["foo"] = "bar"
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [183]:
#When inserting a Series that does not have the same index as the DataFrame, it will be conformed to the DataFrame’s index:
df["one_trunc"] = df["one"][:2]
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [184]:
#You can insert raw ndarrays but their length must match the length of the DataFrame’s index.
#By default, columns get inserted at the end. The insert function is available to insert at a particular location in the columns:
df.insert(1, "bar", df["one"])
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


## 2.11 Assigning new columns in method chains

In [185]:
# Inspired by dplyr’s mutate verb, DataFrame has an assign() 
# method that allows you to easily create new columns that are potentially derived from existing columns.


In [186]:
# assign() returns a new frame; later keyword arguments may refer to columns
# created by earlier ones in the same call (D is built from the new C).
dfa = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
dfa.assign(
    C=lambda frame: frame["A"] + frame["B"],
    D=lambda frame: frame["A"] + frame["C"],
)

Unnamed: 0,A,B,C,D
0,1,4,5,6
1,2,5,7,9
2,3,6,9,12


## 2.12 Indexing / selection

In [187]:
df.loc["b"]


one            2.0
bar            2.0
flag         False
foo            bar
one_trunc      2.0
Name: b, dtype: object

In [188]:
df.iloc[2]

one           3.0
bar           3.0
flag         True
foo           bar
one_trunc     NaN
Name: c, dtype: object

## 2.13 Data alignment and arithmetic

In [189]:
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
df + df2

Unnamed: 0,A,B,C,D
0,3.840031,-2.342414,0.040953,
1,0.527436,0.983798,-0.166394,
2,1.581687,2.19175,-0.161088,
3,0.632402,-0.363056,1.257371,
4,2.123372,-1.194215,-0.329881,
5,2.001389,-0.590005,-1.229449,
6,1.075124,1.161879,-0.044172,
7,,,,
8,,,,
9,,,,


In [190]:
#When doing an operation between DataFrame and Series, 
#the default behavior is to align the Series index on the DataFrame columns, thus broadcasting row-wise. For example:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-1.859081,2.87478,-0.771691,-0.290916
2,-1.004558,1.211877,-0.762032,0.658051
3,-2.246592,0.707123,0.287218,-0.383527
4,-1.014858,0.013856,-0.353243,-0.836661
5,-1.385349,0.737296,-0.573333,0.368507
6,-1.65122,1.827946,-1.242036,-1.507097
7,-2.261765,1.964899,-2.726154,0.281846
8,-1.329347,1.422392,-0.125083,-1.514954
9,-1.943952,2.797449,-0.330114,-0.785134


In [191]:
#Operations with scalars are just as you would expect:
df * 5 + 2

Unnamed: 0,A,B,C,D
0,10.833194,-6.40351,5.959645,2.748029
1,1.537788,7.970391,2.101188,1.29345
2,5.810404,-0.344123,2.149487,6.038286
3,-0.399768,-2.867896,7.395737,0.830393
4,5.758904,-6.334228,4.19343,-1.435278
5,3.906449,-2.717028,3.092981,4.590565
6,2.577093,2.736221,-0.250534,-4.787455
7,-0.475631,3.420986,-7.671126,4.15726
8,4.186459,0.708449,5.334229,-4.826739
9,1.113432,7.583733,4.309075,-1.177642


In [192]:
1 / df

Unnamed: 0,A,B,C,D
0,0.566047,-0.594989,1.262739,6.684232
1,-10.817546,0.837466,49.412968,-7.076641
2,1.312197,-2.132994,33.447835,1.238149
3,-2.083535,-1.027138,0.926657,-4.27494
4,1.330175,-0.599936,2.279534,-1.455486
5,2.622677,-1.059989,4.574647,1.930081
6,8.66412,6.791438,-2.221695,-0.736653
7,-2.019687,3.518684,-0.517003,2.317755
8,2.286803,-3.871313,1.499597,-0.732414
9,-5.639723,0.895458,2.16537,-1.573494


In [193]:
df ** 4

Unnamed: 0,A,B,C,D
0,9.74072,7.979265,0.3933192,0.000501
1,7.3e-05,2.032971,1.677398e-07,0.000399
2,0.337291,0.04831,7.989654e-07,0.425509
3,0.053064,0.898432,1.356198,0.002994
4,0.319422,7.719363,0.03703526,0.222827
5,0.021136,0.792125,0.002283336,0.072061
6,0.000177,0.00047,0.04104521,3.395844
7,0.060098,0.006523,13.99678,0.034652
8,0.036567,0.004452,0.1977434,3.475148
9,0.000988,1.555315,0.04548538,0.163132


In [194]:
# Element-wise boolean operators are supported too.
df1 = pd.DataFrame({"a": [True, False, True], "b": [False, True, True]})
df2 = pd.DataFrame({"a": [False, True, True], "b": [True, True, False]})
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [195]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [196]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [197]:
# Logical NOT: ~ is the inversion operator for boolean data (NumPy itself
# rejects unary minus on boolean arrays).
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


## 2.14 Transposing

In [198]:
df[:5].T

Unnamed: 0,0,1,2,3,4
A,1.766639,-0.092442,0.762081,-0.479954,0.751781
B,-1.680702,1.194078,-0.468825,-0.973579,-1.666846
C,0.791929,0.020238,0.029897,1.079147,0.438686
D,0.149606,-0.14131,0.807657,-0.233921,-0.687056


## 2.15 DataFrame interoperability with NumPy functions

In [199]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,5.851153,0.186243,2.207651,1.161376
1,0.911702,3.300514,1.020444,0.86822
2,2.14273,0.625737,1.030349,2.242648
3,0.618812,0.377729,2.94217,0.791424
4,2.120773,0.188842,1.550668,0.503055
5,1.464172,0.3893,1.244329,1.678857
6,1.122343,1.158637,0.63756,0.257306
7,0.609494,1.328695,0.144536,1.539491
8,1.548508,0.772356,1.948083,0.255292
9,0.837517,3.054899,1.586952,0.529655


In [200]:
np.asarray(df)

array([[ 1.7666388 , -1.68070201,  0.79192905,  0.14960583],
       [-0.09244241,  1.19407825,  0.0202376 , -0.14130998],
       [ 0.76208089, -0.46882457,  0.0298973 ,  0.8076572 ],
       [-0.47995354, -0.97357915,  1.07914737, -0.2339214 ],
       [ 0.75178079, -1.66684558,  0.43868607, -0.68705566],
       [ 0.38128977, -0.94340559,  0.21859611,  0.51811299],
       [ 0.11541853,  0.14724421, -0.45010686, -1.35749095],
       [-0.4951261 ,  0.28419713, -1.93422524,  0.43145191],
       [ 0.43729178, -0.25831029,  0.66684589, -1.36534789],
       [-0.17731367,  1.11674654,  0.46181491, -0.63552849]])

In [201]:
# The ufunc is applied to the underlying array in a Series.
ser = pd.Series([1, 2, 3, 4])
np.exp(ser)

0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64

In [202]:
ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
ser2 = pd.Series([1, 3, 5], index=["b", "a", "c"])
ser1

a    1
b    2
c    3
dtype: int64

In [203]:
ser2

b    1
a    3
c    5
dtype: int64

In [204]:
np.remainder(ser1, ser2)

a    1
b    0
c    3
dtype: int64

In [205]:
ser3 = pd.Series([2, 4, 6], index=["b", "c", "d"])
ser3

b    2
c    4
d    6
dtype: int64

In [206]:
np.remainder(ser1, ser3)

a    NaN
b    0.0
c    3.0
d    NaN
dtype: float64

In [207]:
ser = pd.Series([1, 2, 3])
idx = pd.Index([4, 5, 6])
np.maximum(ser, idx)

0    4
1    5
2    6
dtype: int64

## 2.16 Console display

## 2.17 DataFrame column attribute access and IPython completion

In [208]:
df = pd.DataFrame({"foo1": np.random.randn(5), "foo2": np.random.randn(5)})
df

Unnamed: 0,foo1,foo2
0,0.561902,-1.48828
1,0.781355,-0.718086
2,1.131666,1.033538
3,0.785125,0.453873
4,-0.907913,0.102113


In [209]:
df.foo1

0    0.561902
1    0.781355
2    1.131666
3    0.785125
4   -0.907913
Name: foo1, dtype: float64

In [210]:
#Reference
#https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html