In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Display configuration: plain-text (non-HTML) reprs in the notebook, and at most
# 10 columns / 10 rows shown before pandas truncates the output.
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = 10
pd.options.display.max_rows = 10

In [8]:
# IPython magic: render matplotlib figures inline, below the cell that creates them.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to figures created after this point.
mpl.style.use('ggplot')

### The pandas <u>Series</u> object

#### Series Instantiation

In [32]:
# Build a Series from a NumPy array; no index is given, so pandas supplies the default one.
array1 = np.array([1, 2, 3, 4])
print(array1, '\n')

series1 = pd.Series(data=array1)
print(series1)

[1 2 3 4] 

0    1
1    2
2    3
3    4
dtype: int32


In [33]:
# Build a Series from a plain Python list of integers (default index again).
list2 = [1, 2, 3, 4]
print(list2, '\n')

series2 = pd.Series(data=list2)
print(series2)

[1, 2, 3, 4] 

0    1
1    2
2    3
3    4
dtype: int64


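<b>When you instantiate a Series from a NumPy array, the Series keeps the array's own dtype. Here that is int32 because NumPy's default integer type was 32-bit on the platform that produced this output (e.g. Windows with NumPy older than 2.0); on most other platforms the same code yields int64.</b> <br>
<b>When you instantiate a Series from a Python list of integers, pandas infers the dtype itself and assigns int64 to the Series data.</b>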

In [60]:
# Build a Series from a Python list, supplying string (non-integer) index labels explicitly.
list3 = [1, 2, 3, 4]
print(list3, '\n')

series3 = pd.Series(data=list3, index=list('abcd'))
print(series3)

[1, 2, 3, 4] 

a    1
b    2
c    3
d    4
dtype: int64


In [61]:
# Build a Series from a Python list, this time supplying INTEGER index labels explicitly.
list4 = [1, 2, 3, 4]
print(list4, '\n')

series4 = pd.Series(data=list4, index=[0, 1, 2, 3])
print(series4)

[1, 2, 3, 4] 

0    1
1    2
2    3
3    4
dtype: int64


#### Selecting data from a Series

In [53]:
# Selecting from a Series whose index holds integer labels: the keys are matched against the
# index LABELS, not zero-based positions. `.loc` states that label-based intent explicitly.
print(series1.loc[[1, 3]], '\n')       # list of labels -> returns a Series
print(series1.loc[3])                  # single label   -> returns a scalar integer

1    2
3    4
dtype: int32 

4


In [63]:
# Selecting from a Series whose index holds non-integer labels.
print(series3[['a', 'c']], '\n')        # list of labels -> returns a Series
print(series3['a'], '\n')               # single label   -> returns a scalar

# Zero-based positional selection goes through `.iloc`. Plain `series3[[0, 2]]` relied on an
# integer-position fallback of `[]` that pandas has deprecated (integer keys become labels).
print(series3.iloc[[0, 2]], '\n')

# On an integer-labeled index the same kind of key is a LABEL, so select with `.loc`.
print(series4, '\n')
print(series4.loc[[1, 3]])

a    1
c    3
dtype: int64 

1 

a    1
c    3
dtype: int64 

0    1
1    2
2    3
3    4
dtype: int64 

1    2
3    4
dtype: int64


#### The Series index

In [66]:
# series1 / series2 were built without an explicit index, so they carry the default RangeIndex.
# series3 carries the string labels it was given.
# series4 was given an explicit list of integers, so its index is a materialised integer index
# (displayed as Int64Index in the output recorded below) rather than a RangeIndex.
for labelled_series in (series1, series2, series3, series4):
    print(labelled_series.index, '\n')

RangeIndex(start=0, stop=4, step=1) 

RangeIndex(start=0, stop=4, step=1) 

Index(['a', 'b', 'c', 'd'], dtype='object') 

Int64Index([0, 1, 2, 3], dtype='int64') 



Index(['a', 'b', 'c', 'd'], dtype='object')

#### A time-series Series

In [82]:
# pd.date_range builds a DatetimeIndex, the pandas index type specialised for dates and times.
dates_ts1 = pd.date_range(start='2023-06-01', end='2023-06-06')

print(dates_ts1, '\n')
print(dates_ts1[[0, 2]], '\n')     # list of positions -> a shorter DatetimeIndex
print(dates_ts1[3], '\n')          # single position   -> one Timestamp

DatetimeIndex(['2023-06-01', '2023-06-02', '2023-06-03', '2023-06-04',
               '2023-06-05', '2023-06-06'],
              dtype='datetime64[ns]', freq='D') 

DatetimeIndex(['2023-06-01', '2023-06-03'], dtype='datetime64[ns]', freq=None) 

2023-06-04 00:00:00 



In [92]:
# A second, shorter date range: the four days lying inside dates_ts1.
dates_ts2 = pd.date_range(start='2023-06-02', end='2023-06-05')

# Price Series keyed by date: 1 and 3 cover all six days of dates_ts1, 2 only the four inner days.
series_prices1 = pd.Series(data=[80, 82, 85, 90, 83, 87], index=dates_ts1)
series_prices2 = pd.Series(data=[75, 69, 83, 79], index=dates_ts2)
series_prices3 = pd.Series(data=[70, 75, 69, 83, 79, 77], index=dates_ts1)

print(series_prices1, '\n')
print(series_prices2)

2023-06-01    80
2023-06-02    82
2023-06-03    85
2023-06-04    90
2023-06-05    83
2023-06-06    87
Freq: D, dtype: int64 

2023-06-02    75
2023-06-03    69
2023-06-04    83
2023-06-05    79
Freq: D, dtype: int64


#### Basic statistics on Series data

In [90]:
# Arithmetic mean of each price Series.
series_prices1_mean, series_prices2_mean = (
    prices.mean() for prices in (series_prices1, series_prices2)
)

print(series_prices1_mean, '\n')
print(series_prices2_mean, '\n')

84.5 

76.5 



In [91]:
# Subtraction aligns the two Series on their index labels. 2023-06-01 and 2023-06-06 exist only
# in series_prices1, so the first and last rows come out as NaN and the result is float64.
series_prices_diff = series_prices1.sub(series_prices2)

print(series_prices_diff)

2023-06-01     NaN
2023-06-02     7.0
2023-06-03    16.0
2023-06-04     7.0
2023-06-05     4.0
2023-06-06     NaN
Freq: D, dtype: float64


### The pandas <u>DataFrame</u> object

#### A pandas DataFrame is a collection of one or more pandas Series aligned by a common index.

#### DataFrame Instantiation

In [97]:
# Each keyword becomes a column name; the Series are aligned on their shared date index.
# df2 pairs a six-day Series with a four-day one, so its TKWY_NA column has NaN on the
# first and last dates.
df1 = pd.DataFrame(data=dict(IBM=series_prices1, MSFT=series_prices3))
df2 = pd.DataFrame(data=dict(IBM=series_prices1, TKWY_NA=series_prices2))

print(df1, '\n')
print(df2)

            IBM  MSFT
2023-06-01   80    70
2023-06-02   82    75
2023-06-03   85    69
2023-06-04   90    83
2023-06-05   83    79
2023-06-06   87    77 

            IBM  TKWY_NA
2023-06-01   80      NaN
2023-06-02   82     75.0
2023-06-03   85     69.0
2023-06-04   90     83.0
2023-06-05   83     79.0
2023-06-06   87      NaN


#### Selecting data from a DataFrame

In [105]:
# Three ways of selecting columns from a DataFrame.
print(df1['IBM'], '\n')                          # a single column label returns a pandas Series
print(df1.IBM, '\n')                             # attribute access returns the same Series
print(df1[['MSFT', 'IBM']], '\n')                # a list of labels returns a pandas DataFrame, columns in the order requested

2023-06-01    80
2023-06-02    82
2023-06-03    85
2023-06-04    90
2023-06-05    83
2023-06-06    87
Freq: D, Name: IBM, dtype: int64 

2023-06-01    80
2023-06-02    82
2023-06-03    85
2023-06-04    90
2023-06-05    83
2023-06-06    87
Freq: D, Name: IBM, dtype: int64 

            MSFT  IBM
2023-06-01    70   80
2023-06-02    75   82
2023-06-03    69   85
2023-06-04    83   90
2023-06-05    79   83
2023-06-06    77   87 



In [104]:
# Extract the IBM column as a Series, then pick rows by zero-based POSITION with `.iloc`.
# Plain `[[1, 4]]` on this date-indexed Series relied on the integer-position fallback of `[]`,
# which pandas has deprecated (integer keys are to be treated as labels).
df1['IBM'].iloc[[1, 4]]

2023-06-02    82
2023-06-05    83
Freq: 3D, Name: IBM, dtype: int64

In [106]:
# Arithmetic between two columns is the same as arithmetic between two separate Series:
# values are matched row by row on the shared date index.
df1['IBM'] - df1['MSFT']

2023-06-01    10
2023-06-02     7
2023-06-03    16
2023-06-04     7
2023-06-05     4
2023-06-06    10
Freq: D, dtype: int64

In [108]:
# Store the per-day IBM-minus-MSFT difference as a new column on df1.
df1['Difference'] = df1['IBM'].sub(df1['MSFT'])

print(df1)

            IBM  MSFT  Difference
2023-06-01   80    70          10
2023-06-02   82    75           7
2023-06-03   85    69          16
2023-06-04   90    83           7
2023-06-05   83    79           4
2023-06-06   87    77          10


In [109]:
# Row labels of the DataFrame: the DatetimeIndex shared by the Series it was built from.
df1.index

DatetimeIndex(['2023-06-01', '2023-06-02', '2023-06-03', '2023-06-04',
               '2023-06-05', '2023-06-06'],
              dtype='datetime64[ns]', freq='D')

In [111]:
# Column labels of the DataFrame, including the 'Difference' column assigned earlier.
df1.columns

Index(['IBM', 'MSFT', 'Difference'], dtype='object')