# Data Science Introduction

We will be using two additional packages - `numpy` and `pandas`. They both need to be imported and usually they are given an alias.

In [1]:
import pandas as pd
import numpy as np

## Numpy array

Example of numpy `array` structure.

In [2]:
# Build a one-dimensional ndarray from a plain Python list
a = np.array([1, 2, 3])
print(a)

[1 2 3]


The array `a` has a special `ndarray` type.

In [3]:
# type() reports the class of the object; for `a` this is numpy.ndarray
print(type(a))

<class 'numpy.ndarray'>


In [None]:
# Indexing a single element returns a NumPy scalar (e.g. numpy.int64),
# not a built-in Python int
print(type(a[0]))

Check the dimension and size of the array.

In [13]:
# shape is a tuple with one entry per dimension; a 1-D array of 3 items gives (3,)
a.shape

(3,)

Create a multidimensional array.

In [5]:
# A nested list becomes a 2-D array: each inner list is one row.
# dtype=np.float64 forces the integer literals to be stored as 64-bit floats.
b = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    dtype=np.float64,
)
print(b)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]


In [None]:
# A multidimensional array is still a numpy.ndarray
print(type(b))

In [None]:
# (rows, columns) of the 2-D array built from three lists of three values
b.shape

In [None]:
# Elements carry the dtype the array was created with (np.float64)
print(type(b[0][0]))

The `strides` attribute gives us the number of bytes to step in each dimension when traversing the array.

In [6]:
# Strides are measured in bytes: 8 bytes (one float64) to reach the next column
# and 24 bytes (a full row of three float64 values) to reach the next row
b.strides

(24, 8)

Zero-dimensional array - scalar.

In [7]:
# Wrapping a bare scalar gives an ndarray with zero dimensions and an empty shape
x = np.array(42)
print(x, type(x), np.ndim(x), x.shape, sep='\n')

42
<class 'numpy.ndarray'>
0
()


In [None]:
# ndarrays are homogeneous: mixing int, str and float makes NumPy pick one
# common dtype for all elements (here a string type)
arr = np.array([1, 'martina', 7.8])
print(arr, type(arr[0]), sep='\n')

### Special arrays

Arrays of ones and zeroes

In [None]:
# 2 rows x 3 columns, every entry 1.0
ones = np.ones(shape=(2, 3), dtype=float)
print(ones)

In [None]:
# 2 rows x 3 columns, every entry 0.0 (float is the default dtype)
zeroes = np.zeros(shape=(2, 3))
print(zeroes)

Identity arrays

In [None]:
# Square 3x3 identity: ones on the main diagonal, zeros elsewhere
ident = np.identity(n=3)
print(ident)

Non-square identity-like matrix: the `k` parameter defines the position of the diagonal of ones (default 0, the main diagonal).

In [None]:
# 3 rows x 4 columns of integers with ones on the second diagonal above the main one
ident2 = np.eye(N=3, M=4, k=2, dtype=int)
print(ident2)

### Numerical operations

In [None]:
# A scalar operand is applied to every element of the array
a = np.array([1, 2, 3, 4, 5, 6, 7, 8])
a = np.add(a, 1)
print(a)

In [None]:
# adding two arrays together; `b` is created with shape (1, 8)
b = np.ones(shape=(1, 8), dtype=int)
print(b)

c = np.add(a, b)
print(c)

Matrix multiplication

In [None]:
# (2 x 3) times (3 x 2) gives a 2 x 2 result
mat1 = np.array([[1, 2, 3],
                 [4, 5, 6]])
mat2 = np.array([[1, 2],
                 [3, 4],
                 [5, 6]])
print(mat1.dot(mat2))

Equality operator

In [None]:
mat1 = np.array([[1, 2, 4],
                 [3, 4, 5],
                 [6, 7, 8]])
mat2 = np.array([[1, 2, 6],
                 [3, 7, 5],
                 [6, 4, 5]])
# element-wise comparison: one boolean per position
print(np.equal(mat1, mat2))

# complete array equality: a single boolean for the whole array
print(np.array_equal(mat1, mat2))

Structured arrays - heterogeneous data

In [None]:
# Each record has two named fields: a 20-byte string and a 32-bit integer
dt = np.dtype([
    ('country', 'S20'),
    ('population', 'i4'),
])
datatable = np.array(
    [('UK', 65640000), ('USA', 323100000), ('Russia', 144300000)],
    dtype=dt,
)

print(datatable)

In [None]:
# Indexing with a field name returns that field for every record
print(datatable['country'])

In [None]:
# The 'population' field of every record, as an integer array
print(datatable['population'])

In [None]:
# An integer index selects one whole record (here the first one) ...
ctry = datatable[0]
print(ctry)
# ... and indexing the record by position gives its first field ('country')
print(ctry[0])

NumPy matrices are strictly 2-D and support matrix operations directly, e.g. `*` performs matrix multiplication.

In [None]:
# np.mat was removed in NumPy 2.0; np.asmatrix builds the same np.matrix
# objects and is available in both older and current NumPy releases.
MA = np.asmatrix([[1,2,3], [4,5,6]])
MB = np.asmatrix([[7,8],[9,12],[10,11]])

# For np.matrix operands `*` is matrix multiplication, not element-wise
print(MA*MB)

## Numpy arange

Example of the `arange()` function. When used with integers it is almost equivalent to Python's built-in `range()` function. The values are generated within a half-open interval (start included, stop excluded), similar to slicing.

In [None]:
# Start is included, stop is excluded
a = np.arange(1, 5, step=1)
print(a)

Using optional step-size parameter.

In [None]:
# Unlike range(), arange accepts a fractional start, stop and step
a = np.arange(0.5, 2.5, step=0.5)
print(a, type(a), sep='\n')

## Pandas DataFrames

 - DataFrames are rectangular tables of data
 - Contain rows and columns
 - Columns have headings
 - Rows have index

In [None]:
# A DataFrame can be created by reading one sheet of an Excel workbook
df = pd.read_excel('FinData.xls', sheet_name='AMZN')
df

View first few rows or last few rows.

In [None]:
# First rows of the frame (pandas shows 5 when no count is given)
df.head()

In [None]:
# Last 10 rows of the frame
df.tail(10)

Access certain columns

In [None]:
# A single column label in single brackets selects that one column
df['Open']

In [None]:
# Selecting with a single label yields a pandas Series
print(type(df['Open']))

In [None]:
# Passing a list of labels (even a one-item list) keeps the result two-dimensional
df[['Open']]

In [None]:
# Selecting with a list of labels yields a DataFrame rather than a Series
print(type(df[['Open']]))

In [None]:
# multiple columns: list the labels in the order they should appear
df[['Open', 'Close', 'Adj Close']]

Getting rows

In [None]:
# .loc selects by index label; a single label returns that row as a Series
df.loc[1]

In [None]:
# A list of labels returns the matching rows as a DataFrame
df.loc[[1]]

In [None]:
# Label slice with .loc: both endpoints (5 and 90) are included
df.loc[5:90]

Functions on columns of data:
- min
- max
- mean
- median
- count

Min and max also work on string columns.

In [None]:
df['Open'].mean()
# The other aggregations are called the same way:
#df['Open'].min()
#df['Open'].max()
#df['Open'].median()
#df['Open'].count()

In [None]:
# On a DataFrame the aggregation is computed separately for each selected column
df[['Open', 'Close', 'Adj Close']].mean()

Array arithmetic with DataFrames

In [None]:
# Arithmetic between two columns is applied row by row
df['Open'] - df['Close']

In [None]:
# Comparing a column with a scalar is evaluated per row and gives a boolean Series
arr = df['High'] > 3020
# Print only a slice of the mask instead of the whole Series
print(arr[6:90])

## Numpy where

In [None]:
a = np.array([1, 2, 3, 4, 5])
# np.where(condition, x, y): take x where the condition holds, otherwise y
np.where(np.mod(a, 2) == 0, 'Even', 'Odd')

In [None]:
# Label each row 'High' when its 'High' value exceeds 3100, otherwise 'Low'
np.where(df['High'] > 3100 , 'High', 'Low')

## Time Series

In [None]:
# Load the DARK.L sheet with the 'Date' column as the (date-parsed) index.
# Note: this rebinds `df`, replacing the AMZN frame loaded earlier.
df = pd.read_excel(
    'FinData.xls',
    sheet_name='DARK.L',
    index_col='Date',
    parse_dates=True,
)
df

In [None]:
# Summary statistics for the columns of the frame
df.describe()

In [None]:
# Select every row from October 2021 by partial date string.
# Row selection by date string has to go through .loc: df['2021-10'] is treated
# as a column lookup in pandas 2.x (KeyError) and was deprecated before that.
df.loc['2021-10']

In [None]:
# Rows from May 2021 through September 2021, selected by date-label slice
df.loc['2021-May':'2021-Sep']


In [None]:
# Same date range, but the slice step keeps only every 30th row
df['2021-May':'2021-Sep':30]

In [None]:
# Line plot of the 'Open' column against the frame's index
df['Open'].plot()

In [None]:
open = df[['Open']]

In [None]:
open['2021-May':'2021-Aug'].plot()

In [None]:
# Plot the 'Open' and 'Close' columns together for May-August 2021
df2 = df[['Open', 'Close']]
df2.loc['2021-May':'2021-Aug'].plot()

In [12]:
# Pitfall: combining fractional start/step values with an integer dtype does not
# give 0.5, 1.0, 1.5, 2.0 - the result is [0 1 2 3]
x = np.arange(0.5, 2.5, 0.5, dtype=int)
print(x)


[0 1 2 3]
