In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from scipy.stats import linregress 

In [2]:

my_arr = np.arange(1_000_000)


In [3]:
my_list = list(range(1_000_000))


In [4]:
%timeit my_arr2 = my_arr * 2

687 µs ± 2.18 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [5]:
%timeit my_list2 = [x * 2 for x in my_list]

26.1 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
rand_array = np.random.rand(2, 3)

In [7]:
rand_array

array([[0.10239212, 0.96392615, 0.31742421],
       [0.46348583, 0.06880787, 0.15477218]])

In [8]:
b = np.array([[1,2,3], [4,5,6], [7,8,9]])
b

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [9]:
a = np.random.rand(2, 3, 4)
a

array([[[0.61797903, 0.31603402, 0.34277522, 0.11839234],
        [0.19547735, 0.66781899, 0.22049953, 0.45050008],
        [0.40229995, 0.0058528 , 0.17177523, 0.97758379]],

       [[0.89393447, 0.37351645, 0.5077199 , 0.7701795 ],
        [0.12924579, 0.23675485, 0.56842457, 0.71941251],
        [0.03977912, 0.8609612 , 0.04091914, 0.8722764 ]]])

In [10]:
a[1, 0: 2, 1:]

array([[0.37351645, 0.5077199 , 0.7701795 ],
       [0.23675485, 0.56842457, 0.71941251]])

In [11]:
a.ndim

3

In [12]:
a.dtype

dtype('float64')

In [13]:
from timeit import Timer

In [14]:
my_list = list(range(10**6))
my_array = np.array(my_list)

In [15]:
def for_add():
    return [item + 1 for item in my_list]

def vec_add():
    return my_array + 1

In [16]:
print('For-loop addition:')
print(min(Timer(for_add).repeat(10, 10)))
print('Vectorized addition:')
print(min(Timer(vec_add).repeat(10, 10)))


For-loop addition:
0.24452895899958094
Vectorized addition:
0.0046170420027920045


In [17]:
def for_mul():
    return [item * 2 for item in my_list]
def vec_mul():
    return my_array * 2
print('For-loop multiplication:')
print(min(Timer(for_mul).repeat(10, 10)))
print('Vectorized multiplication:')
print(min(Timer(vec_mul).repeat(10, 10)))

For-loop multiplication:
0.2568689580002683
Vectorized multiplication:
0.003221249997295672


In [18]:
import math
def for_sqrt():
    return [math.sqrt(item) for item in my_list]
def vec_sqrt():
    return np.sqrt(my_array)
print('For-loop square root:')
print(min(Timer(for_sqrt).repeat(10, 10)))
print('Vectorized square root:')
print(min(Timer(vec_sqrt).repeat(10, 10)))

For-loop square root:
0.6269532080004865
Vectorized square root:
0.007550250000349479


In [19]:
sample = np.random.normal()

In [20]:
sample

-0.3638052939724019

In [21]:
sample = np.random.normal(loc=100, scale=10, size=(2, 3))
sample

array([[104.7418948 , 100.29686355, 102.17427339],
       [100.9702739 , 109.22199579,  85.49193748]])

In [22]:
samples = np.random.poisson(lam=100, size=(2, 3))
samples

array([[106, 104,  99],
       [ 91,  98,  94]])

In [23]:
my_dict = {'coll1': [2, 1, 4], 'col2': np.array([3, 4, 2]), 'col3': [4, 5,6], 'col4': np.array([7,3, 4]),}

df = pd.DataFrame(my_dict)
df

Unnamed: 0,coll1,col2,col3,col4
0,2,3,4,7
1,1,4,5,3
2,4,2,6,4


In [24]:
df.loc[2]

coll1    4
col2     2
col3     6
col4     4
Name: 2, dtype: int64

In [25]:
df.loc[2, ['col2', 'col3']]

col2    2
col3    6
Name: 2, dtype: int64

In [26]:
import pandas as pd

In [27]:
my_dict = {'col1': [1, 2], 'col2': np.array([3, 4]),'col3': [5, 6]}

In [28]:
df = pd.DataFrame(my_dict)

In [29]:
df

Unnamed: 0,col1,col2,col3
0,1,3,5
1,2,4,6


In [30]:
my_array = np.array([[1,3,5],[2,4,6]])
alt_df = pd.DataFrame(my_array, columns=['col1', 'col2', 'col3'])
alt_df

Unnamed: 0,col1,col2,col3
0,1,3,5
1,2,4,6


In [31]:
df.loc[0]

col1    1
col2    3
col3    5
Name: 0, dtype: int64

In [32]:

df.iloc[0]

col1    1
col2    3
col3    5
Name: 0, dtype: int64

In [33]:
df.loc[0, ['col2', 'col3']]

col2    3
col3    5
Name: 0, dtype: int64

In [34]:
df['col4'] = [10,10]

In [35]:
df.loc[3] = [1,2,3,4]

In [36]:
df

Unnamed: 0,col1,col2,col3,col4
0,1,3,5,10
1,2,4,6,10
3,1,2,3,4


In [37]:
df.loc[[False, True, False], [False, True, False, True]]
df

Unnamed: 0,col1,col2,col3,col4
0,1,3,5,10
1,2,4,6,10
3,1,2,3,4


In [38]:
df.loc[:, df.loc[0] > 5]

Unnamed: 0,col4
0,10
1,10
3,4


In [39]:
# Manipulating DataFrames

In [40]:
df = pd.read_csv('dataset.csv', index_col='id')

In [41]:
df

Unnamed: 0_level_0,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1.0,3.0
1,1,0.0,9.0
2,1,3.0,
3,2,0.0,10.0
4,1,,4.0
5,2,2.0,3.0


In [42]:
df = df.rename(columns={'x': 'col_x', 'y': 'col_y', 'z': 'col_z'})
df = df.fillna(0)
df = df.astype(int)
df

Unnamed: 0_level_0,col_x,col_y,col_z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1,3
1,1,0,9
2,1,3,0
3,2,0,10
4,1,0,4
5,2,2,3


In [44]:
df = df.drop([1, 2, 4], axis=0)

In [45]:
df


Unnamed: 0_level_0,col_x,col_y,col_z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1,3
3,2,0,10
5,2,2,3


In [48]:
zero_df = pd.DataFrame(np.zeros((2, 3)), columns=['col_x', 'col_y', 'col_z'])
zero_df

Unnamed: 0,col_x,col_y,col_z
0,0.0,0.0,0.0
1,0.0,0.0,0.0


In [50]:
df = pd.concat([df, zero_df], axis=0)
df

Unnamed: 0,col_x,col_y,col_z
0,1.0,1.0,3.0
3,2.0,0.0,10.0
5,2.0,2.0,3.0
0,0.0,0.0,0.0
1,0.0,0.0,0.0
0,0.0,0.0,0.0
1,0.0,0.0,0.0
