# Data loading, storage and file format

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [3]:
pd.read_table('ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
pd.read_csv('ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
names = ['a', 'b', 'c', 'd', 'message']

In [7]:
pd.read_csv('ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [8]:
pd.read_csv('ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


# Hierarchial indexing

In [9]:
parsed = pd.read_csv('csv_mindex.csv', index_col=['key1', 'key2'])
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


# Text file

In [10]:
list(open('ex3.txt'))

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [11]:
result = pd.read_table('ex3.txt', sep='\s+')
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


# Reading file in pieces

In [12]:
pd.read_csv('ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [13]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x28f1a3cbc48>

In [14]:
from pandas import Series

In [15]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)
tot = Series([],dtype= 'object')
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

In [16]:
tot[10:20]

A    320
B    302
C    286
D    320
E    368
F    335
G    308
H    330
I    327
J    337
dtype: object

# Web Scrapping 

In [17]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [18]:
url = 'https://www.basketball-reference.com/leagues/NBA_2020_totals.html'
req = requests.get(url)
req 

<Response [200]>

In [19]:
soup = BeautifulSoup(req.content, 'html.parser')
#print(soup) #To verify output (VERY LONG)

In [20]:
tablea = soup.find(name='table')

In [21]:
df = pd.read_html(str(tablea))[0].set_index('Rk') #We use "[0]" because the output would be a list with a single element, and we need that element, not the list
df.head()

Unnamed: 0_level_0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Steven Adams,C,26,OKC,63,63,1680,283,478,0.592,...,0.582,207,376,583,146,51,67,94,122,684
2,Bam Adebayo,PF,22,MIA,72,72,2417,440,790,0.557,...,0.691,176,559,735,368,82,93,204,182,1146
3,LaMarcus Aldridge,C,34,SAS,53,53,1754,391,793,0.493,...,0.827,103,289,392,129,36,87,74,128,1001
4,Kyle Alexander,C,23,MIA,2,0,13,1,2,0.5,...,,2,1,3,0,0,0,1,1,2
5,Nickeil Alexander-Walker,SG,21,NOP,47,1,591,98,266,0.368,...,0.676,9,75,84,89,17,8,54,57,267


# Writing out to text format

In [22]:
data = pd.read_csv('ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [23]:
data.to_csv('out.csv')

In [24]:
import sys

In [25]:
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [26]:
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [27]:
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


# Writing CSV with series

In [28]:
dates = pd.date_range('1/1/2000', periods=7)

In [29]:
import numpy as np

In [30]:
ts = Series(np.arange(7), index=dates)

In [31]:
ts.to_csv('tseries.csv')

  """Entry point for launching an IPython kernel.


In [32]:
df = pd.read_csv('tseries.csv', parse_dates=True)
df

Unnamed: 0,2000-01-01,0
0,2000-01-02,1
1,2000-01-03,2
2,2000-01-04,3
3,2000-01-05,4
4,2000-01-06,5
5,2000-01-07,6


# Numpy

# Creating nd array

In [33]:
import numpy as np

In [34]:
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

array([6. , 7.5, 8. , 0. , 1. ])

In [35]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [36]:
arr2.ndim

2

In [37]:
arr2.shape

(2, 4)

In [38]:
arr2.dtype

dtype('int32')

# creating new array with zeros and ones

In [39]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [40]:
np.zeros((3, 6))

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [41]:
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [42]:
x = np.empty((2, 3, 2))
x

array([[[0.00000000e+000, 9.88131292e-324],
        [1.39010905e-311, 1.39011108e-311],
        [1.39011108e-311, 1.39011108e-311]],

       [[1.39010919e-311, 1.39010919e-311],
        [1.39010561e-311, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000]]])

In [43]:
x.ndim

3

In [44]:
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [45]:
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr1

array([1., 2., 3.])

In [46]:
arr2 = np.array([1, 2, 3], dtype=np.int32)
arr2

array([1, 2, 3])

In [47]:
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
arr

array([ 3.7, -1.2, -2.6,  0.5, 12.9, 10.1])

In [48]:
arr.astype(np.int32)

array([ 3, -1, -2,  0, 12, 10])

In [49]:
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.string_)
y = numeric_strings.astype(float)
y

array([ 1.25, -9.6 , 42.  ])

# Operation between arrays and scalars

In [50]:
arr1 = np.array([[1., 2., 3.], [4., 5., 6.]])
arr1

array([[1., 2., 3.],
       [4., 5., 6.]])

In [51]:
arr2 = np.array([[3., 5., 9.],[5. ,6. ,8.]])
arr2

array([[3., 5., 9.],
       [5., 6., 8.]])

In [52]:
arr1 * arr2

array([[ 3., 10., 27.],
       [20., 30., 48.]])

In [53]:
arr1 - arr2

array([[-2., -3., -6.],
       [-1., -1., -2.]])

In [54]:
1 / arr1

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [55]:
arr1 ** 2

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

# Basic indexing and slicing

In [56]:
arr = np.array([1,5,6,8,9,10])
arr

array([ 1,  5,  6,  8,  9, 10])

In [57]:
arr[2]

6

In [58]:
arr[3:5]

array([8, 9])

In [59]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d[2]

array([7, 8, 9])

In [60]:
arr2d[0][2]

3

In [61]:
arr2d[0, 2]

3

In [62]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [63]:
arr2d[2, :]

array([7, 8, 9])

In [64]:
arr2d[:,:2]

array([[1, 2],
       [4, 5],
       [7, 8]])

In [65]:
arr2d[1, :2]

array([4, 5])

# Transposing and axes swapping

In [66]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [67]:
x = arr.T
x

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [68]:
arr = np.arange(16).reshape((2, 2, 4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [69]:
arr.transpose((1, 0, 2))

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

In [70]:
arr.swapaxes(1, 2)

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

# Universal function

In [71]:
arr = np.arange(10)

In [72]:
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [73]:
np.exp(arr)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])

# mathematical and statistical methods

In [74]:
arr = np.random.randn(5, 4) # normally-distributed data
arr.mean()

0.2884715617120601

In [75]:
np.mean(arr)

0.2884715617120601

In [76]:
arr.sum()

5.769431234241202

In [77]:
arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])

In [78]:
arr.cumsum(0)

array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]], dtype=int32)

In [79]:
arr.cumprod(1)

array([[  0,   0,   0],
       [  3,  12,  60],
       [  6,  42, 336]], dtype=int32)

# Matrix Operations

In [80]:
from numpy.linalg import inv, qr

In [81]:
X = np.random.randn(5, 5)

In [82]:
mat = X.T.dot(X)

In [83]:
inv(mat)

array([[ 0.58007894, -0.20817153,  0.40076585,  0.13463447, -0.38899791],
       [-0.20817153,  0.68914375, -0.38166269,  0.10700804,  0.22027789],
       [ 0.40076585, -0.38166269,  0.79840114, -0.16487216, -0.38093736],
       [ 0.13463447,  0.10700804, -0.16487216,  0.48594879, -0.2468869 ],
       [-0.38899791,  0.22027789, -0.38093736, -0.2468869 ,  0.57946894]])

In [84]:
mat.dot(inv(mat))

array([[ 1.00000000e+00, -1.85532577e-17,  3.13645356e-17,
         6.12130824e-17,  4.51711888e-17],
       [-6.52670812e-17,  1.00000000e+00, -8.68873696e-17,
         7.40480041e-17,  3.26199337e-17],
       [ 1.03482554e-16, -2.51160284e-16,  1.00000000e+00,
        -1.68598568e-16, -2.59045028e-16],
       [-4.59815847e-16, -5.41258168e-17,  2.05683930e-17,
         1.00000000e+00,  5.26732877e-16],
       [-3.15167664e-16, -2.17319905e-16, -2.81763462e-16,
         1.64816795e-16,  1.00000000e+00]])

In [85]:
q, r = qr(mat)
r

array([[-4.40159043, -0.03272361,  3.41454827,  3.06026156,  0.24769175],
       [ 0.        , -2.34362599,  0.82937508,  3.06630635,  3.27143595],
       [ 0.        ,  0.        , -5.91748626, -6.85942646, -7.89567121],
       [ 0.        ,  0.        ,  0.        , -1.87933198, -0.58033297],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.16114937]])

# Random number generation

In [86]:
samples = np.random.normal(size=(4, 4))
samples

array([[-0.94328339, -0.42987344, -0.35436455, -0.06233981],
       [-0.45852914, -1.01373268,  1.22536642,  1.06046367],
       [-1.29862081,  0.8865348 ,  1.2268938 ,  0.53900306],
       [ 1.89808419,  0.00664103, -0.16352497,  0.05554636]])

# Method for boolean array

In [87]:
arr = np.random.randn(100)

In [88]:
(arr > 0).sum() # Number of positive values

54

In [89]:
bools = np.array([False, False, True, False])
bools.any()

True

In [90]:
bools.all()

False

# Sorting

In [95]:
arr = np.random.randn(8)

In [96]:
arr

array([-0.5624066 ,  1.35238627,  0.76391579,  0.22512839,  1.5388352 ,
       -0.97622198, -0.35522483, -0.08223457])

In [97]:
arr.sort()

In [98]:
arr

array([-0.97622198, -0.5624066 , -0.35522483, -0.08223457,  0.22512839,
        0.76391579,  1.35238627,  1.5388352 ])

# The end