# Python for Data Analysis
## Data Wrangling with pandas, NumPy & Jupyter

## Chapter 1: Preliminaries

### Navigating This Book

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Default size (in inches) for every matplotlib figure.
plt.rcParams["figure.figsize"] = (10, 6)
# Record the row limit that was in effect before it is changed below.
PREVIOUS_MAX_ROWS = pd.get_option("display.max_rows")
# Cap how much of a DataFrame / long string pandas renders.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", 80)
# Print NumPy floats with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

## Chapter 2: Python Language Basics, IPython, and Jupyter Notebooks

### IPython Basics


In [4]:

import numpy as np
# Seed NumPy's global random generator so the random draws that follow are repeatable.
np.random.seed(12345)
# Print NumPy floats with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

In [5]:

import numpy as np
# Seven draws from the standard normal distribution, one scalar call per
# element so the sequence of generator calls is unchanged.
data = [
    np.random.standard_normal()
    for _ in range(7)
]
data

[-0.20470765948471295,
 0.47894333805754824,
 -0.5194387150567381,
 -0.55573030434749,
 1.9657805725027142,
 1.3934058329729904,
 0.09290787674371767]

In [6]:
a = [1, 2, 3]

In [7]:

b = a
b

[1, 2, 3]

In [8]:

a.append(4)
b

[1, 2, 3, 4]

### Python Language Basics

In [9]:

def append_element(some_list, element) -> None:
    """Append ``element`` to ``some_list`` in place.

    The caller's object is modified directly (via its ``append`` method);
    nothing is returned.
    """
    some_list.append(element)

In [10]:
data = [1, 2, 3]
append_element(data, 4)
data

[1, 2, 3, 4]

In [11]:

a = 5
type(a)
a = "foo"
type(a)

str

In [None]:

# Adding a str and an int is not allowed: Python does not implicitly convert
# between the two. Catch the TypeError and show its message so the notebook
# still runs top to bottom (Restart & Run All) past this demonstration.
try:
    "5" + 5
except TypeError as exc:
    print(f"TypeError: {exc}")

In [13]:
a = 4.5
b = 2
# String formatting, to be visited later
print(f"a is {type(a)}, b is {type(b)}")
a / b

a is <class 'float'>, b is <class 'int'>


2.25

In [14]:
a = 5
isinstance(a, int)

True

In [15]:
a, b = 5, 4.5
# isinstance accepts a tuple of types and is True if any of them matches.
isinstance(a, (int, float))
isinstance(b, (int, float))

True

In [17]:
a = "foo"

In [18]:
getattr(a, "split")

<function str.split(sep=None, maxsplit=-1)>

#### Duck typing

In [19]:
def isiterable(obj):
    """Report whether ``obj`` can be iterated over.

    Returns True when ``iter(obj)`` succeeds and False when it raises
    TypeError. Any other exception raised by ``iter`` propagates.
    """
    try:
        iter(obj)
    except TypeError:  # iter() rejected the object
        return False
    else:
        return True

In [20]:
isiterable("a string")
isiterable([1, 2, 3])
isiterable(5)

False

In [21]:

5 - 7
12 + 21.5
5 <= 2

False

In [22]:
a = [1, 2, 3]
b = a      # a second name bound to the very same list object
c = a[:]   # a new list holding the same elements
a is b
c is not a

True

#### Binary operators and comparisons

In [23]:
a == c

True

In [24]:
a = None
a is None

True

In [25]:
a_list = ["foo", 2, [4, 5]]
a_list[2] = (3, 4)
a_list

['foo', 2, (3, 4)]

In [28]:
ival = 17239871
ival ** 6

26254519291092456596965462913230729701102721

In [29]:
fval = 7.243
fval2 = 6.78e-5

#### Scalar Types

In [30]:

3 / 2

1.5

In [31]:

3 // 2

1

In [32]:
c = """
This is a longer string that
spans multiple lines
"""

In [33]:

c.count("\n")

3

In [35]:
# Define the source string inside this cell so it runs on a fresh kernel;
# without it, `a` is not a string at this point and .replace() fails.
a = "this is a string"
# str.replace returns a new string; `a` itself is left unchanged.
b = a.replace("string", "longer string")
b

'this is a longer string'

In [36]:
a

'this is a string'

In [37]:
a = 5.6

In [38]:
s = str(a)

In [39]:
print(s)

5.6


In [40]:
s = "python"
list(s)
s[:3]

'pyt'

In [41]:
s = "12\\34"
print(s)

12\34


In [42]:
s = r"this\has\no\special\characters"
s

'this\\has\\no\\special\\characters'

In [43]:
a = "this is the first half "
b = "and this is the second half"
a + b

'this is the first half and this is the second half'

In [44]:
template = "{0:.2f} {1:s} are worth US${2:d}"

In [45]:
template.format(88.46, "Argentine Pesos", 1)

'88.46 Argentine Pesos are worth US$1'

In [46]:
amount = 10
rate = 88.46
currency = "Pesos"
result = f"{amount} {currency} is worth US${amount / rate}"

In [47]:
f"{amount} {currency} is worth US${amount / rate:.2f}"

'10 Pesos is worth US$0.11'

#### Bytes and Unicode

In [48]:
val = "español"
val

'español'

In [49]:
val_utf8 = val.encode("utf-8")
val_utf8
type(val_utf8)

bytes

In [50]:
val_utf8.decode("utf-8")

'español'

In [51]:
val.encode("latin1")
val.encode("utf-16")
val.encode("utf-16le")

b'e\x00s\x00p\x00a\x00\xf1\x00o\x00l\x00'

#### Booleans

In [52]:
True and True
False or True

True

In [53]:
int(False)
int(True)

1

In [54]:

a = True
b = False
not a
not b

True

#### Type casting


In [55]:
s = "3.14159"
fval = float(s)
type(fval)
int(fval)
bool(fval)
bool(0)

False

#### None

In [56]:
a = None
a is None
b = 5
b is not None

True

#### Dates and times

In [57]:
from datetime import datetime, date, time
dt = datetime(2011, 10, 29, 20, 30, 21)
dt.day
dt.minute

30

In [58]:
dt.date()
dt.time()

datetime.time(20, 30, 21)

In [59]:
dt.strftime("%Y-%m-%d %H:%M")

'2011-10-29 20:30'

In [60]:

datetime.strptime("20091031", "%Y%m%d")

datetime.datetime(2009, 10, 31, 0, 0)

In [61]:
dt_hour = dt.replace(minute=0, second=0)
dt_hour

datetime.datetime(2011, 10, 29, 20, 0)

In [62]:

dt

datetime.datetime(2011, 10, 29, 20, 30, 21)

In [63]:
dt2 = datetime(2011, 11, 15, 22, 30)
delta = dt2 - dt
delta
type(delta)

datetime.timedelta

In [64]:
dt
dt + delta

datetime.datetime(2011, 11, 15, 22, 30)

### Control Flow

In [65]:
a, b = 5, 7
c, d = 8, 4
# `or` short-circuits: c > d is only evaluated when a < b is false.
if a < b or c > d:
    print("Made it")

Made it


In [67]:

4 > 3 > 2 > 1

True

In [66]:
x = -5
# Report only when the value is below zero.
if x < 0:
    print("It's negative")

It's negative


In [68]:
# Branches are tested top to bottom; only the first one that matches runs.
if x < 0:
    print("It's negative")
elif x == 0:
    print("Equal to zero")
elif 0 < x < 5:  # chained comparison: 0 < x and x < 5
    print("Positive but smaller than 5")
else:
    print("Positive and larger than or equal to 5")

It's negative


#### for loops

In [69]:
for i in range(4):
    for j in range(4):
        if j <= i:
            print((i, j))
        else:
            # break leaves only the inner loop; the outer loop moves on
            # to its next value of i.
            break

(0, 0)
(1, 0)
(1, 1)
(2, 0)
(2, 1)
(2, 2)
(3, 0)
(3, 1)
(3, 2)
(3, 3)


In [70]:
range(10)
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [71]:
list(range(0, 20, 2))
list(range(5, 0, -1))

[5, 4, 3, 2, 1]

In [72]:
seq = [1, 2, 3, 4]
for i in range(len(seq)):
    print(f"element {i}: {seq[i]}")

element 0: 1
element 1: 2
element 2: 3
element 3: 4


In [73]:
total = 0
for i in range(100_000):
    # % is the modulo operator; skip numbers divisible by neither 3 nor 5
    if i % 3 != 0 and i % 5 != 0:
        continue
    total += i
print(total)

2333316668


#### while loops

In [74]:
x, total = 256, 0
while x > 0:
    # Stop as soon as the running sum has passed 500.
    if total > 500:
        break
    total += x
    x //= 2  # halve x, rounding down

In [75]:
if x < 0:
    print("negative!")
elif x == 0:
    # TODO: put something smart here
    # `pass` is a no-op that keeps this branch syntactically valid.
    pass
else:
    print("positive!")

positive!


# Kết Thúc (The End)