# Python. Типы.

In [5]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [6]:
# a = None
# b = None
# print id(a) == id(b)
# print id(5) == id(5)
# print id(172) == id(272)

# Модуль Collections

In [7]:
from collections import namedtuple, deque, \
                        Counter, OrderedDict, \
                        defaultdict

# Коллекции

## Базовые методы

In [8]:
print tuple(), (), (1,), (1,)*3
print list(), [1] * 2, [[]] * 2
print set()
print dict(), {}

() () (1,) (1, 1, 1)
[] [1, 1] [[], []]
set([])
{} {}


In [9]:
a = [1, 2, 3, 4, 5]
del a[1]
print a
print 1 in a, 2 in a, 'asd' in a

[1, 3, 4, 5]
True False False


In [10]:
d = {'key_1': 1, 'key_2': 2}
del d['key_1']
print d
print 'key_2' in d, 2 in d

{'key_2': 2}
True False


## Изменяемые vs Неизменяемые

In [11]:
# Изменяемые (list, set, dict, deque, Counter, OrderedDict, defaultdict)

lst = [1, 2, 3]
prev_id = id(lst)
print lst
lst += [4]  # lst.append(4)
new_id = id(lst)
print lst
print 'Has id changed (list)?', prev_id != new_id

print '-' * 100

# Неизменяемые (tuple, namedtuple)
tpl = (1, 2, 3)
prev_id = id(tpl)
print tpl
tpl += (4,)  # Запятая нужна! 
print tpl
new_id = id(tpl)
print 'Has id changed (tuple)?', prev_id != new_id

[1, 2, 3]
[1, 2, 3, 4]
Has id changed (list)? False
----------------------------------------------------------------------------------------------------
(1, 2, 3)
(1, 2, 3, 4)
Has id changed (tuple)? True


## Сложность
https://wiki.python.org/moin/TimeComplexity

# Tuple

In [12]:
# http://stackoverflow.com/questions/6153348/time-complexity-of-tuple-in-python
profile = (1, 'Pavel', 'Durov', 1984, 'Telegram')

In [13]:
name = profile[1:3]

In [14]:
name

('Pavel', 'Durov')

In [15]:
NAME_SLICE = slice(1, 3)

In [16]:
print profile[NAME_SLICE]

('Pavel', 'Durov')


# Namedtuple

In [17]:
from collections import namedtuple
fiels = ['vk_id', 'name', 'surname', 'year', 'work']
VkProfile = namedtuple('VkProfile', fiels)

In [18]:
profile_nt = VkProfile(vk_id=1, name='Pavel', 
                       surname='Durov', 
                       year=1984, 
                       work='Telegram')

In [19]:
profile_nt.name, profile_nt.surname

('Pavel', 'Durov')

In [20]:
profile_nt._replace(work='new job')

VkProfile(vk_id=1, name='Pavel', surname='Durov', year=1984, work='new job')

In [21]:
profile_nt._asdict()

OrderedDict([('vk_id', 1),
             ('name', 'Pavel'),
             ('surname', 'Durov'),
             ('year', 1984),
             ('work', 'Telegram')])

# List

## Базовые операции

In [22]:
a = ['start']

# Добавление одного элемента в конец списка
a.append(1)  # O(1)
print a

['start', 1]


In [23]:
# Добавление k элементов в конец списка
a.extend([2, 3])  # O(k)
print a

['start', 1, 2, 3]


In [24]:
# Доступ к элементу по индексу
print a[0]  # O(1)

start


In [25]:
# Вставка элемента внутрь списка
a.insert(2, 'new_element')  # O(n)
print a

['start', 1, 'new_element', 2, 3]


In [26]:
# Удаление элемента из списка по индексу
ret = a.pop(1)  # O(n) - с вовзращением элемента
print ret, a
del a[1]  # O(n) - без вовзращения элемента
print ret, a

1 ['start', 'new_element', 2, 3]
1 ['start', 2, 3]


In [27]:
# Сортировка https://en.wikipedia.org/wiki/Timsort
print sorted(a)  # без изменения исходного списка
print a
a.sort()  # с изменением исходного списка
print a  

[2, 3, 'start']
['start', 2, 3]
[2, 3, 'start']


In [28]:
# Наличие элемента в списке
'start' in a  # O(n) !!!

True

## Особенности копирования

### Как не работает? 

In [29]:
x = [[0]] * 2
x

[[0], [0]]

In [30]:
x[0][0] = 1
x

[[1], [1]]

In [31]:
id(x[0]) == id(x[1])

True

In [32]:
y = x + x
y

[[1], [1], [1], [1]]

In [33]:
y[0][0] = 2
y

[[2], [2], [2], [2]]

### А как работает?

In [34]:
x = [[0] for x in xrange(2)]
print x
x[0][0] = 1
print x

[[0], [0]]
[[1], [0]]


### Как скопировать список? 

In [35]:
x = [1, 2, 3]
y = x
id(x) == id(y)

True

In [36]:
y = x[:]
print 'y=x[:]', id(x) == id(y)

y = list(x)
print 'y=list(x)', id(x) == id(y)

from copy import copy
y = copy(x)
print 'y=copy(x)', id(x) == id(y)

y=x[:] False
y=list(x) False
y=copy(x) False


In [37]:
x = [[1, 2], [3, 4]]

In [38]:
y = x[:]
print 'y=x[:]', id(x[0]) == id(y[0])

y = list(x)
print 'y=list(x)', id(x[0]) == id(y[0])

from copy import copy
y = copy(x)
print 'y=copy(x)', id(x[0]) == id(y[0])

y=x[:] True
y=list(x) True
y=copy(x) True


In [39]:
from copy import deepcopy
y = deepcopy(x)
print id(x[0]) == id(y[0])

False


# Deque

In [40]:
from collections import deque

In [41]:
deq = deque([1, 2, 3])

In [42]:
deq.append(100)  # O(1)
print deq

deque([1, 2, 3, 100])


In [43]:
deq.appendleft(-7)  # O(1)
print deq

deque([-7, 1, 2, 3, 100])


In [44]:
# Добавление и удаление элемента с обеих сторон очереди работает за константное время.
# Но нет slice! 
deq[1:3]

TypeError: sequence index must be integer, not 'slice'

### Очереди с ограниченной длинной можно создать с помощью параметра max_len
deq = deque([1, 2, 3], maxlen=3)

# Множества (set, frozenset)

### Общее frozenset и set

### В Python — это хеш-сет, то есть оно может содержать только элементы, которые можно захешировать (неизменяемые типы - хешируемые, изменяемые - нет) 

In [45]:
{(2, 3)}

{(2, 3)}

In [46]:
{[2, 3]}

TypeError: unhashable type: 'list'

In [47]:
{set(), set()}

TypeError: unhashable type: 'set'

In [48]:
{frozenset(), frozenset()}

{frozenset()}

In [49]:
elems = {1, 2, 'py'}

In [50]:
print 1 in elems  # O(1) !!! 
print 'py' in elems  # O(1) !!!

True
True


In [51]:
# Объединение
print elems.union({1, 4})
print elems | {1, 4}

set([1, 2, 'py', 4])
set([1, 2, 'py', 4])


In [52]:
# Пересечение
print elems.intersection({1, 4})
print elems & {1, 4}

set([1])
set([1])


In [53]:
# Разность
print elems.difference({1, 4})
print elems - {1, 4}

set([2, 'py'])
set([2, 'py'])


In [54]:
# Вложения
a = {1, 2, 3}
b = {2, 3}
print a >= b
print a > b
print a < b

True
True
False


### Только set

In [55]:
# Добавление
a.add(5)
a.update([1, 5])
# Удаление
a.remove(5) # remove удаляет из множества существующий элемент или 
            # поднимает исключение, если элемент во множестве не содержится
a.discard(5) # discard удаляет элемент, только если он содержится во множестве

# Словари

In [56]:
d = {'a': 1, 'b': 2, 'c': 5, 'd': 'yes'}
d = dict(a=1, b=2, c=5, d='yes')

In [57]:
# Проверка наличия ключа в словаре
'a' in d  # O(1) !!!

True

In [58]:
key = 'default'
default_val = 'default'
# Следующее равносильно
print d[key] if key in d else default_val
print d.get(key, default_val)
print d[key]

default
default


KeyError: 'default'

In [59]:
print d.keys()
print d.values()
print d.items()

['a', 'c', 'b', 'd']
[1, 5, 2, 'yes']
[('a', 1), ('c', 5), ('b', 2), ('d', 'yes')]


In [60]:
# Добавление в словарь
d['z'] = 0
print d
d.update({'y': -1, 'w': -2})
print d

{'a': 1, 'c': 5, 'b': 2, 'd': 'yes', 'z': 0}
{'a': 1, 'c': 5, 'b': 2, 'd': 'yes', 'w': -2, 'y': -1, 'z': 0}


In [61]:
# Удаление из словаря
del d['a']

In [62]:
for k in d:  # тоже самое, что и d.keys()
    print k, d[k]

c 5
b 2
d yes
w -2
y -1
z 0


In [63]:
# Объединение словарей без их изменения
dct1 = {'a':1, 'b':2}
dct2 = {'b':3, 'c':4}
union = dct1.copy()
union.update(dct2)
print(union, dct1, dct2)

dct1 = {'a':1, 'b':2}
dct2 = {'b':3, 'c':4}
union  = dict(dct1, **dct2)  # про ** будет чуть позже
print(union, dct1, dct2)

({'a': 1, 'c': 4, 'b': 3}, {'a': 1, 'b': 2}, {'c': 4, 'b': 3})
({'a': 1, 'c': 4, 'b': 3}, {'a': 1, 'b': 2}, {'c': 4, 'b': 3})


In [64]:
# в Python нет switch, но есть словари) 
def fail_func():
    print('Fail')
    
def success_func():
    print('Success')
    
cases = {'case_1': fail_func, 'case_2': success_func}

cases['case_2']()

Success


## OrderedDict

In [65]:
from collections import OrderedDict
# словарь с ключами, упорядоченными по времени добавления
# Изменение значения по ключу не влияет на порядок ключей в словаре

## Counter

In [66]:
from collections import Counter

In [67]:
cnt = Counter([1, 1, 2, 1, 3])
cnt

Counter({1: 3, 2: 1, 3: 1})

In [68]:
cnt[0] += 1  # Нет ошибки !!!
cnt

Counter({0: 1, 1: 3, 2: 1, 3: 1})

In [69]:
cnt.most_common(2)

[(1, 3), (0, 1)]

In [70]:
c1 = Counter(foo=4, bar=-1) 
c2 = Counter(foo=2, bar=2)
print c1 + c2 # c1[k] + c2[k]
print c1 - c2 # c1[k] - c2[k]
print c1 & c2 # min(c1[k], c2[k])
print c1 | c2 # max(c1[k], c2[k])

Counter({'foo': 6, 'bar': 1})
Counter({'foo': 2})
Counter({'foo': 2})
Counter({'foo': 4, 'bar': 2})


# Упаковка и распаковка

In [71]:
def _max(x, y):
    if x > y:
        return x
    return y

In [72]:
_max(2, 3), _max(3, 2)

(3, 3)

In [73]:
def _max(*args):
    print 'args:', args
    max_x = float('-inf')
    for x in args:
        if x > max_x:
            max_x = x
    return max_x

In [74]:
_max(3, 1, 100, -2)

args: (3, 1, 100, -2)


100

In [75]:
lst = [3, 1, 100, -2]
_max(*lst)

args: (3, 1, 100, -2)


100

In [76]:
def return_x(**kwargs):
    print 'kwargs:', kwargs
    return kwargs.get('x')

In [77]:
return_x({'x': 100, 'y': 50})

TypeError: return_x() takes exactly 0 arguments (1 given)

In [78]:
return_x(**{'x': 100, 'y': 50})

kwargs: {'y': 50, 'x': 100}


100

# Itertools
http://nvie.com/posts/iterators-vs-generators/

In [79]:
it = iter(xrange(7))

print (next(it))
print (next(it))
print ([x for x in it])

 
print (next(it))

0
1
[2, 3, 4, 5, 6]


StopIteration: 

In [80]:
a = iter([1, 2, 3])
b = iter([4, 5])
a + b

TypeError: unsupported operand type(s) for +: 'listiterator' and 'listiterator'

In [81]:
# Объединение итераторов
from itertools import chain
for x in chain(a, b):
    print x

1
2
3
4
5


In [82]:
list(a), list(b)

([], [])

In [83]:
# Срезы
from itertools import islice
a = iter([1, 2, 3])
b = iter([4, 5])

for x in islice(a, 1, 3):
    print x

2
3


In [84]:
# перестановки
from itertools import permutations
print list(permutations('AB'))

[('A', 'B'), ('B', 'A')]


In [85]:
# сочетания без повторений
from itertools import combinations
list(combinations('ABC', 2))

[('A', 'B'), ('A', 'C'), ('B', 'C')]

In [86]:
# сочетания c повторениями
from itertools import combinations_with_replacement
list(combinations_with_replacement('ABC', 2))

[('A', 'A'), ('A', 'B'), ('A', 'C'), ('B', 'B'), ('B', 'C'), ('C', 'C')]

In [87]:
# декартово произведение
from itertools import product

# Например, когда не знаешь глубину вложенного цикла
n = 3
cycles = [
    [1, 2],
    [3, 4],
    [5, 6]
]
for indecies in product(*cycles):
    print indecies

(1, 3, 5)
(1, 3, 6)
(1, 4, 5)
(1, 4, 6)
(2, 3, 5)
(2, 3, 6)
(2, 4, 5)
(2, 4, 6)


### GroupBy

In [88]:
# данные key - value
from itertools import groupby
data = [
    ('Factory A', 'day 1', 100),
    ('Factory B', 'day 1', 200),
    ('Factory C', 'day 1', 300),
    
    ('Factory A', 'day 2', 175),
    ('Factory B', 'day 2', 115),
    ('Factory C', 'day 2', 100),
    
    ('Factory A', 'day 3', 500),
    ('Factory B', 'day 3', 800),
    ('Factory C', 'day 3', 1000)
]

In [89]:
# Сколько товара все фабрики произвели за каждый день
for (key, values) in groupby(data, lambda x: x[1]):
    print key, values

day 1 <itertools._grouper object at 0x7fc368021a90>
day 2 <itertools._grouper object at 0x7fc368021a10>
day 3 <itertools._grouper object at 0x7fc368021a90>


In [90]:
for (key, values) in groupby(data, lambda x: x[1]):
    s = 0
    for v in values:
        s += v[2]
    print key, s

day 1 600
day 2 390
day 3 2300


In [91]:
for (key, values) in groupby(data, lambda x: x[1]):
    print key, sum([v[2] for v in values])

day 1 600
day 2 390
day 3 2300


In [92]:
# Сколько каждая фабрика произвела за все дни
for (key, values) in groupby(data, lambda x: x[0]):
    print key, sum([v[2] for v in values])

Factory A 100
Factory B 200
Factory C 300
Factory A 175
Factory B 115
Factory C 100
Factory A 500
Factory B 800
Factory C 1000


In [93]:
data = sorted(data)

In [94]:
data

[('Factory A', 'day 1', 100),
 ('Factory A', 'day 2', 175),
 ('Factory A', 'day 3', 500),
 ('Factory B', 'day 1', 200),
 ('Factory B', 'day 2', 115),
 ('Factory B', 'day 3', 800),
 ('Factory C', 'day 1', 300),
 ('Factory C', 'day 2', 100),
 ('Factory C', 'day 3', 1000)]

In [95]:
for (key, values) in groupby(data, lambda x: x[0]):
    print key, sum([v[2] for v in values])

Factory A 775
Factory B 1115
Factory C 1400


# Задания

In [97]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [98]:
tit_df = pd.read_csv('titanic.csv')

In [99]:
tit_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [113]:
tit_df.groupby(['Pclass'])['Fare'].agg('mean')

Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64

In [147]:
tit_lst = list(tit_df.values)
dts_lst = list(tit_df.columns.values)
dts_dct = dict(zip(dts_lst, [x for x in range(0, len(dts_lst))]))

{'Age': 5,
 'Cabin': 10,
 'Embarked': 11,
 'Fare': 9,
 'Name': 3,
 'Parch': 7,
 'PassengerId': 0,
 'Pclass': 2,
 'Sex': 4,
 'SibSp': 6,
 'Survived': 1,
 'Ticket': 8}

In [148]:
tit_lst = sorted(tit_lst, key=lambda x: x[dts_dct['Pclass']])

In [150]:
for (key, values) in groupby(tit_lst, lambda x: x[dts_dct['Pclass']]):
    print key, np.mean([v[dts_dct['Fare']] for v in values])

1 84.1546875
2 20.6621831522
3 13.6755501018


In [151]:
def check(X):
    for (ind, val) in enumerate(X):
        if val == ind:
            return False
    return True

cnt = Counter()
for n in xrange(1, 8):
    for x in permutations(xrange(0, n)):
        if(check(x)):
            cnt[n] += 1
print cnt

Counter({7: 1854, 6: 265, 5: 44, 4: 9, 3: 2, 2: 1})


In [153]:
text = ' '.join(tit_df.Name.str.lower().values)

In [164]:
txt_cnt = Counter()
def get_tr(ind, txt):
    return txt[ind : ind + 2]


for i in range(0, len(text) - 2):
    txt_cnt[get_tr(i, text)] += 1

In [170]:
txt_cnt.values()[:10]

[129, 156, 32, 226, 6, 108, 100, 141, 204, 67]

In [171]:
def fib():
    pr, cr = 0, 1
    while(True):
        yield cr
        pr, cr = cr, pr + cr

In [172]:
f = fib()
list(islice(f, 0, 10))

[1, 1, 2, 3, 5, 8, 13, 21, 34, 55]