In [2]:
%matplotlib inline
from IPython.display import YouTubeVideo
YouTubeVideo('3yaIOsjiSm0')

The main advantage of using Python for Data Science is the fast REPL.
Make a script that will execute the housekeeping tasks like creating input and output folders, getting json data, etc.
This script will take data, manipulate it, pipe it to python scripts, etc.

Combine Pig with python for scalability.

```python

def main():
    for line in sys.stdin:
        # do something
if __name__ == "__main__":
    main()
    
#Dictionary comprehension
new_dict = {"new_"+k : val  for k, val in old_dict.items()}

```

In [8]:
dict_from_tuples = dict([ ("a",1),("b",2) ])
tuples_again = dict_from_tuples.items()
print(tuples_again)

dict_items([('b', 2), ('a', 1)])


In [20]:
# zip to combine lists
# For making a dictionary of days of week indexed by numbers
print(range(7))
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
print(days)
day_of_week = dict(zip(range(7), days))
print(day_of_week)

range(0, 7)
['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
{0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}


In [23]:
#dealing with list indexes
data = [["Bob",50],["Jane",35]]

#bad way
for line in data:
    name = line[0]
    age = line[1]
    #doprocessing(name,age)
# difficult to maintain and change if there are a lot of columns

#good way
cols = ["name", "age"]

for line in data:
    record = dict(zip(cols,line))
    print(record["name"])
    #doprocessing(record)

#good way 2
employees = [dict(zip(cols,record)) for record in data]
print([emp["name"] for emp in employees])

Bob
Jane
['Bob', 'Jane']


In [28]:
#String manipulation, split, join
date_string = "2015-04-15"
year,month,day = [int(x) for x in date_string.split("-")]
print(year,month,day)

#combine list of strings
s1 = "This is a string"
s2 = "and so is this"
print("\t".join([s1,s2]))  # Use tab character to join this list of strings

2015 4 15
This is a string	and so is this


In [34]:
# Templating
hw_template = "Hello %s!"
print(hw_template % "world")

# Template method 2
emp_template = "Emp name : %(name)s\t\tEmp age : %(age)i years old"
print(emp_template % {"name":"Jane", "age":35})

# Template method 3
month = "Jan"
url_template = 'http://whatever.com/{month}'
print(url_template.format(month=month))

Hello world!
Emp name : Jane		Emp age : 35 years old
http://whatever.com/Jan


In [61]:
# String functions
print("hello".endswith("lo"))
print("waterbottle".startswith("wa"))
print("world   \t\n".strip() == "world")
print("world !".strip() == "world !")  # will not strip white space inside
print("world".rjust(7,"-"))
"world".replace("or","i")
print("1234".isdigit())      # only checks if the contents are numbers

True
True
True
True
--world
True


Use iterators rather than lists.
Lists are stored in memory
Iterators are processed one at a time

```python
#Example
for line in sys.stdin:


ys = [f(x) for x in xs] #List
ys = (f(x) for x in xs) #Iterator
```


In [6]:
xs = [34,45,62,46,24]

#Iterator
def apply_f(xs):
    for x in xs:
        yield f(x)
ys = apply_f(xs)


**Use itertools for efficient processing, that do not have to load the entire data in the memory**

In [16]:
#Use itertools for iterators
try:
    from itertools import izip
except ImportError:
    izip = zip
# list1 = [1,2,3,4,5]
list1 = list(range(1,6))
list2 = ["a","b","c","d"] # No e, so only the first 4 items can be paired
pairs = izip(list1, list2)
print(pairs)  # Contents not in ram
for p in pairs:
    print(p)

# permutations and combinations
# it.permutations
triples = it.combinations(list1,3)
for t in triples:
    print(t)

<zip object at 0x107218e08>
(1, 'a')
(2, 'b')
(3, 'c')
(4, 'd')
(1, 2, 3)
(1, 2, 4)
(1, 2, 5)
(1, 3, 4)
(1, 3, 5)
(1, 4, 5)
(2, 3, 4)
(2, 3, 5)
(2, 4, 5)
(3, 4, 5)


Itertools groupby function
http://stackoverflow.com/questions/773/how-do-i-use-pythons-itertools-groupby/7286#7286

In [37]:
from itertools import groupby
# groupby iterator can group a SORTED list
things = [("animal", "bear"), ("animal", "duck"), ("plant", "cactus"), ("vehicle", "speed boat"), ("vehicle", "school bus")]
groups = groupby(things, key=lambda x: x[0])

# Method 1
for group in groups:
    print(group[0]) # First element, group name, of the tuple. Second element is an iterator
    for item in group[1]:
        print(item[1])  # First element is group name itself. Structure is same as the original tuple
    print()

#Method 2
groups = groupby(things, key=lambda x: x[0])  # Iterator has expired and has to be reinitiated
for key,group in groups:
    print(key)
    for item in group:
        print(item[1],"is in",item[0])
    print()

    
# Method 3
for key, group in groupby(things, lambda x: x[0]):
    listOfThings = " and ".join([thing[1] for thing in group])
    print(key + "s:  " + listOfThings + ".")
    

animal
bear
duck

plant
cactus

vehicle
speed boat
school bus

animal
bear is in animal
duck is in animal

plant
cactus is in plant

vehicle
speed boat is in vehicle
school bus is in vehicle

animals:  bear and duck.
plants:  cactus.
vehicles:  speed boat and school bus.


**Use Sets**
```python
unique_categories = set([ x["category"] for x in records ])

#frozensets are immutable
cities = frozenset(["Frankfurt", "Basel","Freiburg"])
cities.add("Delhi")  #Error
```

### Libraries
Scikit-Learn : ML  
Pandas : Data frames  
SciPy: Numerical Computing  
Matplotlib : Graphics  
NumPy : Arrays  