# What can Python do for you?

_By Robin Linderborg_

### How do you even Python?

- Text editor (not Word!)

- Terminal

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

## Web scraping

In [None]:
import requests

# Search BBC Food for vegetarian pasta recipes and keep the raw HTML of the
# first ten result pages in `pages` (one string per page).
url = 'http://www.bbc.co.uk/food/recipes/search'
pages = []
for page in range(10):
    # The request uses page numbers starting at 1; range() counts from 0.
    r = requests.post(url,
                      {'keywords': 'pasta',
                       'diets[]': 'vegetarian',
                       'page': page + 1},
                      timeout=30)
    # Stop on 4xx/5xx answers so an error page is never stored as a
    # recipe page.
    r.raise_for_status()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Fetching page no. {}'.format(page + 1))
    pages.append(r.text)
print('SUCCESS! All recipes scraped.')

In [None]:
# Offline fallback -- skip this cell unless the wi-fi fails. It fills `pages`
# from the saved files bbc_1.html ... bbc_10.html instead of the network.
pages = []
for page_no in range(1, 11):
    backup_file = open('bbc_{}.html'.format(page_no), 'r')
    try:
        pages.append(backup_file.read())
    finally:
        # Always release the file handle, even if reading fails.
        backup_file.close()

### Let's look at the source code

In [None]:
# Display the raw HTML text of the first collected result page.
pages[0]

In [None]:
from IPython.display import Image
# Show the picture stored in brace.jpg as this cell's output.
Image("brace.jpg")

In [None]:
from bs4 import BeautifulSoup

def get_recipes(html):
    """Return the recipe entries found in one search-result page.

    Parameters
    ----------
    html : str
        Markup of a single result page.

    Returns
    -------
    list
        The elements matching ``.article`` inside the ``#article-list``
        container, without the first match. Empty when the page has no
        ``#article-list`` container.
    """
    articles = BeautifulSoup(html, 'html').select_one('#article-list')
    if articles is None:
        # select_one() gives None when nothing matches; report "no recipes"
        # instead of failing with AttributeError on the lookup below.
        return []
    # NOTE(review): the first '.article' match is skipped -- presumably it
    # is not a real recipe entry; confirm against the page markup.
    return articles.select('.article')[1:]

def parse_article(item):
    """Extract the labelled details of one recipe entry.

    Every ``h4`` element inside ``item`` is checked for the labels
    ``'By '``, ``'From '``, ``'Preparation time: '`` and
    ``'Cooking time: '``; the text following a label is stored under that
    label.

    Parameters
    ----------
    item : object
        A parsed element offering ``select(css)``; each selected ``h4``
        must expose its text as ``.text``.

    Returns
    -------
    dict
        Maps each label found to the text after it. Labels that do not
        occur in the entry are absent from the dict.
    """
    # The entry's h3 heading is not part of the returned record, so it is
    # not looked up here; entries without an h3 therefore parse fine.
    words = ['By ', 'From ', 'Preparation time: ', 'Cooking time: ']
    data = {}
    for h in item.select('h4'):
        for word in words:
            if word in h.text:
                # Split on the first occurrence only, so a value that
                # repeats its label (e.g. 'From Cooking From Scratch')
                # is kept whole.
                data[word] = h.text.split(word, 1)[1]
    return data

# Flatten the recipe entries of every collected page into one list ...
articles = [entry for page in pages for entry in get_recipes(page)]

# ... and turn each entry into a dict of its labelled details.
data = [parse_article(article) for article in articles]

In [None]:
# Display the parsed details of the first recipe.
data[0]

### An API example

In [None]:
# Fetch one random joke from the icndb.com JSON API; the joke text sits
# under value -> joke in the decoded answer.
r = requests.get('http://api.icndb.com/jokes/random', timeout=30)
# Fail with a clear HTTP error rather than a confusing JSON/KeyError.
r.raise_for_status()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(r.json()['value']['joke'])

## Data analysis

In [None]:
import pandas as pd
# Build a table from the list of recipe dicts: one row per recipe, one
# column per label key. Then preview the first rows.
df = pd.DataFrame(data)
df.head()

### How long does it take to cook a vegetarian pasta meal?

In [None]:
# Count how often each cooking-time label occurs and show the first few.
df['Cooking time: '].value_counts().head()

### Who's writing these recipes?

In [None]:
# Count the recipes per author ('By ' column) and show the first few.
df['By '].value_counts().head()

In [None]:
def parse_time(row):
    """Translate a recipe's cooking-time label into a number of minutes.

    Parameters
    ----------
    row : mapping
        A DataFrame row (or any mapping) with a ``'Cooking time: '`` entry.

    Returns
    -------
    int or float
        The minutes assigned to the label, or ``nan`` when the label is
        missing or is not one of the known labels.
    """
    # One representative minute value per label.
    # NOTE(review): 15 is not the midpoint of "10 to 30 mins", unlike the
    # 45 and 90 used for the other ranges -- confirm this is intended.
    conversion = {'10 to 30 mins': 15, 'Less than 10 mins': 5,
                  '30 mins to 1 hour': 45, 'No cooking required': 0,
                  '1 to 2 hours': 90}
    # A missing cooking time (NaN in the frame) or a label not listed above
    # would raise KeyError with plain indexing and abort the whole apply();
    # return NaN instead so such rows drop out of later averages.
    return conversion.get(row['Cooking time: '], float('nan'))

# Run parse_time on every row (axis=1) and store the result as a new
# numeric column.
df['cook_time_num'] = df.apply(parse_time, axis=1)

In [None]:
# Average numeric cooking time per author, largest first; show the top rows.
(
    df.groupby('By ')['cook_time_num']
    .mean()
    .sort_values(ascending=False)
    .head()
)

## Data visualization

In [None]:
# Number of recipes per 'From ' value, keeping the five largest groups.
books = (
    df.groupby('From ')
    .size()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Switch matplotlib to the 'fivethirtyeight' style sheet.
plt.style.use('fivethirtyeight')

In [None]:
# Bar chart with one red bar per entry in `books`, labelled by its index.
fig, ax = plt.subplots(figsize=(20, 10))
positions = range(len(books))
ax.bar(positions, books.values, tick_label=books.index,
       align='center', color='r', alpha=.6);

In [None]:
# The same bar chart in green, with an arrow annotation whose tip is at
# (0.5, 40) and whose text starts at (1, 42).
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(range(len(books)),
       books.values,
       tick_label=books.index,
       align='center',
       color='g',
       alpha=.6)

arrow_style = {'facecolor': 'black'}
ax.annotate('This book seems popular!',
            xy=(.5, 40),
            xytext=(1, 42),
            fontsize='x-large',
            arrowprops=arrow_style);

In [None]:
# Two line series drawn over the same years, on one set of axes.
years = [2010, 2011, 2012, 2013, 2014, 2015]
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(years, [80, 75, 88, 102, 108, 75])
ax.plot(years, [70, 120, 89, 95, 105, 88])
ax.set_title('Some random data');

## Utility programs

In [None]:
import os

# Create 100 empty files my_file_0.txt ... my_file_99.txt in my_folder/.
# The folder is created first when missing: open() does not create
# directories and would fail on a fresh checkout otherwise.
if not os.path.isdir('my_folder'):
    os.makedirs('my_folder')
for num in range(100):
    # Opening in 'w' mode creates the file (or empties an existing one).
    with open('my_folder/my_file_{}.txt'.format(num), 'w') as f:
        f.write('')

In [None]:
!ls my_folder/

In [None]:
import os

# Rename every file in my_folder/ whose name starts with 'my_' so that it
# starts with 'your_' instead.
for fname in os.listdir('my_folder/'):
    if fname.startswith('my_'):
        # Replace only the leading 'my' (count 1); an unlimited replace()
        # would also rewrite any later 'my' inside the file name.
        new_name = fname.replace('my', 'your', 1)
        os.rename(os.path.join('my_folder', fname),
                  os.path.join('my_folder', new_name))

In [None]:
!ls my_folder/

## Advanced text search

In [None]:
import re

In [None]:
# Sample string of capitalised words to search with a regular expression.
text = 'Banana Apple Kiwi Melon Orange'

In [None]:
# Find every run made of one capital letter, one or more lowercase letters
# and then an 'e'. For the sample text this yields 'Apple' and 'Orange'.
re.findall('[A-Z][a-z]+e', text)

## Create websites

In [None]:
from flask import Flask
app = Flask(__name__)

@app.route("/")
def hello():
    """Handle requests for the root URL by returning a one-heading HTML page."""
    return "<h1>Hello CIRCOM!</h1>"

# Start the app only when this code runs as the main program, not when it
# is imported from elsewhere.
if __name__ == "__main__":
    app.run()

### Thank you!

Slides available at: [github.com/miroli/bbc_python](https://github.com/miroli/bbc_python)