# Python for Data Science

## Lecture 5: Matplotlib

## Data visualization  

It is important to be able to present a set of data informatively for others (and it can help you as well).



In [None]:
import numpy as np
import random as rn
import matplotlib.pyplot as plt
import pandas as pd
# we will read data from csv files

# Matplotlib
- One of the best-known plotting libraries in Python
- We will go through examples in order to understand how it works

# Squirrels of New York
The data is in a csv (comma separated values) file which is uploaded to the homepage. Every line represents a squirrel which has been spotted. Reviewing the first line gives you an idea of what data is collected for each detection.

long,lat,unique_squirrel_id,hectare,shift,date,hectare_squirrel_number,age,primary_fur_color,highlight_fur_color,
combination_of_primary_and_highlight_color,color_notes,location,above_ground_sighter_measurement,specific_location,running,
chasing,climbing,eating,foraging,other_activities,kuks,quaas,moans,tail_flags,tail_twitches,approaches,indifferent,runs_from,
other_interactions,lat_long,zip_codes,community_districts,borough_boundaries,city_council_districts,police_precincts

In [None]:
import os

# Read the squirrel census from the local "data" folder and preview the first rows.
df = pd.read_csv(
    os.path.join("data", "nyc_squirrels.csv"),
)
df.head()

In [None]:
# Tally how many rows fall into each value of the "age" column.
df["age"].value_counts()

In [None]:
def unique_lists(column):
    """Return the distinct values of a pandas Series and their frequencies.

    Parameters
    ----------
    column : pandas.Series
        The column to tally.

    Returns
    -------
    tuple
        ``(labels, totals)`` where ``labels`` is the index and ``totals`` the
        values of ``column.value_counts()``, in the order that call yields.
    """
    frequency = column.value_counts()
    labels, totals = frequency.index, frequency.values
    return labels, totals

In [None]:
# Split the age tally into its labels (age_index) and the matching counts (age_counts).
age_index, age_counts = unique_lists(df["age"])

In [None]:
# One Axes on a 5x5-inch figure; plt.subplots() defaults to a 1x1 grid.
fig, axs = plt.subplots(figsize=(5, 5))
axs.bar(age_index, age_counts)
plt.show()

<img src="https://matplotlib.org/_images/anatomy.png" alt="Drawing" style="width: 400px;"/>

### Multiple plots next to each other
There are two approaches for Matplotlib. The recommended one is the object oriented approach, so we will use this too.

Inside one figure there can be several plots, which Matplotlib calls `axes` (singular: an `Axes`; do not confuse this with `axis`, which is a single x- or y-axis). A plotting command draws on the `Axes` object it is called on.

(The other approach is to always use the `plt.command` format for plotting; a separate command is then needed to choose which axes the plot is drawn on.)

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(7, 10))

# One bar chart per column; axs.flat walks the 2x2 grid row by row, so the
# panels are filled in the order [0, 0], [0, 1], [1, 0], [1, 1].
panel_columns = ["age", "primary_fur_color", "location", "shift"]
for panel, column_name in zip(axs.flat, panel_columns):
    unique, counts = unique_lists(df[column_name])
    panel.bar(unique, counts)

plt.show()

In [None]:
# Distinct values of the "date" column and the number of rows recorded for each.
unique, counts = unique_lists(df["date"])

In [None]:
# Convert every date label to its string form and show the result.
unique = list(map(str, unique))
print(unique)

In [None]:
# Display the counts that belong to the date labels.
counts

In [None]:
# Pair every label with its count, then order the pairs by label (plain string order).
combined = [(label, total) for label, total in zip(unique, counts)]
combined_sorted = list(combined)
combined_sorted.sort(key=lambda pair: pair[0])

# Unzip the ordered pairs back into two parallel tuples.
unique, counts = zip(*combined_sorted)

In [None]:
# Plain bar chart of sightings per date; the tick labels are still unreadable here.
fig, axs = plt.subplots(figsize=(5, 5))
axs.bar(unique, counts)
plt.show()

In [None]:
fig, axs = plt.subplots(figsize=(5, 5))
axs.bar(unique, counts)
# Tilt the x tick labels by 45 degrees, anchored at their right end.
labels = axs.get_xticklabels()
plt.setp(labels, rotation=45, ha='right')
plt.show()



In [None]:
fig, axs = plt.subplots(figsize=(5, 5))
axs.bar(unique, counts)
labels = axs.get_xticklabels()
plt.setp(labels, rotation=45, ha='right')
# Same settings as a single axs.set(...) call, spelled out one property at a time.
axs.set_ylim([50, 500])
axs.set_ylabel('Number of detections')
axs.set_xlabel('Date')
axs.set_title('Squirrels in the Central Park')
plt.show()


In [None]:
# Build the display labels from the raw date strings stored in `combined_sorted`
# rather than re-slicing `unique` in place. `combined_sorted` is never modified,
# so this cell is idempotent: running it a second time no longer slices the
# already-formatted labels into garbage.
# Characters 4-7 come first, then 0-1 and 2-3 (presumably MMDDYYYY input turned
# into "YYYY. MM. DD." -- confirm against the data).
unique = [raw[4:8] + ". " + raw[0:2] + ". " + raw[2:4] + "." for raw, _ in combined_sorted]

fig, axs = plt.subplots(1, 1, figsize=(5, 5))
axs.bar(unique, counts)
labels = axs.get_xticklabels()
plt.setp(labels, rotation=45, horizontalalignment='right')
axs.set(ylim=[50, 500], ylabel='Number of detections', xlabel='Date',
        title='Squirrels in the Central Park')
plt.show()


In [None]:
fig, axs = plt.subplots(figsize=(5, 5))
axs.bar(unique, counts, label="Huge blue columns")
labels = axs.get_xticklabels()
plt.setp(labels, rotation=45, ha='right')
axs.set_ylim([50, 500])
axs.set_ylabel('Number of detections')
axs.set_xlabel('Date')
axs.set_title('Squirrels in the Central Park')

# Write a vertical white note inside the bars at positions 1, 6 and 5,
# hanging down from the top of each bar.
note_style = dict(fontsize=10, verticalalignment="top", rotation=90, color="white")
for bar_index in (1, 6, 5):
    axs.text(bar_index, counts[bar_index], "Too many squirrels", **note_style)
axs.legend()
plt.show()



In [None]:
fig, axs = plt.subplots(figsize=(5, 5))
axs.bar(unique, counts)
labels = axs.get_xticklabels()
plt.setp(labels, rotation=45, ha='right')
axs.set_ylim([50, 500])
axs.set_ylabel('Number of detections')
axs.set_xlabel('Date')
axs.set_title('Squirrels in the Central Park')

# Vertical white notes inside the bars at positions 1, 6 and 5.
note_style = dict(fontsize=10, verticalalignment="top", rotation=90, color="white")
for bar_index in (1, 6, 5):
    axs.text(bar_index, counts[bar_index], "Too many squirrels", **note_style)
# Overlay the same counts as a red line on the same Axes.
axs.plot(counts, color="red")
plt.show()

# Pie charts

In [None]:
# NOTE(review): `counts` is not reassigned in this section before this line, so
# this displays whatever the bar-chart cells left behind (the per-date counts
# when the notebook is run top to bottom).
counts

In [None]:

# Share of each primary fur colour, drawn as a pie chart.
unique, counts = unique_lists(df["primary_fur_color"])

fig, axs = plt.subplots(figsize=(5, 5))
axs.pie(counts, labels=unique)
axs.set_title('Colours of squirrels in the Central Park')
plt.show()



In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
ax.axis('equal')   # equal scaling on both axes so the pie is drawn as a circle

# Each wedge as (label, fraction of the full circle, colour).
wedges = [
    ('Sky', 285/360, "lightskyblue"),
    ('Sunny side of the Pyramid', 60/360, "yellow"),
    ('Shady side of the Pyramid', 15/360, "gold"),
]
langs, nums, colors = (list(part) for part in zip(*wedges))
ax.pie(nums, labels=langs, colors=colors, startangle=-50)
plt.show()

In [None]:
labels = ['Frogs', 'Hogs', 'Dogs', 'Logs']
sizes = [15, 30, 45, 10]
explode = (0.1, 0.1, 0.1, 0.5)  # how far each wedge is pushed out from the centre

fig1, ax1 = plt.subplots(figsize=(3, 3))
# autopct prints each wedge's percentage with two decimals.
pie_options = dict(explode=explode, labels=labels, autopct='%1.2f%%',
                   shadow=True, startangle=90)
ax1.pie(sizes, **pie_options)
plt.show()

## Function plotting

In [None]:
fig, axs = plt.subplots(figsize=(5, 5))

# Connect four points with a thick line, drawing on the Axes created above.
x_points = [10, 20, 15, 30]
y_points = [40, 50, 10, 30]
axs.plot(x_points, y_points, lw=6)

plt.show()

In [None]:
fig, axs = plt.subplots(figsize=(5, 5))

# Sample cos(2*pi*t) on [0, 5) in steps of 0.01 and draw it on the Axes created above.
t = np.arange(0.0, 5.0, 0.01)
s = np.cos(2 * np.pi * t)
axs.plot(t, s, lw=2)

plt.show()

In [None]:
fig, axs = plt.subplots(figsize=(5, 5))

t = np.arange(0.0, 5.0, 0.01)
s = np.cos(2 * np.pi * t)
line, = axs.plot(t, s, lw=2)

# Arrow from the text at (3, 1.5) to the point (2, 1) on the curve.
arrow_style = dict(facecolor='black', shrink=0.05)
axs.annotate('Local maximum', xy=(2, 1), xytext=(3, 1.5), arrowprops=arrow_style)

# Widen the y range so the annotation text fits above the curve.
axs.set_ylim(-2, 2)
plt.show()

In [None]:
def f(t):
    """Return exp(-t) * cos(2*pi*t), evaluated element-wise for array input."""
    envelope = np.exp(-t)
    oscillation = np.cos(2 * np.pi * t)
    return envelope * oscillation

# Coarse (step 0.1) and fine (step 0.02) sample points on [0, 5).
t1 = np.arange(0.0, 5.0, 0.1)
t2 = np.arange(0.0, 5.0, 0.02)

fig, axs = plt.subplots(2, 1, figsize=(6, 6))
top_panel, bottom_panel = axs[0], axs[1]
# Top: f at the coarse points as black circles, with the fine curve in blue over them.
top_panel.plot(t1, f(t1), 'ko')
top_panel.plot(t2, f(t2), 'b')
# Bottom: the undamped cosine at the coarse points as red dots.
bottom_panel.plot(t1, np.cos(2 * np.pi * t1), 'r.')
plt.show()

## Scatter plot
This plots points based on two arrays, one for the x-coordinates and one for the y-coordinates.

In [None]:
# Longitude goes on the x axis and latitude on the y axis; every entry is
# passed through float() before the arrays are built.
xpos = np.array(list(map(float, df["long"].values)))
ypos = np.array(list(map(float, df["lat"].values)))

fig, axs = plt.subplots(figsize=(7, 7))
axs.scatter(xpos, ypos)
plt.show()

In [None]:
# Same scatter plot with a smaller marker size (s=5).
fig, axs = plt.subplots(figsize=(7, 7))
axs.scatter(xpos, ypos, s=5)
plt.show()

In [None]:
# The "primary_fur_color" column; its values are mapped to plot colours below.
col=df["primary_fur_color"]
def colorpicker(c):
    """Map a fur-colour label to a Matplotlib colour name.

    'Gray', 'Cinnamon' and 'Black' map to "gray", "gold" and "black";
    any other value falls back to 'blue'.
    """
    palette = (('Gray', "gray"), ('Cinnamon', "gold"), ('Black', "black"))
    for fur, shade in palette:
        if c == fur:
            return shade
    return 'blue'
    
# One plot colour per row, in the same order as xpos / ypos.
colors = list(map(colorpicker, col))

fig, axs = plt.subplots(figsize=(7, 7))
axs.scatter(xpos, ypos, s=5, c=colors)
plt.show()

In [None]:
# The "location" column; its values are mapped to plot colours below.
col=df["location"]
def colorpicker(c):
    """Return "blue" for the label 'Above Ground' and 'red' for anything else."""
    return "blue" if c == 'Above Ground' else 'red'
    
# One plot colour per row, in the same order as xpos / ypos.
colors = list(map(colorpicker, col))

fig, axs = plt.subplots(figsize=(7, 7))
axs.scatter(xpos, ypos, s=5, c=colors)
plt.show()

## Additional information: maps


In [None]:
import folium      # This is one of the many map tools available

In [None]:
# Pair the values as (ypos, xpos), i.e. (latitude, longitude) -- the same
# order as NYC_COORD below.
coords = list(zip(ypos, xpos))

# Coordinates of New York
NYC_COORD = [40.7828, -73.970]

# Creating a map
map_nyc = folium.Map(location=NYC_COORD, zoom_start=13, width=640, height=480)

# Add one small circle marker per point; a plain loop is used because only the
# side effect (adding to the map) matters, not a list of results.
for point in coords:
    marker = folium.CircleMarker(point, radius=1,
                                 color='#0080bb', fill_color='#0080bb')
    marker.add_to(map_nyc)

# Show map
map_nyc


## Histogram

The `party.csv` file contains data on noise complaints (disturbances of the peace) in New York

In [None]:
# Read the second data set from the local "data" folder and preview the first rows.
party_df = pd.read_csv(
    os.path.join("data", "party.csv"),
)
party_df.head()

In [None]:
# Raw "Created Date" values as an array; show the first ten to see their format.
hour_raw = party_df["Created Date"].values

hour_raw[:10]

In [None]:
# Hour of day of every record, read from the two characters at positions 11-12
# of the timestamp string.
hour = [int(row[11:13]) for row in hour_raw]

fig, axs = plt.subplots(1, 1, figsize=(6, 6))
# Explicit edges 0, 1, ..., 24 give exactly one bar per hour. With bins=24 the
# edges are taken from the data's min and max instead, so bars do not start on
# whole hours and hours can be merged when hour 0 or 23 is missing.
axs.hist(hour, bins=np.arange(25))
axs.set(xlabel='Hour of day', ylabel='Number of records')
plt.show()

In [None]:
# Despite its name, `hour` holds seconds since midnight here:
# hours * 3600 + minutes * 60 + seconds, each read digit by digit from fixed
# positions of the timestamp string.
hour = []
for stamp in hour_raw:
    hh = int(stamp[11]) * 10 + int(stamp[12])
    mm = int(stamp[14]) * 10 + int(stamp[15])
    ss = int(stamp[17]) * 10 + int(stamp[18])
    hour.append(hh * 3600 + mm * 60 + ss)

fig, axs = plt.subplots(figsize=(6, 6))
axs.hist(hour, bins=100)
plt.show()

In [None]:
# NOTE: in this cell xpos holds latitudes and ypos longitudes (the reverse of
# the squirrel cells), and both names are overwritten with the party data.
xpos = party_df["Latitude"].values
ypos = party_df["Longitude"].values

NYC_COORD = [40.7128, -74.0059]

coords = list(zip(xpos, ypos))

map_nyc = folium.Map(location=NYC_COORD, zoom_start=12,
                     width=640, height=480)

# Add one small circle marker per record. A point is skipped when EITHER
# coordinate is NaN; testing the latitude alone would let a row with a missing
# longitude through to folium.
for lat, lon in coords:
    if np.isnan(lat) or np.isnan(lon):
        continue
    folium.CircleMarker((lat, lon), radius=1,
                        color='#0080bb', fill_color='#0080bb').add_to(map_nyc)

map_nyc

## Plotting a picture based on arrays

In [None]:
def printmaze(image):
    """Labyrinth printer: show a 2-D array as an image on a 6x6-inch figure."""
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.imshow(image)
    plt.show()


# Quick check on a 3x3 array: a ring of ones around a single zero.
printmaze(np.array([[1, 1, 1], [1, 0, 1], [1, 1, 1]]))

In [None]:
#Labyrinth
def makemaze(w=16,h=8):
    """Generate a random maze with a depth-first search.

    Cells sit at the odd (row, column) positions of the returned array; the
    positions between two neighbouring cells are walls that get knocked down
    as the search moves from one cell to the next.

    The search keeps its own stack instead of recursing, so the maze size is
    not limited by Python's recursion depth (a recursive walk goes one call
    deeper per cell on the current path and raises RecursionError on larger
    mazes). Calls to the random generator happen in the same order as in a
    recursive walk, so a seeded run yields the same maze.

    Parameters
    ----------
    w, h : int
        Number of cells along the first and second array axis.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(2*w + 1, 2*h + 1)``; 1 marks a wall and 0 an
        open position.
    """
    rows, cols = 2*w + 1, 2*h + 1
    maze = np.ones((rows, cols))
    vis = np.zeros((rows, cols))

    def enter(x, y):
        """Open cell (x, y) and return it with its neighbours in random order."""
        vis[x][y] = 1
        maze[x, y] = 0
        d = [(x - 2, y), (x, y + 2), (x + 2, y), (x, y - 2)]
        rn.shuffle(d)
        return x, y, iter(d)

    # Each stack entry is (cell x, cell y, iterator over neighbours not yet tried).
    stack = [enter(2*rn.randrange(w) + 1, 2*rn.randrange(h) + 1)]
    while stack:
        x, y, pending = stack[-1]
        # `pending` is an iterator, so the loop resumes where it stopped the
        # last time this cell was on top of the stack.
        for xx, yy in pending:
            if (xx not in range(rows)) or (yy not in range(cols)) or vis[xx][yy]:
                continue
            # Knock down the wall between (x, y) and (xx, yy).
            if xx == x:
                maze[x][max(y, yy) - 1] = 0
            if yy == y:
                maze[max(x, xx) - 1][y] = 0
            stack.append(enter(xx, yy))
            break
        else:
            # No unvisited neighbour left: backtrack.
            stack.pop()

    return maze


In [None]:
# Generate a maze with w=30 and h=30 and display it.
printmaze(makemaze(30,30))

### Save as image
`plt.savefig("name.png")`

# Examples

### More examples: https://matplotlib.org/gallery/index.html

In [None]:
def koch_snowflake(order, scale=10):
    """
    Compute the outline of a Koch snowflake.

    Arguments
    ---------
    order : int
        How many times every edge is refined (0 gives the plain triangle).
    scale : float
        Edge length of the starting triangle.

    Returns
    -------
    x, y : numpy.ndarray
        Real and imaginary parts of the outline points, ``3 * 4**order`` each.
    """
    # As a multiple of an edge vector, this points from the edge's start to the
    # tip of the triangular bump raised on its middle third.
    tip_factor = 0.5 - 0.5j * np.sqrt(3) / 3

    def _outline(depth):
        """Return the outline points at the given depth as complex numbers."""
        if depth == 0:
            # Starting triangle: corners at 90, 210 and 330 degrees.
            corner_angles = np.array([0, 120, 240]) + 90
            return scale / np.sqrt(3) * np.exp(np.deg2rad(corner_angles) * 1j)

        starts = _outline(depth - 1)
        edges = np.roll(starts, shift=-1) - starts  # vector to the next point

        # Every edge contributes four points: its start, the one-third point,
        # the bump tip and the two-thirds point, interleaved in that order.
        pieces = (
            starts,
            starts + edges / 3,
            starts + edges * tip_factor,
            starts + edges / 3 * 2,
        )
        refined = np.empty(len(starts) * 4, dtype=np.complex128)
        for offset, piece in enumerate(pieces):
            refined[offset::4] = piece
        return refined

    outline = _outline(order)
    return outline.real, outline.imag


In [None]:
# Outline of a fifth-order snowflake, drawn as a filled polygon with equal
# scaling on both axes.
x, y = koch_snowflake(order=5)

fig, ax = plt.subplots(figsize=(8, 8))
ax.axis('equal')
ax.fill(x, y)
plt.show()

In [None]:
# 10000 normally distributed samples with mean 100 and standard deviation 15.
mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)

# Normalised histogram of the data with 50 bins.
n, bins, patches = plt.hist(x, bins=50, density=True, facecolor='g', alpha=0.75)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of IQ')
plt.text(60, .025, r'$\mu=100,\ \sigma=15$')
# Fix the visible window: x from 40 to 160, y from 0 to 0.03.
plt.axis([40, 160, 0, 0.03])
plt.grid(True)
plt.show()

In [None]:
# Toy data: sin(x^2) sampled at 400 points on [0, 2*pi].
x = np.linspace(0, 2 * np.pi, 400)
y = np.sin(x ** 2)

# Figure 1: a single Axes.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('Simple plot')

# Figure 2: two Axes side by side with a shared y axis, unpacked directly.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True)
ax1.plot(x, y)
ax1.set_title('Sharing Y axis')
ax2.scatter(x, y)

# Figure 3: a 2x2 grid of polar Axes, addressed through the returned array.
fig, axs = plt.subplots(2, 2, subplot_kw={"polar": True})
axs[0, 0].plot(x, y)
axs[1, 1].scatter(x, y)




In [None]:
# Build the columns one at a time; the random draws happen in the order
# 'c', 'd', 'b'.
data = {'a': np.arange(50)}
data['c'] = np.random.randint(0, 50, 50)
data['d'] = np.abs(np.random.randn(50)) * 100
data['b'] = data['a'] + 10 * np.random.randn(50)

# With data=..., the string arguments are looked up as keys of `data`:
# x='a', y='b', colour='c', marker size='d'.
plt.scatter('a', 'b', c='c', s='d', data=data)
plt.xlabel('entry a')
plt.ylabel('entry b')
plt.show()

In [None]:
t = np.arange(0., 5., 0.2)

# One call per curve: red dashes, blue squares and green triangles.
plt.plot(t, t, 'r--')
plt.plot(t, t**2, 'bs')
plt.plot(t, t**3, 'g^')
plt.show()