# Module 9: Plotting data

In this module, we will cover how to visualise data by plotting graphs and lines using the <code>numpy</code> and <code>matplotlib</code> libraries.

## Importing libraries and creating sample data

In [None]:
# Importing the necessary libraries
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Sample data shared by the plots below: 100 evenly spaced x values
# covering the interval [0, 10], and the sine of each of them.
x = np.linspace(start=0, stop=10, num=100)
y = np.sin(x)

## Line Plot

In [None]:
# Basic line plot of the sample data, drawn through an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(x, y, color='blue', linestyle='-', linewidth=2, label='Sine Curve')
ax.set_title('Basic Line Plot')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
# The legend entry comes from the label passed to plot()
ax.legend()
ax.grid(True)
plt.show()

## Scatter Plot

In [None]:
# Scatter plot of the same sample data: one round red marker per (x, y) pair
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(x, y, marker='o', color='red', label='Scatter Plot')
ax.set_title('Scatter Plot')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.legend()
ax.grid(True)
plt.show()

## Bar Charts

In [None]:
# Bar chart: one bar per category, bar height given by the matching value
categories = ['Category A', 'Category B', 'Category C', 'Category D']
values = [25, 50, 30, 45]

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(categories, values, color='green', label='Bar Chart')
ax.set_title('Bar Chart')
ax.set_xlabel('Categories')
ax.set_ylabel('Values')
ax.legend()
# Grid lines only along the y-axis, so bar heights are easy to read off
ax.grid(axis='y')
plt.show()

## Histograms

In [None]:
# Histogram of 1000 random samples, grouped into 20 bins
data = np.random.randn(1000)

fig, ax = plt.subplots(figsize=(8, 4))
# Black bar edges and slight transparency make neighbouring bins easier to tell apart
ax.hist(data, bins=20, alpha=0.7, edgecolor='black')
ax.set_title('Histogram')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.grid(axis='y')
plt.show()

## Customizing Plots

In [None]:
# Customized line plot: the same sine curve, with every data point also
# marked by a small red circle outlined in black
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    x,
    y,
    label='Sine Curve',
    color='blue',
    linestyle='-',
    linewidth=2,
    marker='o',
    markersize=4,
    markerfacecolor='red',
    markeredgecolor='black',
)
ax.set_title('Customized Line Plot')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.legend()
ax.grid(True)
plt.show()

## Subplots

This section demonstrates how to create a grid of subplots within a single figure and customize each subplot with different types of plots (line plot, scatter plot, bar chart, histogram) and titles. This can be a useful technique for comparing and visualizing multiple datasets or aspects of data within a single figure.

In [None]:
# Subplots: a 2x2 grid of Axes in a single figure, one plot type per panel
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 6))

# Subplot 1 (top left): line plot
axes[0, 0].plot(x, y, color='blue')
axes[0, 0].set_title('Subplot 1')

# Subplot 2 (top right): scatter plot
axes[0, 1].scatter(x, y, color='red')
axes[0, 1].set_title('Subplot 2')

# Subplot 3 (bottom left): bar chart
axes[1, 0].bar(categories, values, color='green')
axes[1, 0].set_title('Subplot 3')

# Subplot 4 (bottom right): histogram
axes[1, 1].hist(data, bins=20, alpha=0.7, edgecolor='black')
axes[1, 1].set_title('Subplot 4')

# Adjust the spacing so titles and tick labels of neighbouring panels do not overlap
fig.tight_layout()
plt.show()

## PDF and CDF curves

Let's visualise the Probability Density Function (PDF) and the Cumulative Distribution Function (CDF). These are fundamental concepts in probability and statistics, used to describe and analyze probability distributions.

The <code>PDF</code> describes the probability distribution of a continuous random variable. It shows how likely the variable is to take values near a given point, and the probability of falling within a certain range is the area under the curve over that range. It is useful for assessing the likelihood of specific outcomes, estimating percentiles, and making probabilistic predictions.

The <code>CDF</code> represents the cumulative probability of a random variable taking on a value less than or equal to a given point. In other words, it shows the probability that the variable is less than or equal to a particular value. It is useful for assessing the likelihood of specific outcomes or ranges, and for understanding the overall behavior of random variables.

The <code>PDF</code> and <code>CDF</code> are essential in risk analysis, reliability engineering, and quality control.

In [None]:
# Probability Density Function (PDF)

# Draw 1000 random samples from a normal distribution (mean 0, standard deviation 1)
random_data = np.random.normal(loc=0, scale=1, size=1000)

# With density=True the bar heights are normalised so the histogram's total
# area is 1, which turns it into an empirical estimate of the PDF
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(random_data, bins=30, density=True, color='blue', edgecolor='black', alpha=0.7)
ax.set_title('PDF (Probability Density Function)')
ax.set_xlabel('Value')
ax.set_ylabel('Probability Density')
# Fixed axis limits: [xmin, xmax, ymin, ymax]
ax.axis([-4, 4, 0, 0.5])

# Customized PDF: histogram of the samples with the theoretical normal curve on top
from scipy.stats import norm

# A fresh set of 1000 normally distributed samples (mean 0, standard deviation 1)
data = np.random.normal(loc=0, scale=1, size=1000)

# Theoretical density of the same distribution, evaluated on 100 points in [-3, 3]
x_range = np.linspace(-3, 3, 100)
pdf_values = norm.pdf(x_range, loc=0, scale=1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(data, bins=30, density=True, color='green', edgecolor='black', alpha=0.7)
ax.plot(x_range, pdf_values, color='blue', linestyle='--', linewidth=2)
ax.set_title('Customized PDF')
ax.set_xlabel('Value')
ax.set_ylabel('Probability Density')
# The trailing semicolon stops the notebook from echoing the call's return value
ax.axis([-4, 4, 0, 0.5]);

In [None]:
# Cumulative Distribution Function (CDF)

# cumulative=True makes each bar include the counts of all bins to its left;
# combined with density=True the bars climb towards 1, giving an empirical CDF
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(random_data, bins=30, density=True, cumulative=True, color='red', edgecolor='black', alpha=0.7)
ax.set_title('CDF (Cumulative Distribution Function)')
ax.set_xlabel('Value')
ax.set_ylabel('Cumulative Probability')
# Fixed axis limits: [xmin, xmax, ymin, ymax]; the y-axis runs slightly past 1
ax.axis([-4, 4, 0, 1.1])

# Customized CDF: cumulative histogram of the samples with the theoretical
# normal CDF drawn on top as a dashed line

# Theoretical CDF (mean 0, standard deviation 1) on the same grid as the PDF curve
cdf_values = norm.cdf(x_range, loc=0, scale=1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(data, bins=30, density=True, cumulative=True, color='purple', edgecolor='black', alpha=0.7)
ax.plot(x_range, cdf_values, color='red', linestyle='--', linewidth=2)
ax.set_title('Customized CDF')
ax.set_xlabel('Value')
ax.set_ylabel('Cumulative Probability')
# The trailing semicolon stops the notebook from echoing the call's return value
ax.axis([-4, 4, 0, 1.1]);

## Example of plotting temperature changes over a period of time

Let's work through an example of plotting the change in temperature over a period of time. In this example, we obtain the <code>temperature_data</code> from this website: https://www.ncei.noaa.gov/access/monitoring/climate-at-a-glance/global/time-series

The website allows you to export the temperature anomalies as a <code>CSV</code> file. According to the website, coordinate anomalies are with respect to the 1991-2020 average, and all other regional anomalies are with respect to the 1910-2000 average.

Then we can use <code>pandas</code> to read the <code>CSV</code> file and <code>matplotlib</code> to plot the data after reading it!

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the exported temperature anomalies from the CSV file
data = pd.read_csv('temperature_data.csv')

# Pull out the two columns to plot against each other
years = data['Year']
anomaly = data['Anomaly']

# Line plot with a marker on every data point
# NOTE(review): the title hard-codes an 80-year span and a 1951-1980 baseline;
# confirm both against the downloaded CSV and the baseline stated by the data source.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(years, anomaly, color='b', linestyle='-', marker='o', label='Temperature Anomaly')
ax.set_title('Global Temperature Anomaly Over the Past 80 Years, relative to 1951-1980 average')
ax.set_xlabel('Year')
ax.set_ylabel('Temperature Anomaly (°C)')
ax.grid(True)
ax.legend()
plt.show()