# Visualization of data





## Scatter plots


In this example, we will work with data for the vapor pressure of various linear alkanes, taken from the following references:

- [DL Morgan and R Kobayashi, "Direct vapor pressure measurements of ten n-alkanes in the C$_{10}$-C$_{28}$ range," *Fluid Phase Equil.* **97**, 211-242 (1994)](http://dx.doi.org/10.1016/0378-3812(94)85017-8)
- [RD Chirico et al. "The vapor pressure of n-alkanes revisited. New high-precision vapor pressure data on n-decane, n-eicosane, and n-octacosane," *J. Chem. Eng. Data* **34**, 149-156 (1989)](http://dx.doi.org/10.1021/je00056a002)




### Importing data

The vapor pressure data from these papers have already been extracted and placed into a set of CSV files on the GitHub site for CP540.
First, we import the data from one of these files into a pandas `DataFrame`.

In [None]:
import io

import pandas as pd

# Address of the C10 data set from Morgan and Kobayashi (1994).  To work with
# the Chirico et al. (1989) C10 data instead, use the commented-out URL below.
URL = 'https://mjksill.github.io/CP540-online/data/Morgan_Kobayashi_1994_C10.csv'
#URL = f'https://mjksill.github.io/CP540-online/data/Chirico_etal_1989_C10.csv'

# pandas can read a CSV file directly from a web address.
df = pd.read_csv(URL)

# Last expression of the cell: the notebook displays the first few rows.
df.head()



### Plotting data

We can then quickly plot these data using the `plot` function from the `Matplotlib` library.

In [None]:
# pyplot is Matplotlib's supported plotting interface; the older pylab module
# is discouraged, and the histogram example later in this notebook already
# uses pyplot.
import matplotlib.pyplot as plt

# With no style arguments, plot() joins consecutive rows with straight lines.
plt.plot(df['T/K'], df['p/kPa'])
plt.show()


In [None]:
# pyplot is Matplotlib's supported plotting interface (pylab is discouraged).
import matplotlib.pyplot as plt

# ls='none' removes the connecting line and marker='o' draws a circle at each
# measurement, turning the line plot into a scatter plot.
plt.plot(df['T/K'], df['p/kPa'], ls='none', marker='o')

plt.xlabel('temperature / K')
plt.ylabel('pressure / kPa')
#plt.xlim(left=0)
# Start the pressure axis at zero; the upper limit is still chosen automatically.
plt.ylim(bottom=0)

plt.show()

### Fitting data

We can use Python to fit data.

For this we use the Antoine equation, which is
\begin{align*}
\log_{10} \frac{p}{p_0}
=
A - \frac{B}{T+C}
\end{align*}
where $p_0$ is a reference pressure.

In [None]:
import numpy as np
from scipy.optimize import curve_fit


def get_Antoine(T, A, B, C):
    """Evaluate the Antoine equation, p = 10**(A - B/(T + C)).

    Parameters
    ----------
    T : scalar, numpy.ndarray or pandas.Series
        Temperature(s).  In this notebook the values come from the 'T/K'
        column of the data.
    A, B, C : float
        Antoine coefficients.

    Returns
    -------
    Same kind of object as ``T``
        The pressure predicted by the Antoine equation.  In this notebook it
        is compared against the 'p/kPa' column of the data.
    """
    log10_p = A - B/(T+C)
    return 10**log10_p


# Data used for the fit.  Uncomment the second line to fit only the points at
# or above 350 K.
df_fit = df
#df_fit = df[df['T/K']>=350]
x_data = df_fit['T/K']
y_data = df_fit['p/kPa']

# Initial guess for (A, B, C) handed to the least-squares optimiser.
p0 = [3.0, 200, 25]
popt, pcov = curve_fit(get_Antoine, x_data, y_data, p0=p0)

A, B, C = popt

# Report the complete set of fitted coefficients, not just A.
print(popt)
print(f'A = {A}')
print(f'B = {B}')
print(f'C = {C}')

# Evaluate the fitted curve from 300 K up to the highest measured temperature.
# max() does not rely on the rows being sorted by temperature, and
# get_Antoine accepts a whole array, so no Python-level loop is needed.
T_fit = np.linspace(300, df['T/K'].max(), 100)
p_fit = get_Antoine(T_fit, *popt)
plt.plot(T_fit, p_fit)

# Overlay the measurements as unconnected markers.
plt.plot(df['T/K'], df['p/kPa'],
         ls='none', marker='o')

plt.xlabel('temperature / K')
plt.ylabel('pressure / kPa')
#plt.legend()
plt.show()







### Using loops

We can also loop over a set of files and plot the data within each of them.

In [None]:
# Each data file paired with the legend entry used for it.
datasets = {
    'Chirico_etal_1989_C10.csv': 'C10 Chirico et al. (1989)',
    'Chirico_etal_1989_C20.csv': 'C20 Chirico et al. (1989)',
    'Chirico_etal_1989_C28.csv': 'C28 Chirico et al. (1989)',
    'Morgan_Kobayashi_1994_C10.csv': 'C10 Morgan and Kobayashi (1994)',
    'Morgan_Kobayashi_1994_C12.csv': 'C12 Morgan and Kobayashi (1994)',
    'Morgan_Kobayashi_1994_C14.csv': 'C14 Morgan and Kobayashi (1994)',
    'Morgan_Kobayashi_1994_C16.csv': 'C16 Morgan and Kobayashi (1994)',
    'Morgan_Kobayashi_1994_C18.csv': 'C18 Morgan and Kobayashi (1994)',
}
file_list = list(datasets)
label_list = list(datasets.values())

# Download every file in turn and add its points to the same set of axes.
for filename, label in datasets.items():
    URL = f'https://mjksill.github.io/CP540-online/data/{filename}'
    df = pd.read_csv(URL)
    plt.plot(
        df['T/K'],
        df['p/kPa'],
        label=label,
        ls='none',
        marker='o',
    )
    print(filename)

plt.xlabel('temperature / K')
plt.ylabel('pressure / kPa')
plt.legend()

# Write the figure to disk before show() is called.
plt.savefig('vaporpressure.pdf')

plt.show()



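### Reorganizing data

We can also combine all of the files into a single `DataFrame`, tagging each row with its reference and alkane.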

In [None]:
import re

file_list = ['Chirico_etal_1989_C10.csv',
             'Chirico_etal_1989_C20.csv',
             'Chirico_etal_1989_C28.csv',
             'Morgan_Kobayashi_1994_C10.csv',
             'Morgan_Kobayashi_1994_C12.csv',
             'Morgan_Kobayashi_1994_C14.csv',
             'Morgan_Kobayashi_1994_C16.csv',
             'Morgan_Kobayashi_1994_C18.csv'
             ]

# Matches the trailing "_C<n>.csv"; removing it leaves the reference tag.
pattern = r'_C.*?\.csv'

# Collect the per-file frames in a list and concatenate once at the end.
# Calling pd.concat inside the loop re-copies the accumulated data on every
# iteration and leaves duplicate index labels.
frames = []

for filename in file_list:

    # 'Chirico_etal_1989_C10.csv' -> alkane 'C10', ref 'Chirico_etal_1989'
    alkane = filename.split('_')[-1].replace('.csv', '')
    ref = re.sub(pattern, '', filename)
    print(alkane, ref)

    URL = f'https://mjksill.github.io/CP540-online/data/{filename}'
    tmp = pd.read_csv(URL)

    # Keep the two data columns and tag every row with where it came from.
    frames.append(tmp[['T/K', 'p/kPa']].assign(Ref=ref, alkane=alkane))

df = pd.concat(frames, ignore_index=True)

# A bare `df.head()` is only displayed when it is the last expression of a
# notebook cell, so print it explicitly here.
print(df.head())


alkane_list = ['C10', 'C12', 'C14', 'C16', 'C18', 'C20', 'C28']

for alkane in alkane_list:
    df_tmp = df[df['alkane'] == alkane]

    # Circles: Morgan and Kobayashi data (this series carries the legend label).
    tmp = df_tmp[df_tmp['Ref'] == 'Morgan_Kobayashi_1994']
    plt.plot(tmp['T/K'], tmp['p/kPa'], label=alkane, ls='none', marker='o')

    # Crosses: Chirico et al. data, left unlabelled so that each alkane
    # appears only once in the legend.
    tmp = df_tmp[df_tmp['Ref'] == 'Chirico_etal_1989']
    #plt.plot(tmp['T/K'], tmp['p/kPa'], label=alkane, ls='none', marker='x')
    plt.plot(tmp['T/K'], tmp['p/kPa'], ls='none', marker='x')

plt.legend()
plt.show()


We can also perform curve fitting for each of the data sets.

In [None]:

import re

import numpy as np
from scipy.optimize import curve_fit


def get_Antoine(T, A, B, C):
    """Evaluate the Antoine equation, p = 10**(A - B/(T + C)).

    ``T`` may be a scalar, a NumPy array or a pandas Series; the result has
    the same form.  ``A``, ``B`` and ``C`` are the Antoine coefficients.
    """
    log10_p = A - B/(T+C)
    return 10**log10_p


file_list = ['Chirico_etal_1989_C10.csv',
             'Chirico_etal_1989_C20.csv',
             'Chirico_etal_1989_C28.csv',
             'Morgan_Kobayashi_1994_C10.csv',
             'Morgan_Kobayashi_1994_C12.csv',
             'Morgan_Kobayashi_1994_C14.csv',
             'Morgan_Kobayashi_1994_C16.csv',
             'Morgan_Kobayashi_1994_C18.csv'
             ]

# Matches the trailing "_C<n>.csv"; removing it leaves the reference tag.
pattern = r'_C.*?\.csv'

# Build the combined table with a single concat rather than one per file.
frames = []

for filename in file_list:

    alkane = filename.split('_')[-1].replace('.csv', '')
    ref = re.sub(pattern, '', filename)
    print(alkane, ref)

    URL = f'https://mjksill.github.io/CP540-online/data/{filename}'
    tmp = pd.read_csv(URL)
    frames.append(tmp[['T/K', 'p/kPa']].assign(Ref=ref, alkane=alkane))

df = pd.concat(frames, ignore_index=True)
print(df.head())


alkane_list = ['C10', 'C12', 'C14', 'C16', 'C18', 'C20', 'C28']

# Only points at or above this temperature take part in the fit.
T_fit_min = 350

for alkane in alkane_list:
    df_tmp = df[df['alkane'] == alkane]

    tmp = df_tmp[df_tmp['Ref'] == 'Morgan_Kobayashi_1994']
    plt.plot(tmp['T/K'], tmp['p/kPa'], label=alkane, ls='none', marker='o')
    tmp = df_tmp[df_tmp['Ref'] == 'Chirico_etal_1989']
    #plt.plot(tmp['T/K'], tmp['p/kPa'], label=alkane, ls='none', marker='x')
    plt.plot(tmp['T/K'], tmp['p/kPa'], ls='none', marker='x')

    df_fit = df_tmp[df_tmp['T/K'] >= T_fit_min]
    x_data = df_fit['T/K']
    y_data = df_fit['p/kPa']
    p0 = [3.0, 200, -20]

    # Three parameters need at least three points; skip rather than crash.
    if len(df_fit) < len(p0):
        print(alkane, 'skipped: too few points for a fit')
        continue

    popt, pcov = curve_fit(get_Antoine, x_data, y_data, p0=p0)

    # Draw the curve over the fitted range of THIS alkane.  (Using the first
    # row of the full table, as before, took the lower bound from whichever
    # file happened to be loaded first.)
    Tmin = max(T_fit_min, x_data.min())
    T_fit = np.linspace(Tmin, df_tmp['T/K'].max(), 100)
    p_fit = get_Antoine(T_fit, *popt)

    # Label the curve with the current alkane.  The previous `label` variable
    # was a leftover from an earlier cell's loop and was wrong for every curve.
    plt.plot(T_fit, p_fit, label=alkane+' fit')

    print(alkane, popt)


plt.xlabel('temperature / K')
plt.ylabel('pressure / kPa')
plt.legend()

#plt.savefig('vaporpressure.pdf')

plt.show()

### Color selection

https://python-graph-gallery.com/continuous-color-palette/



In [None]:
import re

import numpy as np
from scipy.optimize import curve_fit


def get_Antoine(T, A, B, C):
    """Evaluate the Antoine equation, p = 10**(A - B/(T + C)).

    ``T`` may be a scalar, a NumPy array or a pandas Series; the result has
    the same form.  ``A``, ``B`` and ``C`` are the Antoine coefficients.
    """
    log10_p = A - B/(T+C)
    return 10**log10_p


file_list = ['Chirico_etal_1989_C10.csv',
             'Chirico_etal_1989_C20.csv',
             'Chirico_etal_1989_C28.csv',
             'Morgan_Kobayashi_1994_C10.csv',
             'Morgan_Kobayashi_1994_C12.csv',
             'Morgan_Kobayashi_1994_C14.csv',
             'Morgan_Kobayashi_1994_C16.csv',
             'Morgan_Kobayashi_1994_C18.csv'
             ]

# Matches the trailing "_C<n>.csv"; removing it leaves the reference tag.
pattern = r'_C.*?\.csv'

# Build the combined table with a single concat rather than one per file.
frames = []

for filename in file_list:

    alkane = filename.split('_')[-1].replace('.csv', '')
    ref = re.sub(pattern, '', filename)
    print(alkane, ref)

    URL = f'https://mjksill.github.io/CP540-online/data/{filename}'
    tmp = pd.read_csv(URL)
    frames.append(tmp[['T/K', 'p/kPa']].assign(Ref=ref, alkane=alkane))

df = pd.concat(frames, ignore_index=True)
print(df.head())


alkane_list = ['C10', 'C12', 'C14', 'C16', 'C18', 'C20', 'C28']

# Sample one colour per alkane, evenly spaced across the 'rainbow' colormap,
# so that the markers and the fitted curve of an alkane share a colour.
cmap = plt.get_cmap('rainbow')
color_list = [cmap(i) for i in np.linspace(0, 1, len(alkane_list))]

# Only points at or above this temperature take part in the fit.
T_fit_min = 350

for alkane, color in zip(alkane_list, color_list):
    df_tmp = df[df['alkane'] == alkane]

    tmp = df_tmp[df_tmp['Ref'] == 'Morgan_Kobayashi_1994']
    plt.plot(tmp['T/K'], tmp['p/kPa'], label=alkane, ls='none', marker='o', color=color)
    tmp = df_tmp[df_tmp['Ref'] == 'Chirico_etal_1989']
    #plt.plot(tmp['T/K'], tmp['p/kPa'], label=alkane, ls='none', marker='x')
    plt.plot(tmp['T/K'], tmp['p/kPa'], ls='none', marker='x', color=color)

    df_fit = df_tmp[df_tmp['T/K'] >= T_fit_min]
    x_data = df_fit['T/K']
    y_data = df_fit['p/kPa']
    p0 = [3.0, 200, -20]

    # Three parameters need at least three points; skip rather than crash.
    if len(df_fit) < len(p0):
        print(alkane, 'skipped: too few points for a fit')
        continue

    popt, pcov = curve_fit(get_Antoine, x_data, y_data, p0=p0)

    # Lower bound taken from THIS alkane's fitted data, not from the first
    # row of the combined table.
    Tmin = max(T_fit_min, x_data.min())
    T_fit = np.linspace(Tmin, df_tmp['T/K'].max(), 100)
    p_fit = get_Antoine(T_fit, *popt)

    # Label with the current alkane; `label` was a stale variable left over
    # from an earlier cell.
    plt.plot(T_fit, p_fit, label=alkane+' fit', color=color)

    print(alkane, popt)


# Use a logarithmic pressure axis.
plt.yscale('log')
plt.xlabel('temperature / K')
plt.ylabel('pressure / kPa')
#plt.legend()

#plt.savefig('vaporpressure.pdf')

plt.show()

## Other types of plots

### Histograms

https://python-graph-gallery.com/histogram/

https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One random number drawn uniformly between 0 and 1.
x = np.random.uniform(low=0, high=1)
print(x)

# A sample of 1000 such numbers.
data_list = np.random.uniform(low=0, high=1, size=1000)
print(data_list)

# Draw the histogram twice: first with raw counts on the vertical axis, then
# with density=True.
for normalised in (False, True):
    plt.hist(data_list, bins=10, density=normalised)
    plt.show()

\begin{align*}
p(x) = (2\pi \sigma^2)^{-1/2}
e^{-\frac{(x-\mu)^2}{2\sigma^2}}
\end{align*}


In [None]:
import numpy as np

# Mean and standard deviation of the normal distribution.
mu = 1
sig = 1


def _normal_pdf(x):
    """Normal probability density at x for the module-level mu and sig."""
    return 1/np.sqrt(2*np.pi*sig**2)*np.exp(-(x-mu)**2/(2*sig**2))


# Histogram of 100 normally distributed samples, plotted with density=True so
# that it can be compared with the analytic curve.
data_list = np.random.normal(loc=mu, scale=sig, size=100)
#plt.hist(data_list, bins=100)
plt.hist(data_list, bins=100, density=True)

# Analytic density evaluated over five standard deviations either side of mu.
x_list = np.linspace(mu-5*sig, mu+5*sig, 100)
y_list = [_normal_pdf(x) for x in x_list]
plt.plot(x_list, y_list)

plt.show()

### Bar charts

## Example data

### Rainfall

In [None]:
import io

# pyplot is Matplotlib's supported plotting interface (pylab is discouraged).
import matplotlib.pyplot as plt
import pandas as pd

year = 2000

URL = f'https://mjksill.github.io/CP540-online/data/rain/midas-open_uk-hourly-rain-obs_dv-202207_renfrewshire_24125_glasgow-bishopton_qcv-1_{year}.csv'

# Skip the first 61 lines and the final line of the file so that only the
# column header and the data rows are parsed.  skipfooter needs the 'python'
# parsing engine.
df_tmp = pd.read_csv(URL, engine='python',
                     skiprows=[*range(61)],
                     skipinitialspace=True,
                     skipfooter=1)

# Keep the rows with ob_hour_count == 1 (presumably the one-hour totals;
# confirm against the dataset documentation).  The .copy() makes the result
# an independent frame, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
df_tmp = df_tmp[df_tmp['ob_hour_count'] == 1].copy()

# dropna() returns a new frame, so the original bare call had no effect.
# Only the columns actually used are checked, so rows that merely lack
# unrelated fields are kept.
df_tmp = df_tmp.dropna(subset=['ob_end_time', 'prcp_amt'])

# Parse the timestamp, then derive coarser date columns from it.
df_tmp['ob_end_time'] = pd.to_datetime(df_tmp['ob_end_time'], format='%Y-%m-%d %H:%M:%S')
df_tmp['day'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m-%d')
df_tmp['day'] = pd.to_datetime(df_tmp['day'], format='%Y-%m-%d')
df_tmp['day_of_year'] = df_tmp['ob_end_time'].dt.strftime('%m-%d')
df_tmp['month_str'] = df_tmp['ob_end_time'].dt.strftime('%m')
df_tmp['month'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m')
df_tmp['month'] = pd.to_datetime(df_tmp['month'], format='%Y-%m')
df_tmp['year_str'] = df_tmp['ob_end_time'].dt.strftime('%Y')
df_tmp['year'] = pd.to_datetime(df_tmp['year_str'], format='%Y')

print(df_tmp)




In [None]:
import io

# pyplot is Matplotlib's supported plotting interface (pylab is discouraged).
import matplotlib.pyplot as plt
import pandas as pd

year_list = [2000, 2001, 2002, 2003, 2004, 2005]

# One cleaned frame per year, concatenated once after the loop.
frames = []
for year in year_list:
    URL = f'https://mjksill.github.io/CP540-online/data/rain/midas-open_uk-hourly-rain-obs_dv-202207_renfrewshire_24125_glasgow-bishopton_qcv-1_{year}.csv'

    # Skip the first 61 lines and the final line of each file; skipfooter
    # needs the 'python' parsing engine.
    df_tmp = pd.read_csv(URL,
                         skiprows=[*range(61)],
                         skipfooter=1,
                         skipinitialspace=True, engine='python')

    # Keep the rows with ob_hour_count == 1 (presumably the one-hour totals;
    # confirm against the dataset documentation).  .copy() avoids
    # SettingWithCopyWarning on the assignments below.
    df_tmp = df_tmp[df_tmp['ob_hour_count'] == 1].copy()

    # dropna() returns a new frame; the original bare call was a no-op.
    # Only the columns actually used are checked.
    df_tmp = df_tmp.dropna(subset=['ob_end_time', 'prcp_amt'])

    # Parse the timestamp, then derive coarser date columns from it.
    df_tmp['ob_end_time'] = pd.to_datetime(df_tmp['ob_end_time'], format='%Y-%m-%d %H:%M:%S')
    df_tmp['day'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m-%d')
    df_tmp['day'] = pd.to_datetime(df_tmp['day'], format='%Y-%m-%d')
    df_tmp['day_of_year'] = df_tmp['ob_end_time'].dt.strftime('%m-%d')
    df_tmp['month_str'] = df_tmp['ob_end_time'].dt.strftime('%m')
    df_tmp['month'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m')
    df_tmp['month'] = pd.to_datetime(df_tmp['month'], format='%Y-%m')
    df_tmp['year_str'] = df_tmp['ob_end_time'].dt.strftime('%Y')
    df_tmp['year'] = pd.to_datetime(df_tmp['year_str'], format='%Y')

#    print(f'  {df_tmp.shape}')
    frames.append(df_tmp)

df = pd.concat(frames)

# sort_values() returns a new frame, so its result must be assigned; the
# original bare calls to sort_values() and reset_index() changed nothing.
# ignore_index=True renumbers the rows 0..n-1, which covers the reset.
df = df.sort_values(by='ob_end_time', ignore_index=True)

# Running total of precipitation, now guaranteed to be in time order.
df['accumulation'] = df['prcp_amt'].cumsum()

#plt.plot(df['ob_end_time'], df['prcp_amt'])
#plt.ylabel(r'precipitation / mm')
#plt.xlabel(r'date')
#plt.ylim([0.0, 20.0])
#plt.show()


### Solar irradiation

In [None]:
import io

# pyplot is Matplotlib's supported plotting interface (pylab is discouraged).
import matplotlib.pyplot as plt
import pandas as pd

year = 2000

URL = f'https://mjksill.github.io/CP540-online/data/sun/midas-open_uk-radiation-obs_dv-202207_renfrewshire_24125_glasgow-bishopton_qcv-1_{year}.csv'

# Skip the first 75 lines and the final line of the file; skipfooter needs
# the 'python' parsing engine.
df_tmp = pd.read_csv(URL, engine='python',
                     skiprows=[*range(75)],
                     skipinitialspace=True,
                     skipfooter=1)

# dropna() returns a new frame; the original bare call was a no-op.  Only the
# columns actually used are checked, so rows that merely lack unrelated
# fields are kept.
df_tmp = df_tmp.dropna(subset=['ob_end_time', 'glbl_irad_amt'])

# Parse the timestamp, then derive coarser date columns from it.
df_tmp['ob_end_time'] = pd.to_datetime(df_tmp['ob_end_time'], format='%Y-%m-%d %H:%M:%S')
df_tmp['day'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m-%d')
df_tmp['day'] = pd.to_datetime(df_tmp['day'], format='%Y-%m-%d')
df_tmp['day_of_year'] = df_tmp['ob_end_time'].dt.strftime('%m-%d')
df_tmp['month_str'] = df_tmp['ob_end_time'].dt.strftime('%m')
df_tmp['month'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m')
df_tmp['month'] = pd.to_datetime(df_tmp['month'], format='%Y-%m')
df_tmp['year_str'] = df_tmp['ob_end_time'].dt.strftime('%Y')
df_tmp['year'] = pd.to_datetime(df_tmp['year_str'], format='%Y')


# sort_values() returns a new frame, so its result must be assigned; the
# original bare call left the row order unchanged.
df = df_tmp.sort_values(by='ob_end_time')

# Convert to kW h per square metre.
# NOTE(review): the factor 3600 assumes glbl_irad_amt is in kJ per square
# metre; confirm against the dataset documentation.
df['kWh_per_m2'] = df['glbl_irad_amt'] / 3600.0

# Running total, in time order.
df['accumulation'] = df['kWh_per_m2'].cumsum()


bins = 100
#plt.hist(df[df['month_str']=='01']['glbl_irad_amt'], bins)
plt.hist(df['kWh_per_m2'], bins)
plt.xlabel(r'hourly solar irradiation / kW h m$^{-2}$')
plt.ylabel(r'frequency')
# Use a logarithmic count axis.
plt.yscale('log')

#print(df.shape)
#df.head()


In [None]:
import io

# pyplot is Matplotlib's supported plotting interface (pylab is discouraged).
import matplotlib.pyplot as plt
import pandas as pd

# Years to load; add more entries to combine several years.  (The separate
# `year = 2000` assignment was dropped because the loop overwrote it.)
year_list = [2000]

# One cleaned frame per year, concatenated once after the loop.
frames = []
for year in year_list:
    URL = f'https://mjksill.github.io/CP540-online/data/sun/midas-open_uk-radiation-obs_dv-202207_renfrewshire_24125_glasgow-bishopton_qcv-1_{year}.csv'

    # Skip the first 75 lines and the final line of each file; skipfooter
    # needs the 'python' parsing engine.
    df_tmp = pd.read_csv(URL, engine='python',
                         skiprows=[*range(75)],
                         skipinitialspace=True,
                         skipfooter=1)

    # dropna() returns a new frame; the original bare call was a no-op.
    # Only the columns actually used are checked.
    df_tmp = df_tmp.dropna(subset=['ob_end_time', 'glbl_irad_amt'])

    # Parse the timestamp, then derive coarser date columns from it.
    df_tmp['ob_end_time'] = pd.to_datetime(df_tmp['ob_end_time'], format='%Y-%m-%d %H:%M:%S')
    df_tmp['day'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m-%d')
    df_tmp['day'] = pd.to_datetime(df_tmp['day'], format='%Y-%m-%d')
    df_tmp['day_of_year'] = df_tmp['ob_end_time'].dt.strftime('%m-%d')
    df_tmp['month_str'] = df_tmp['ob_end_time'].dt.strftime('%m')
    df_tmp['month'] = df_tmp['ob_end_time'].dt.strftime('%Y-%m')
    df_tmp['month'] = pd.to_datetime(df_tmp['month'], format='%Y-%m')
    df_tmp['year_str'] = df_tmp['ob_end_time'].dt.strftime('%Y')
    df_tmp['year'] = pd.to_datetime(df_tmp['year_str'], format='%Y')

    frames.append(df_tmp)

df = pd.concat(frames)

# sort_values() returns a new frame, so its result must be assigned.
# ignore_index=True also removes the duplicate index labels left by concat.
df = df.sort_values(by='ob_end_time', ignore_index=True)

# Convert to kW h per square metre.
# NOTE(review): the factor 3600 assumes glbl_irad_amt is in kJ per square
# metre; confirm against the dataset documentation.
df['kWh_per_m2'] = df['glbl_irad_amt'] / 3600.0

# Running total, in time order.
df['accumulation'] = df['kWh_per_m2'].cumsum()


bins = 100
#plt.hist(df[df['month_str']=='01']['glbl_irad_amt'], bins)
plt.hist(df['kWh_per_m2'], bins)
plt.xlabel(r'hourly solar irradiation / kW h m$^{-2}$')
plt.ylabel(r'frequency')
# Use a logarithmic count axis.
plt.yscale('log')