## Data analysis and preparation

Before we can do anything with the data, we have to inspect it and make sure that it contains no defects.

In [None]:
import sys
# !conda install --yes --prefix {sys.prefix} matplotlib
# !conda install --yes --prefix {sys.prefix} missingno
# !conda install --yes --prefix {sys.prefix} pandas
# !pip install pandas-profiling
# !conda install --yes --prefix {sys.prefix} seaborn
# !conda install --yes --prefix {sys.prefix} warnings  # not needed: warnings ships with the Python standard library

In [None]:
from matplotlib.axes._axes import _log as matplotlib_axes_logger
import matplotlib.pyplot as plt
import missingno
import pandas as pd
# from pandas_profiling import ProfileReport
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the raw data set from the text file; pd.read_csv parses it with its
# default settings (comma-separated, first row used as header).
file_name = 'data.txt'
# file_name = 'test/data_test.txt'
input_path = f'../../data/raw/{file_name}'
df = pd.read_csv(input_path)

In [None]:
# get data analysis by pandas profiling
# prof = ProfileReport(df)
# profile_report_name = '01_profiling_output_'+file_name+'.html'
# prof.to_file(output_file=profile_report_name)

In [None]:
# inspect the column labels of the loaded DataFrame
df.columns

In [None]:
# preview the first five rows of the loaded data
df.head(5)

In [None]:
# Split the loaded frame into input variables and the target variable.
# Inputs are selected by exact column label.
input_columns = [
    'pressure in bar',
    'inlet temperature in K',
    'temperature in K',
    'residence time in s',
    'portion of product in feed',
]
df_input = df.filter(items=input_columns)
# Target: every column whose label matches the pattern 'conversion CO2'.
df_target = df.filter(regex='conversion CO2')

### Explore Target Variables

In [None]:
# use describe to get summary statistics of the target variable(s)
df_target.describe()

In [None]:
# Density histogram of 'conversion CO2' with a KDE overlay, saved as SVG.
# Named colours as RGB tuples scaled to the 0-1 range.
mpi_blue = (51/255, 165/255, 195/255)
mpi_red = (120/255, 0/255, 75/255)
mpi_green = (0/255, 118/255, 1/255)
mpi_grey = (56/255, 60/255, 60/255)
import scipy.stats as stats
from matplotlib.ticker import MaxNLocator
plt.figure(figsize=(10, 6))
# histplot replaces the deprecated distplot; stat="density" and cut=3 keep
# the normalised histogram and the KDE extent that distplot used by default.
sns.histplot(
    x=df['conversion CO2'],
    bins=20,
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
    color=mpi_blue,
)
# Limit the number of ticks on both axes
max_ticks = 6  # Adjust this value to control the number of ticks
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=False, nbins=max_ticks))
plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True, nbins=max_ticks))
plt.savefig('X_CO2_density.svg', bbox_inches="tight")
plt.show()

In [None]:
# Density histogram of 'temperature in K' with a KDE overlay, saved as SVG.
mpi_blue = (51/255, 165/255, 195/255)
import scipy.stats as stats
plt.figure()
# histplot replaces the deprecated distplot; stat="density" and cut=3 keep
# the normalised histogram and the KDE extent that distplot used by default.
sns.histplot(
    x=df['temperature in K'],
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
    color=mpi_blue,
)
plt.savefig('temp_density.svg', bbox_inches="tight")
plt.show()

In [None]:
# boxplot of the target variable
# The data is passed via the x keyword: a positional data vector is
# deprecated in seaborn 0.11 and is interpreted as `data` in later releases.
sns.boxplot(x=df['conversion CO2'])

### Explore Input Variables

In [None]:
# use describe to get summary statistics of the input variables
df_input.describe()

In [None]:
# Density histograms (with KDE overlay) of the five input variables,
# one panel per variable.
fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(12, 6.1))
hist_columns = (
    'pressure in bar',
    'inlet temperature in K',
    'temperature in K',
    'residence time in s',
    'portion of product in feed',
)
for panel, column_name in zip(axs, hist_columns):
    # histplot replaces the deprecated distplot; stat="density" and cut=3
    # keep the normalised histogram and KDE extent distplot used by default.
    sns.histplot(
        x=df[column_name],
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
        ax=panel,
    )
# Run the layout pass after drawing so it can account for the axis labels.
fig.tight_layout()

### Exploring Target-Input Relationships

In [None]:
# Only let matplotlib's axes logger emit messages of level ERROR and above.
matplotlib_axes_logger.setLevel('ERROR')
# Scatter plots of 'conversion CO2' against each input variable,
# one panel per input.
fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(12, 6.1))
var_pressure = 'pressure in bar'
var_temp_in = 'inlet temperature in K'
var_temp = 'temperature in K'
var_velocity = 'residence time in s'  # holds the residence time column despite its name
var_reflux = 'portion of product in feed'
scatter_columns = (var_pressure, var_temp_in, var_temp, var_velocity, var_reflux)
for panel, column_name in zip(axs, scatter_columns):
    # Plot straight from df; no intermediate two-column frame is needed.
    df.plot.scatter(x=column_name, y='conversion CO2', ylim=(-0.1, 1), ax=panel)
# Run the layout pass after drawing so it can account for the axis labels.
fig.tight_layout()

In [None]:
# Boxplots of the five input variables, one panel per variable.
fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(12, 6.1))
box_columns = (
    'pressure in bar',
    'inlet temperature in K',
    'temperature in K',
    'residence time in s',
    'portion of product in feed',
)
for panel, column_name in zip(axs, box_columns):
    # Pass the data via the x keyword: a positional data vector is deprecated
    # in seaborn 0.11 and is interpreted as `data` in later releases.
    sns.boxplot(x=df[column_name], ax=panel)
# Run the layout pass after drawing so it can account for the axis labels.
fig.tight_layout()

In [None]:
# Correlation matrix of df, drawn as an annotated heatmap and saved as SVG.
import matplotlib.colors as mc
# Anchor colours of the colormap (RGB scaled to 0-1): low end, middle, high end.
mpi_blue = (51/255, 165/255, 195/255)  # #33a5c3
mpi_red = (120/255, 0/255, 75/255)  # #78004b
mpi_grey = (230/255, 230/255, 230/255)  # #e6e6e6
colors = [mpi_blue, mpi_grey, mpi_red]
# NOTE(review): gamma=0.5 together with vmin=-0.3333 / vmax=1 appears chosen
# so that the grey middle colour coincides with a correlation of 0 - confirm.
div_palette = mc.LinearSegmentedColormap.from_list("mycmap", colors, gamma=0.5)
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
heatmap_options = dict(
    cbar=True,
    annot=True,
    vmin=-0.3333,
    vmax=1,
    square=True,
    cmap=div_palette,
    annot_kws={"size": 15},
)
# Draw on the axes created above (this is also the current axes).
sns.heatmap(corrmat, ax=ax, **heatmap_options)
plt.savefig('correlation_matrix.svg', bbox_inches="tight")
plt.show()

### Missing Data

In [None]:
# Missing data per column: absolute number of nulls and their share of all rows.
null_mask = df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
# Note: 'Percent' holds a fraction between 0 and 1, not a value scaled to 100.
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
# If any cell is null, print the first rows that contain a null value and
# draw the missingno matrix for the whole frame.
null_mask = df.isnull()
if null_mask.any(axis=None):
    print("\nPreview of data with null values:\nxxxxxxxxxxxxx")
    rows_with_nulls = df[null_mask.any(axis=1)]
    print(rows_with_nulls.head(3))
    missingno.matrix(df)
    plt.show()

### Duplicated Entries

In [None]:
# Report how many rows repeat an earlier row and, if there are any, show the
# first few members of the duplicate groups sorted by all columns.
duplicate_count = int(df.duplicated().sum())
if duplicate_count == 0:
    print("No duplicated entries found")
else:
    print("No. of duplicated entries: ", duplicate_count)
    # keep=False marks every member of a duplicate group, not only the repeats
    all_duplicates = df[df.duplicated(keep=False)]
    print(all_duplicates.sort_values(by=list(df.columns)).head())

### Anomaly detection

In [None]:
# Count, per input column, the entries that are less than or equal to 0
(df_input <= 0).sum()