# Initial imports
We import associated libraries like pandas and numpy to load and interact with dataset.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import stemgraphic

ModuleNotFoundError: No module named 'matplotlib'

# Loading Dataset
We use diabetes dataset in this notebook. So the first step is to load the dataset using pandas library and pandas DataFrame data type.

In [None]:
# Location of the diabetes CSV, relative to this notebook.
dataset_path = "../datasets/diabetes.csv"

# Read the file into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows.
df.head(n=5)

# Random Sampling

In [None]:
# Draw 10 rows at random. A fixed random_state keeps the displayed sample
# identical across "Restart Kernel -> Run All" executions.
df.sample(n=10, random_state=42)

# Scales
## Interval Scale
If a real-valued scale preserves the ratio of any two differences (intervals) when the unit is changed, it is called an Interval Scale.

For example, below we show that column S2, which holds the LDL of each person in mg/dL, is measured on an interval scale.

In [None]:
def get_interval_scale_ratio(x1, x2, x3, x4):
    """Return the ratio of the difference (x4 - x3) to the difference (x2 - x1)."""
    second_gap = x4 - x3
    first_gap = x2 - x1
    return second_gap / first_gap

# Use the first four S2 readings as the sample points for the ratio.
x1, x2, x3, x4 = df['S2'].iloc[:4]
print("The ratio in unit mg/dL is: ", get_interval_scale_ratio(x1, x2, x3, x4))


And now we convert the unit from mg/dL to g/L in order to compare the ratio in the new scale with the previous one.

In [None]:
def mgperdl_to_gperl(x):
    """Convert a value expressed in mg/dL to g/L by multiplying it by 0.01."""
    grams_per_litre_factor = 0.01
    return x * grams_per_litre_factor

# Re-express the same four readings in g/L and recompute the ratio.
y1, y2, y3, y4 = map(mgperdl_to_gperl, (x1, x2, x3, x4))
print("The ratio in unit g/L is: ", get_interval_scale_ratio(y1, y2, y3, y4))


## Ratio Scale
If a real number scale, preserves the ratio, it is called Ratio Scale.
We use x1 and x2 and also y1 and y2 to demonstrate that ldl is a kind of Ratio Scale.

In [None]:
# The same pair of readings is divided in both units so the two ratios can be compared.
for label, numerator, denominator in (("x1 and x2", x1, x2), ("y1 and y2", y1, y2)):
    print("The ratio of " + label + " is: ", numerator / denominator)

# Variables
There are two kinds of variables that is a feature of all data.
1. Grouping Variables
2. Quantitative Variables

## Grouping Variables
A grouping variable is a variable that is used to classify data. It is measured on a nominal or ordinal scale, and the data are classified according to its values.

In [None]:
# Split the rows by the value of the SEX column.
grouped_data = df.groupby('SEX')

# Show the first rows of each group, with a blank line between the two listings.
for position, sex_code in enumerate((1, 2)):
    if position > 0:
        print()
    print("SEX {} group: ".format(sex_code))
    print(grouped_data.get_group(sex_code).head())


# Rounding continuous data 
We can round the values of column S5 to two decimal places and the values of column S2 to zero decimal places at the same time.

In [None]:
# Per-column number of decimals: S5 keeps two, S2 keeps none.
df.round(decimals={'S5': 2, 'S2': 0}).head()

# Frequency
## Types of Frequency
- Absolute Frequency
- Relative Frequency
- Cumulative Frequency
- Cumulative Relative Frequency


We can group S2 column (ldl) to 30 mg/dL width intervals. And then calculate each interval's frequency.

In [None]:
# Assign every S2 value to a class interval of width `n`, with class
# boundaries aligned on `first` (..., first - n, first, first + n, ...).
s2_freq_df = df.copy()
n = 30      # class width
first = 70  # reference boundary the classes are aligned to

# floordiv maps each value v to the left edge k with k <= v < k + n, so the
# interval has to be closed on the left; with the default (right-closed)
# interval a value lying exactly on a boundary would get a class that does
# not contain it. The width comes from `n` rather than a repeated literal.
s2_freq_df["s2_ranges"] = (
    s2_freq_df['S2']
    .sub(first)
    .floordiv(n)
    .mul(n)
    .add(first)
    .map(lambda left: pd.Interval(left, left + n, closed="left"))
)

# Preview only the first rows instead of dumping the whole frame.
s2_freq_df.head()

In [None]:
# Absolute frequency f_i: number of observations in each class, in class order.
absolute_frequencies = (
    s2_freq_df["s2_ranges"]
    .value_counts()
    .sort_index()
    .rename("f_i")
)
print("Absolute Frequencies are as below: ")
print(absolute_frequencies, end="\n\n")

In [None]:
# Relative frequency r_i: share of observations in each class, in class order.
relative_frequencies = (
    s2_freq_df["s2_ranges"]
    .value_counts(normalize=True)
    .sort_index()
    .rename("r_i")
)
print("Relative Frequencies are as below: ")
print(relative_frequencies, end="\n\n")

In [None]:
# Cumulative frequency g_i: running total of the absolute frequencies.
cumulative_frequencies = absolute_frequencies.cumsum().rename("g_i")
print("Cumulative Frequencies are as below: ")
print(cumulative_frequencies, end="\n\n")

In [None]:
# Cumulative relative frequency s_i: running total of the relative frequencies.
cumulative_relative_frequencies = relative_frequencies.cumsum().rename("s_i")
print("Cumulative Relative Frequencies are as below: ")
print(cumulative_relative_frequencies, end="\n\n")

Also we can merge the 4 series into one dataframe

In [None]:
# Put the four frequency series side by side, one column per series.
frequency_columns = [
    absolute_frequencies,
    relative_frequencies,
    cumulative_frequencies,
    cumulative_relative_frequencies,
]
s2_all_freq_df = pd.concat(frequency_columns, axis=1)

# Class mark c_i: midpoint of each class interval.
s2_all_freq_df["c_i"] = [
    (interval.left + interval.right) / 2 for interval in s2_all_freq_df.index
]
s2_all_freq_df

# Plots

### Pie Plot

In [None]:
# Classify every person by BMI and show the share of each class as a pie chart.
bmi_labels = ["UNDERWEIGHT", "HEALTHY", "OVERWEIGHT", "OBESE"]

# The last class is open-ended so BMI values above 40 are still counted as
# OBESE instead of silently becoming NaN. right=False makes each class
# include its lower bound: [0, 18.5), [18.5, 25), [25, 30), [30, inf).
bmi_edges = [0, 18.5, 25, 30, np.inf]

df_bmi_grouped = df.copy()
df_bmi_grouped["BMI_bins"] = pd.cut(
    df_bmi_grouped['BMI'], bins=bmi_edges, labels=bmi_labels, right=False)

# Preview only the first rows instead of printing the whole frame.
print(df_bmi_grouped.head())

# Number of people per class, smallest class first; observed=False keeps
# classes that happen to be empty.
df_bmi_grouped = (
    df_bmi_grouped.groupby("BMI_bins", observed=False).size().sort_values()
)
df_bmi_grouped.plot.pie(y="BMI_bins", startangle=90)


### Bar Plot

In [None]:
# Bar chart of the absolute frequency f_i against the class mark c_i.
s2_all_freq_df.plot(kind="bar", x='c_i', y='f_i')

### Box Plot

In [None]:
# Horizontal box plot of the BP column only. The column is selected
# explicitly because `column=` is only applied by pandas together with `by=`;
# without `by` every numeric column of the frame would be drawn.
df[["BP"]].plot.box(vert=False)

### Line Chart

In [None]:
# Cut S3 into 10 equal-width bins, count the rows per bin and accumulate
# the counts, then draw the running total as a line.
df_s3_grouped = (
    df.assign(S3_bins=pd.cut(df["S3"], bins=10))
    .groupby(["S3_bins"])
    .size()
    .cumsum()
)
df_s3_grouped.plot.line(y="S3_bins")

### Heatmap

In [None]:
# Colour-encode every cell of the dataframe (rows x columns) as a heatmap.
sb.heatmap(data=df)

### Histogram

In [None]:

# Histogram and cumulative histogram of S2 with 30-wide bins from 40 to 250.
# S2 is selected explicitly because `column=` is only applied by pandas
# together with `by=`; without `by` every numeric column would be drawn.
# Plotting from a one-column DataFrame gives each call its own figure.
s2_bin_edges = np.arange(40, 251, 30)
df[["S2"]].plot.hist(bins=s2_bin_edges)
df[["S2"]].plot.hist(bins=s2_bin_edges, cumulative=True)

### Frequency Diagram (+ Cumulative)

In [None]:
# Frequency polygon: class marks on x, absolute frequencies on y, with one
# extra class mark (30 to either side) at zero height so the polygon closes.
class_marks = s2_all_freq_df['c_i'].to_numpy()
x = np.concatenate(([class_marks[0] - 30], class_marks, [class_marks[-1] + 30]))
y = np.pad(s2_all_freq_df['f_i'].to_numpy(), 1)
plt.plot(x, y)

In [None]:
# Cumulative frequency polygon (ogive). A cumulative count refers to "all
# values up to the upper class boundary", so each g_i is plotted at the upper
# boundary of its class (class mark + half the class width). The curve starts
# at 0 on the lower boundary of the first class and must not drop back to 0
# at the end, since a cumulative frequency never decreases.
class_width = 30
class_marks = s2_all_freq_df['c_i'].to_numpy()
x = np.array([class_marks[0] - class_width / 2, *(class_marks + class_width / 2)])
y = np.array([0, *s2_all_freq_df['g_i']])
plt.plot(x, y)

### Normal Frequency Curve

In [None]:
from scipy.stats import norm

# Evaluation grid and the parameters of the normal distribution to draw.
x_axis = np.arange(-20, 20, 0.01)
mean = 4
std = 3

# Draw the density first and the distribution function second, each in its
# own figure.
for curve in (norm.pdf, norm.cdf):
    plt.plot(x_axis, curve(x_axis, mean, std))
    plt.show()

# Estimates of Location

## Mean (Arithmetic)
We can simply find the mean of the data in a column by calling `mean` method on it.

In [None]:
# Arithmetic mean of the S2 column.
df_s2_mean = df.loc[:, "S2"].mean()
print("The mean of column 'S2' is {} mg/dL".format(df_s2_mean))

Also we can call mean method on whole dataframe to represent mean of every column in the table in a Pandas Series.

In [None]:
print("The mean of each column are as below: ")
# Column-wise arithmetic mean, returned as a Series indexed by column name.
df.mean(axis=0)

## Geometric Mean
The pandas dataframe class itself does not provide a method to calculate geometric mean.

So we get help from `scipy` library.

In [None]:
from scipy.stats.mstats import gmean

# Geometric mean of the S2 column.
df_s2_gmean = gmean(df.loc[:, "S2"])
print("The geometric mean of column 'S2' is {} mg/dL".format(df_s2_gmean))

And we can also calculate the geometric mean for every column by the function provided by `scipy`.

In [None]:
# gmean on the whole frame yields one value per column; pair each value with
# its column name and print them one per line.
for col, gm in dict(zip(df.columns, gmean(df))).items():
    print("{} : {}".format(col, gm))

## Harmonic Mean
Another kind of mean is harmonic mean that is also provided by `scipy` library

In [None]:
from scipy.stats.mstats import hmean

# Harmonic mean of the S2 column. It is stored under its own name so it does
# not overwrite `df_s2_gmean`, which holds the geometric mean.
df_s2_hmean = hmean(df["S2"])
print("The harmonic mean of column 'S2' is {} mg/dL".format(df_s2_hmean))

And we can also calculate the harmonic mean for every column by the function provided by `scipy`.

In [None]:
# hmean on the whole frame yields one value per column; pair each value with
# its column name and print them one per line.
for col, gm in dict(zip(df.columns, hmean(df))).items():
    print("{} : {}".format(col, gm))

## Median
We can simply find the median of the data in a column by calling `median` method on it.

In [None]:
# Median of the S2 column.
df_s2_median = df.loc[:, "S2"].median()
print("The median of column 'S2' is {} mg/dL".format(df_s2_median))

We can also calculate the whole dataframe's median easily.

In [None]:
# Column-wise median, returned as a Series indexed by column name.
df.median(axis=0)

## Quantiles
Quantiles also can be calculated by method `quantile`. The quantile 0.5 is exactly the median.

In [None]:
# The 0.5 quantile of every column.
df.quantile(0.5)

But you can also call this method to calculate other quantiles, like 0.6.

In [None]:
# The 0.6 quantile of every column.
df.quantile(0.6)

## Mode
To calculate the mode of each column you can use the `mode` method as below. Note that there may be more than one mode in the data, when several values share the highest frequency.

In [None]:
# Most frequent value(s) of the S2 column.
df.loc[:, 'S2'].mode()

And of course you can call `mode` method on the whole dataframe instead of just one column.

In [None]:
# Most frequent value(s) of every column.
df.mode(axis=0)

## Trimmed Mean
Using scipy we can calculate trimmed mean for a column or even whole dataframe.

In [None]:
from scipy.stats import trim_mean

# Trimmed mean of S2 with proportiontocut=0.15.
df_s2_tmean = trim_mean(df["S2"], proportiontocut=0.15)
print("Trimmed mean of S2 column with proportion cut of 15% is {}.".format(df_s2_tmean))

# Same statistic for every column. The loop variable is deliberately not
# called `mean`: that name is a notebook-level variable used by the
# normal-curve cells and would be overwritten here.
print("Trimmed mean for whole dataframe with proportion cut of 15% is as below: ")
for col, col_tmean in zip(df.columns, trim_mean(df, 0.15)):
    print("{} : {}".format(col, col_tmean))

# Estimates of Variability

## Range
Finding range in python. The difference between min and max.

In [None]:
# Range of S2: largest value minus smallest value.
s2_column = df["S2"]
df_s2_range = s2_column.max() - s2_column.min()
print("The range of column S2 is : {}.".format(df_s2_range))

print("And below is the range of whole dataframe: ")
# Column-wise range, shown as the cell output.
df.max(axis=0) - df.min(axis=0)

## Mean Absolute Deviation
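The mean absolute deviation is the mean of the absolute deviations from the mean. Older pandas versions offered a `mad` method for it (removed in pandas 2.0), so we compute it directly.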

In [None]:
# Mean absolute deviation of S2: average distance of the values from their mean.
df_s2 = df["S2"].copy()
s2_deviations = df_s2 - df_s2.mean()
df_s2_mad = s2_deviations.abs().mean()
print("The mean absolute deviation of column S2 is: {}.".format(df_s2_mad))

print("And the mean absolute deviation of whole dataframe are as below: ")
# Same computation column by column, shown as the cell output.
df.sub(df.mean()).abs().mean()

## Variance and Standard Deviation
Variance and Standard Deviation can be calculated by `var` and `std` method.

In [None]:
# Variance and standard deviation of the S2 column.
s2_values = df["S2"]
df_s2_var, df_s2_std = s2_values.var(), s2_values.std()
print("Variance of column S2 is {} and its standard deviation is {}.".format(df_s2_var, df_s2_std))

# The same two statistics for every column of the dataframe.
for heading, per_column in (
    ("Variance of whole dataframe: ", df.var()),
    ("Standard Deviation of whole dataframe: ", df.std()),
):
    print(heading)
    print(per_column)

## Data Standardization

In [None]:
def standardize_data(data):
    """Centre `data` on its mean and divide by its standard deviation."""
    centred = data - data.mean()
    return centred / data.std()

# Standardize the S2 column and show the result.
standardized_data_s2 = standardize_data(df['S2'])

print("Here's standardized data in another dataframe.")
print(standardized_data_s2)

# Check the two summary statistics of the standardized values, rounded to
# four decimals.
print("The mean of standardized data is 0 and its variance is 1.")
for label, statistic in (
    ("Mean =", standardized_data_s2.mean()),
    ("Var =", standardized_data_s2.var()),
):
    print(label, round(statistic, 4))

## Coefficient of variation

In [None]:
# Coefficient of variation of S2: standard deviation divided by the mean.
s2_values = df["S2"]
df_s2_cv = s2_values.std() / s2_values.mean()
print("Coefficient of Variation of S2 is: {}.".format(df_s2_cv))

print("Coefficient of Variation of whole dataframe are as below: ")
# Column-wise coefficient of variation, shown as the cell output.
df.std().div(df.mean())

## Quarters' half range
To calculate the quarters' (quartiles') half range of some data, we take half of the difference between the third and the first quartile.

In [None]:
# Half of the distance between the third and the first quartile of S2.
s2_q1 = df["S2"].quantile(0.25)
s2_q3 = df["S2"].quantile(0.75)
df_s2_qhr = (s2_q3 - s2_q1) / 2
print("Quarters' half range of column S2 is", df_s2_qhr)

print("Quarters' half range of whole dataframe: ")
# Same computation column by column, shown as the cell output.
df.quantile(0.75).sub(df.quantile(0.25)).div(2)

### Interquartile range
The interquartile range (IQR) is the difference between the third and the first quartile, i.e. twice the quarters' half range. It is an estimate of variability.

In [None]:
# Interquartile range of S2: third quartile minus first quartile.
# (The average of the two quartiles would be the midhinge, which is an
# estimate of location, not the interquartile range.)
df_s2_iqr = df["S2"].quantile(0.75) - df["S2"].quantile(0.25)
print("Interquartile range of column S2 is", df_s2_iqr)

print("Interquartile range of whole dataframe: ")
# Same computation column by column, shown as the cell output.
df.quantile(0.75) - df.quantile(0.25)

# Moment
To calculate the `r`th central moment you can run as below: 

In [None]:
from scipy.stats import moment

# Order of the central moment to compute.
r = 3

# r-th central moment of the S2 column, shown as the cell output.
s2_values = df["S2"]
moment(s2_values, r)

# Skewness

In [None]:
# Skewness of the S6 column.
df.loc[:, "S6"].skew()

- We can calculate whole dataframe's skewness like below.

In [None]:
# Skewness of every column.
df.skew(axis=0)

# Kurtosis

In [None]:
# Kurtosis of the S4 column.
df.loc[:, "S4"].kurtosis()

- We can calculate whole dataframe's kurtosis like below.

In [None]:
# Kurtosis of every column.
df.kurtosis(axis=0)

# Stem and Leaf Diagram

In [None]:
# Stem-and-leaf diagram of the S2 column, drawn with scale=10.
s2_values = df["S2"]
stemgraphic.stem_graphic(s2_values, scale=10)

# Normal Curve

In [None]:
from scipy.stats import norm

# Evenly spaced grid; its own mean and standard deviation parameterise the
# normal density drawn below.
x_axis = np.arange(-20, 20, 0.01)
mean, std = x_axis.mean(), x_axis.std()

# Normal density over the grid.
plt.plot(x_axis, norm.pdf(x_axis, loc=mean, scale=std))
plt.show()

# Horizontal box plot of the grid values themselves.
plt.boxplot(x_axis, vert=False)
plt.show()