# List of Exercises 1 - Exercise 3

*Student: Luigi Lucas de Carvalho Silva / luigi.lcsilva@gmail.com*

First of all, let us import some useful packages.

In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import os
import scipy.integrate as integrate
import scipy.special as special
import scipy.interpolate as interpolate
import scipy.stats as stats

Defining the third distribution of the first exercise.

In [2]:
### Definition of the third distribution of the first exercise.
def exerc1_distrib3(x, x_0, sigma):
    y = (1/(np.sqrt(2*np.pi)*sigma))*(np.exp(-(x-x_0)**2/(2*sigma**2)) - np.exp(-(x+x_0)**2/(2*sigma**2)))*(1/special.erf(x_0/(np.sqrt(2)*sigma)))
    return y

Defining common parameters for the distribution.

In [3]:
### Common parameters for the third distribution.
x_0 = 10
sigma_third = 5

Defining the CDF of the third function.

In [4]:
### Defining the third function CDF. For this, I integrate the function in a interval around some sigmas of the
### mean, and then I interpolate these values to generate the CDF.

### Here is the x interval where I will do the integration. I am integrating regions in steps of "step",
### because I will make a "cumsum()" in the integrals array for obtaining the values of the CDF.
sigma = sigma_third
n_sigma = 8                                    #Numbers of sigma for the interpolation to include.
interval_division = 200000            #Number of divisions of the total region.
step = 2*n_sigma*sigma/interval_division
### I take care here for not taking negative values in the integration limits.
if (x_0-n_sigma*sigma)>0:
    x_comp = np.arange(x_0-n_sigma*sigma, x_0+n_sigma*sigma+step, step)
else:
    x_comp = np.arange(0, x_0+n_sigma*sigma+step, step)
        
### Now, I do the integration in each region and apply the "cumsum()".
y_cdf = np.array([tup[0] for tup in [integrate.quad(exerc1_distrib3, a, b, args=(x_0, sigma))
                                     for a, b in [(a, b) for a, b in zip(x_comp, x_comp[1:len(x_comp)])]]]
                                     + [0]).cumsum()

### Finally, I obtain the interpolated function. I exclude the last point because it usually leads to
### problems (I think it has to do with the integration limits definition).
scipy_exerc1_distrib3_cdf = interpolate.interp1d(x_comp[0:-1], y_cdf[0:-1], kind='linear')

Defining the inverse CDF for the third distribution.

In [5]:
### Points for interpolation. Here I get x points that are inside n_sigma*sigma around the mean.
n_sigma_third = 8
step_third = 2*n_sigma_third*sigma_third/200000

if x_0-n_sigma_third*sigma_third>0:
    x_min_third = x_0-n_sigma_third*sigma_third
    x_max_third = x_0+n_sigma_third*sigma_third
else:
    x_min_third = 0
    x_max_third = x_0+n_sigma_third*sigma_third

x_values_third = np.arange(x_min_third, x_max_third, step_third)
cdf_values_third = scipy_exerc1_distrib3_cdf(x_values_third)

### Defining the inverse CDF from a cubic interpolation using scipy.interpolate:
scipy_exerc1_distrib3_inv_cdf = interpolate.interp1d(cdf_values_third, x_values_third, kind='linear')

Defining some estimators.

In [6]:
def mean_est(x):
    'x must be an array with more than 1 number.'
    mean = x.sum()/len(x)
    return mean

def var_est(x, mean):
    'x must be an array with more than 1 number.'
    var = ((x-mean)**2).sum()/len(x)
    return var

def skew_est(x, mean, var):
    'x must be an array with more than 1 number.'
    skew = (1/(len(x)*var**(3/2)))*((x-mean)**3).sum()
    return skew

def curtosis_est(x, mean, var):
    'x must be an array with more than 1 number.'
    curtosis = (1/(len(x)*var**2))*((x-mean)**4).sum() - 3
    return curtosis

### Generating random numbers for different N

In [7]:
rnd_quantities = np.arange(80000000, 120000000, 5000000)

mean_third = []
var_third = []
skew_third = []
curtosis_third = []

j=1
for i in rnd_quantities:
    ### Seed for the random numbers (comment this for not getting fixed random numbers).
    np.random.seed(seed=j)
    
    ### Generating a uniform random sample for each size.
    unif_rnd_third = np.random.random_sample(size=i)

    ### Third distribution random sample.
    x = scipy_exerc1_distrib3_inv_cdf(unif_rnd_third)
    
    ### Estimating the mean.
    mean_x = mean_est(x)
    mean_third.append(mean_x)
    
    ### Estimating the variance.
    var_x = var_est(x, mean_x)
    var_third.append(var_x)
    
    ### Estimating the skew.
    skew_x = skew_est(x, mean_x, var_x)
    skew_third.append(skew_x)
    
    ### Estimating the curtosis.
    curtosis_x = curtosis_est(x, mean_x, var_x)
    curtosis_third.append(curtosis_x)
    
    j+=1

In [8]:
df_stat = pd.DataFrame()

### Defining new dataframes with the data.
df_quantities = pd.DataFrame(data=rnd_quantities)
df_mean = pd.DataFrame(data=mean_third)
df_var = pd.DataFrame(data=var_third)
df_skew = pd.DataFrame(data=skew_third)
df_curtosis = pd.DataFrame(data=curtosis_third)
   
### Concatening these dataframes with the main one.
df_stat = pd.concat((df_stat,df_quantities),axis=1)
df_stat = pd.concat((df_stat,df_mean),axis=1)
df_stat = pd.concat((df_stat,df_var),axis=1)
df_stat = pd.concat((df_stat,df_skew),axis=1)
df_stat = pd.concat((df_stat,df_curtosis),axis=1)

df_stat.columns = ['Quantity of Numbers','Mean', 'Var', 'Skew', 'Curtosis']

In [9]:
df_stat

Unnamed: 0,Quantity of Numbers,Mean,Var,Skew,Curtosis
0,80000000,10.476964,20.899768,0.280295,-0.208545
1,85000000,10.477309,20.894873,0.279683,-0.209959
2,90000000,10.476786,20.894649,0.280248,-0.208917
3,95000000,10.476035,20.892339,0.279781,-0.209478
4,100000000,10.476036,20.897044,0.279877,-0.209768
5,105000000,10.477015,20.896736,0.279769,-0.209622
6,110000000,10.476189,20.892413,0.280144,-0.209158
7,115000000,10.476173,20.896432,0.280317,-0.20856


After a lot of attempts, I would say that Skew and Curtosis are stabilized in two decimal places for something about $N\sim10^{8}$.