In [1]:
import pandas as pd
import numpy as np

### NumPy is optimized for speed since its functions are written in C <BR> and wrapped in Python

### NumPy stats functions: when the column contains missing values, use the `nan`-prefixed version of the function
 > the mean, median, var, std, percentile, min, max, sum <br>
 > We use: <br> 
 1. mean &nbsp;&nbsp;       (nanmean)
 2. median &nbsp; &nbsp;    (nanmedian)
 3. var &nbsp; &nbsp;       (nanvar)
 4. std &nbsp; &nbsp;       (nanstd)
 5. percentile &nbsp;&nbsp; (nanpercentile)
 6. min &nbsp; &nbsp;       (nanmin)
 7. max &nbsp; &nbsp;       (nanmax)
 8. sum &nbsp; &nbsp;       (nansum)

In [2]:
# Four numeric values followed by one missing entry (NaN).
data = [75, 90, 85, 90, np.nan]

In [3]:
# np.mean does not skip NaN, so the missing entry makes the whole result nan.
np.mean(data)

nan

In [4]:
# np.nanmean ignores the NaN entry and averages the remaining values.
np.nanmean(data)

85.0

In [5]:
# np.nanmin ignores the NaN entry when looking for the smallest value.
np.nanmin(data)

75.0

## Using a dataframe with missing values

In [6]:
# Load the infant-mortality / GNI table, decoding the file as Latin-1.
df = pd.read_csv("InfantMortalityGNI.csv", encoding="latin1")

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Countries,GNI_PER_CAPITA,Infant_Mortality
0,Afghanistan,590.0,73.2
1,Albania,4290.0,14.0
2,Algeria,4800.0,25.5
3,Andorra,,2.8
4,Angola,4040.0,86.5


In [8]:
# Summarize each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 3 columns):
Countries           197 non-null object
GNI_PER_CAPITA      185 non-null float64
Infant_Mortality    193 non-null float64
dtypes: float64(2), object(1)
memory usage: 4.7+ KB


### List the columns in the dataframe that have missing values

In [9]:
# Count the missing entries in each column.
df.isna().sum()

Countries            0
GNI_PER_CAPITA      12
Infant_Mortality     4
dtype: int64

### Since GNI_PER_CAPITA has missing values (NaN), `np.percentile` returns NaN

In [10]:
# Deliberately uses plain np.percentile: the column's NaN values make the result nan.
Tenpercentile = np.percentile(df['GNI_PER_CAPITA'], q=10)
print("The 10th percentile is:", Tenpercentile)

The 10th percentile is: nan


  interpolation=interpolation)


In [11]:
# NaN-aware 10th percentile: missing GNI values are ignored.
tenpct = np.nanpercentile(df['GNI_PER_CAPITA'], q=10)

In [12]:
# Display the NaN-aware 10th percentile.
tenpct

760.00000000000011

In [13]:
# Alternative: drop the missing rows first, then plain np.percentile works.
tenpct2 = np.percentile(df['GNI_PER_CAPITA'].dropna(), q=10)

In [14]:
# Display the result of the dropna-based computation for comparison.
tenpct2

760.00000000000011

### Now we replace percentile with nanpercentile for computation

In [None]:
# Same computation as before, but with the NaN-aware function.
Tenpercentile = np.nanpercentile(df['GNI_PER_CAPITA'], q=10)
print("The 10th percentile is:", Tenpercentile)

### Calculate the median Infant_Mortality

In [None]:
# NaN-aware median of the Infant_Mortality column; the bare name displays it.
Median = np.nanmedian(df['Infant_Mortality'])
Median

### Calculate the sample standard deviation of Infant_Mortality

$S=\sqrt{\frac{\sum{{{\left( x-\overline{x} \right)}^{2}}}}{n-1}}$

### When using std, by default ddof = 0 (computes the population standard deviation); set ddof = 1 for the sample standard deviation

In [None]:
# ddof=1 makes the divisor n - 1, i.e. the sample standard deviation; NaN values are ignored.
Sample_std = np.nanstd(df['Infant_Mortality'], ddof=1)
Sample_std 

### Calculate the population standard deviation of Infant_Mortality

$\sigma=\sqrt{\frac{\sum{{{\left( x-\mu \right)}^{2}}}}{N}}$

In [None]:
# ddof=0 makes the divisor n, i.e. the population standard deviation; NaN values are ignored.
Pop_std = np.nanstd(df['Infant_Mortality'], ddof=0)
Pop_std 

## Calculating a weighted average

In [None]:
# Load the scores-and-weights table, decoding the file as Latin-1.
TestResults = pd.read_csv("Weighted_Scores.csv", encoding="latin1")

In [None]:
# Preview the first 10 rows of the loaded table.
TestResults.head(10)

### Compute unweighted mean

In [None]:
# Without a weights argument, np.average treats every score equally.
Unweighted_Mean = np.average(TestResults['Score'])
print("The unweighted mean is", Unweighted_Mean)

### Compute weighted mean

In [None]:
# Each score is weighted by the matching entry in the Weights column.
Weighted_Mean = np.average(TestResults['Score'], weights=TestResults['Weights'])
print("The weighted mean is", Weighted_Mean)