In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

from numpy.random import randn
from numpy.random import randint
from pathlib import Path
import os

Other resources   
https://www.skytowner.com/explore/pandas_dataframe_equals_method

https://datatofish.com/descriptive-statistics-pandas/

# Summary

- Re-indexing
- Drop entry
- Selecting entries
- Data alignment
- Rank and sort
- Summary statistics
- Missing data
- Index hierarchy

In [None]:
# Build the CSV path relative to the directory the notebook is run from,
# then load the file into a DataFrame and display it.
base_dir = Path().resolve()
csv_file = os.path.join(base_dir, 'textdata_in_mp003.csv')
df = pd.read_csv(csv_file)
df

In [None]:
# Reindex rows and columns in one call; any requested label that df does not
# have is created and filled with fill_value instead of NaN.
df.reindex(
    [1, 1, 0, 1, 'a'],
    columns=['Won', 'Lost', 'Won', 'Mana'],
    fill_value='suka',
)

In [None]:
# Take the first five rows and four columns of df by position.
# .copy() makes df2 an independent frame, so adding the 'ratio' column below
# cannot raise SettingWithCopyWarning or depend on whether the selection was a view.
df2 = df.iloc[0:5, [1, 3, 4, 5]].copy()

# Won/Lost ratio, rounded to two decimals.
df2['ratio'] = (df2['Won'] / df2['Lost']).round(2)
df2

In [None]:
# Drop rows labelled 2 and 3 together with the 'Won' and 'Lost' columns.
# The result is a new frame; df2 is not reassigned here.
df2.drop(
    index=[2, 3],
    columns=['Won', 'Lost'],
)

In [None]:
# Drop only the rows labelled 2 and 3; all columns are kept.
df2.drop(index=[2, 3])

In [None]:
# Label-based selection: .loc slices include both end labels, so this returns
# rows 0 through 3. Passing ['Won'] as a list keeps the result a one-column DataFrame.
df2.loc[0:3, ['Won']]

In [None]:
# Summary statistics for every column; include='all' also reports on
# non-numeric columns instead of restricting the summary to numeric ones.
df2.describe(include='all')

# Development

Series re-indexing

In [None]:
# Re-indexing a Series
ser1 = Series(data=[1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
print(ser1)

# Asking for a label the Series does not have ('novalue') yields NaN for it
ser2 = ser1.reindex(index=['A', 'B', 'C', 'D', 'novalue'])
print(ser2)

# With fill_value, every label missing from the original index gets 0 instead of NaN
ser3 = ser1.reindex(
    index=['A', 'B', 'C', 'D', 'novalue', 'fillwithzero'],
    fill_value=0,
)
print(ser3)

Forward filling

In [None]:
# Forward-fill a sparse Series onto a denser index
ser3 = Series(data=['USA', 'CAN', 'MEX'], index=[0, 3, 7])
print(ser3)

# Positions 0..11 that the Series should cover after re-indexing
ranger = [*range(12)]
print(type(ranger))

# method='ffill' carries the last known value forward into each new position
ser3.reindex(index=ranger, method='ffill')

DataFrame 

In [None]:
# 5x5 frame of random normal values with labelled rows and columns
# (note there is deliberately no 'row4')
dframe = DataFrame(
    randn(25).reshape(5, 5),
    index=['row1', 'row2', 'row3', 'row5', 'row6'],
    columns=['col1', 'col2', 'col3', 'col4', 'col5'],
)
print(dframe)

# Re-index the ROWS: 'row4' does not exist, so it appears filled with NaN
dframe2 = dframe.reindex(index=['row1', 'row2', 'row3', 'row4', 'row5', 'row6'])
print(dframe2)

# Re-index the COLUMNS: 'col6' does not exist, so it appears filled with NaN
dframe3 = dframe2.reindex(columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6'])
print(dframe3)

# Re-index ROWS and COLUMNS in a single call ('row4' and 'col6' are NaN)
dframe4 = dframe.reindex(
    index=['row1', 'row2', 'row3', 'row4', 'row5', 'row6'],
    columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6'],
)
print(dframe4)

Drop Entry

In [None]:
# Pull one row and one column out of the DataFrame by position;
# each selection comes back as a Series.
row_series = dframe.iloc[1, :]
col_series = dframe.iloc[:, 1]

# Series.drop removes the entry carrying the given index label
print(row_series.drop('col1'))
print(col_series.drop('row2'))

# DataFrame.drop removes a whole column or a whole row by label
print(dframe.drop(columns=['col1']))
print(dframe.drop(index=['row2']))


Data alignment

In [None]:
# Two frames whose row and column labels only partly overlap
dframe1 = DataFrame(
    np.arange(0, 25, 1).reshape(5, 5),
    index=['row1', 'row2', 'row3', 'row4', 'row6'],
    columns=['col1', 'col2', 'col3', 'col4', 'col5'],
)
print(dframe1)
dframe2 = DataFrame(
    np.arange(0, 16, 1).reshape(4, 4),
    index=['row1', 'row2', 'row3', 'row4'],
    columns=['col1', 'col2', 'col3', 'col4'],
)
print(dframe2)

# Plain "+" aligns on labels and leaves NaN wherever only one frame has a value
print(dframe1 + dframe2)

# .add with fill_value=0 treats a value missing from one frame as 0,
# so the other frame's value is kept instead of turning into NaN
print(dframe2.add(dframe1, fill_value=0))
print(dframe1.add(dframe2, fill_value=0))

In [None]:
# Take the second row and the second column of dframe1 as Series
row_series = dframe1.iloc[1, :]
print(row_series)

col_series = dframe1.iloc[:, 1]
print(col_series)

In [None]:
# Adding a Series to a DataFrame
print(dframe1)
print(row_series.name)
print(row_series)

# By default the Series index is matched against the DataFrame's COLUMN labels,
# so row_series (labelled col1..col5) is added to every row.
print(dframe1 + row_series)

print(col_series)

# col_series is labelled by ROW, so it has to be aligned along the row index
# (axis=0); with the default axis no labels would match and every cell would be NaN.
# The original passed fill_value=0 to print(), which raises TypeError. It is
# dropped here because the row labels of col_series and dframe1 already match,
# so there is nothing to fill.
print(dframe1.add(col_series, axis=0))

### Sorting

In [None]:
# Take the second row and the second column of dframe1 as Series
# (same selection as in the data-alignment section, repeated for the sorting demo)
row_series = dframe1.iloc[1, :]
print(row_series)

col_series = dframe1.iloc[:, 1]
print(col_series)

In [None]:
# Order the entries by their values (ascending is the default)
print(row_series.sort_values())
# Order the entries by their index labels
print(row_series.sort_index())
# Rank of each entry among the values of the Series
print(row_series.rank())

Descriptive Statistics

In [None]:
# Small 2x3 array with one missing value per row
arr = np.array([
    [1, 2, np.nan],
    [np.nan, 5, 6],
])
print(arr)

# Wrap it in a DataFrame with labelled rows and columns
dfame1 = DataFrame(data=arr, index=['A', 'B'], columns=['one', 'two', 'three'])
print(dfame1)

In [None]:
# Total of each row (summing across the columns); NaN entries are skipped
print(dfame1.sum(axis='columns'))
# Total of each column (summing down the rows)
print(dfame1.sum(axis='index'))

In [None]:
# Smallest value in each row
print(dfame1.min(axis='columns'))
# Smallest value in each column
print(dfame1.min(axis='index'))

# Column label holding the smallest value of each row
print(dfame1.idxmin(axis='columns'))
# Row label holding the smallest value of each column
print(dfame1.idxmin(axis='index'))

In [None]:
# Descriptive statistics for each column of dfame1
dfame1.describe()

In [None]:
from IPython.display import YouTubeVideo
# Background material on covariance and correlation.
# Video credit: Brandon Foltz.

# Embed the covariance video in the cell output (the argument is the YouTube video id)
YouTubeVideo('xGbpuFNR1ME')



In [None]:
# Embed the correlation video; relies on YouTubeVideo imported in the previous cell
YouTubeVideo('4EXNedimDMs')

### Covariance - Correlation 

In [None]:
# TBD

### Missing Data

In [None]:
# Series of strings with one missing entry (np.nan) at position 2
data = Series(data=['one', 'two', np.nan, 'four'])
data

In [None]:
# Spot missing values: boolean mask that is True where the entry is NA.
# Printed explicitly, because a notebook cell only displays its LAST expression
# and the mask would otherwise be computed and silently discarded.
print(data.isnull())

# Drop missing values: returns a new Series without the NA entries (data is unchanged)
data.dropna()

In [None]:
# In a DataFrame we need to be a little more careful!
dframe = DataFrame([[0.0,0.25,0.5,0.75,1.0],[1.0,2,3,1,np.nan],[2,5,6,np.nan,np.nan],[np.nan,np.nan,np.nan,np.nan,np.nan],[4,np.nan,np.nan,np.nan,np.nan]])
print(dframe)

# Removes ALL rows that contains ONE OR MORE
# print(dframe.dropna())  

# Remove only ROWS with ALL NAs
# print(dframe.dropna(how='all')) 

# Remove only COLS with ALL NAs
# print(dframe.dropna(axis =1, how='all'))


# # Remove only ROWS with ALL NAs
print(dframe.dropna(thresh=2)) 

# # Remove only COLS with more than 2 NaN
print(dframe.dropna(axis=1, thresh=2))

In [None]:
# Replace every NA with the same constant
print(dframe.fillna(value=100))

# Replace NA per column: the dict maps column label -> fill value
# (columns not listed in the dict keep their NA values)
print(dframe.fillna(value={0: 110, 1: 111, 2: 112, 3: 113}))

### Index Hierarchy

In [None]:
# Build a frame with a two-level (hierarchical) index on both rows and columns:
# passing a list of two label lists creates one index level per list.
dframe1 = DataFrame(
    np.arange(0, 25, 1).reshape(5, 5),
    index=[
        [1, 1, 1, 2, 2],
        ['row1', 'row2', 'row3', 'row4', 'row6'],
    ],
    columns=[
        [1, 1, 1, 2, 2],
        ['col1', 'col2', 'col3', 'col4', 'col5'],
    ],
)
print(dframe1)

In [None]:
# sort by index --> Not covered


In [None]:
# Name the index levels
# Name the two levels of the row index
dframe1.index.names = ['row_idx_1', 'row_idx_2']
print(dframe1)

# Name the two levels of the column index
dframe1.columns.names = ['col_idx_1', 'col_idx_2']
print(dframe1)

# Swap levels. swaplevel returns a NEW frame, so the results are printed;
# as bare expressions in the middle of a cell they would be silently discarded.
print(dframe1.swaplevel('col_idx_1', 'col_idx_2', axis=1))
print(dframe1.swaplevel('row_idx_1', 'row_idx_2', axis=0))

# Operation at a specific level.
# sum(level=...) was deprecated in pandas 1.3 and removed in 2.0; grouping by
# the level and summing is the replacement. For the column level the frame is
# transposed, grouped on what is now a row level, and transposed back.
print(dframe1.T.groupby(level='col_idx_1').sum().T)
dframe1.groupby(level='row_idx_1').sum()

# SUMMARY - RECAP

- Re-indexing
- Drop entry
- Selecting entries
- Data alignment
- Rank and sort (sort_values; sort_index; rank)
- Summary statistics (sum, min, max, descriptive Stats)
- Missing data (dropna, fillna)
- Index hierarchy