# Learnings for module Pandas

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

## Set options for printouts

In [2]:
#Set number of columns to be shown
pd.set_option("display.max_columns", 50) # No. of columns of a dataframe
pd.set_option("display.max_rows", 999) # No. of rows of a dataframe
pd.set_option("display.max_seq_items", 999) # No. of list items shown

## Understanding types and data

Get the type of data of a column and transform it into another type.

In [3]:
df_t = pd.DataFrame({"A": [0,1], "B": [2,3]})
print(df_t.A.dtype)

df_t["A"] = df_t.A.astype(str)
print(df_t.A.dtype)

int64
object


Show all values and their counts in a specific column of the DataFrame and sort in descending order.

In [4]:
df_t.A.value_counts().sort_values(ascending=False)

1    1
0    1
Name: A, dtype: int64

## Modifying Axis

### Rename columns

In [5]:
df_ren = pd.DataFrame({"A": [0,1], "B": [2,3]})
print(df_ren)
print("\n")

df_ren = df_ren.rename(columns={"A": "First", "B": "Second"})
print(df_ren)

   A  B
0  0  2
1  1  3


   First  Second
0      0       2
1      1       3


### Rename Label of index and columns axis

In [6]:
df_ren = df_ren.rename_axis("Ind", axis="rows").rename_axis("Cols", axis="columns")
df_ren

Cols,First,Second
Ind,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,2
1,1,3


## Maps and Apply

In [7]:
df_m = pd.DataFrame({"A": [0,1], "B": [2,3]})
df_m

Unnamed: 0,A,B
0,0,2
1,1,3


In [8]:
#Add 1 to values of column "B" and add as column "C"
df_m["C"] = df_m.B.map(lambda x: x + 1)
df_m

Unnamed: 0,A,B,C
0,0,2,3
1,1,3,4


## Appending data to DataFrame

In [9]:
df_ap_in = pd.DataFrame({"A": [0,1], "B": [2,3], "C": [4,5]})
df_ap = df_ap_in.append({"A": np.nan, "B": 1, "C": np.nan}, ignore_index=True)
df_ap

Unnamed: 0,A,B,C
0,0.0,2.0,4.0
1,1.0,3.0,5.0
2,,1.0,


In [10]:
df_conc = pd.concat([df_ap, df_ap_in], sort=False)
df_conc

Unnamed: 0,A,B,C
0,0.0,2.0,4.0
1,1.0,3.0,5.0
2,,1.0,
0,0.0,2.0,4.0
1,1.0,3.0,5.0


## Coping with missing values (null, NaN)

### Analysing missing values

Function isnull() returns boolean value

In [36]:
df_n = pd.DataFrame({"A": [0,np.nan,3], "B": [2,3,1], "C": [np.nan,3,np.nan]})
df_n.isnull()

Unnamed: 0,A,B,C
0,False,False,True
1,True,False,False
2,False,False,True


Print all columns with the number of NaN values

In [37]:
df_n.isna().sum()

A    1
B    0
C    2
dtype: int64

Print only columns with NaN values - # of NaN values

In [38]:
null_count = (df_n.isnull().sum())
print(null_count[null_count > 0])

A    1
C    2
dtype: int64


Print only columns with NaN values - % of NaN values

In [39]:
missing = df_n.isna().mean()
missing[missing > 0].sort_values(ascending=False)

C    0.666667
A    0.333333
dtype: float64

In [40]:
df_n.loc[df_n.C.isna()]

Unnamed: 0,A,B,C
0,0.0,2,
2,3.0,1,


Create list of columns with missing values

In [41]:
list_of_cols = [col for col in df_n.columns
               if df_n[col].isnull().any()]

list_of_cols

['A', 'C']

### Fill missing values - manually

In [42]:
df_n_fill = df_n.copy()
df_n_fill["A"] = df_n_fill.A.fillna(0)
df_n_fill

Unnamed: 0,A,B,C
0,0.0,2,
1,0.0,3,3.0
2,3.0,1,


In [43]:
df_n_fill = df_n_fill.fillna(np.random.randint(0,3))
df_n_fill

Unnamed: 0,A,B,C
0,0.0,2,1.0
1,0.0,3,3.0
2,3.0,1,1.0


### Filling missing values - imputer

In [44]:
si = SimpleImputer()
df_n_si = pd.DataFrame(si.fit_transform(df_n))
df_n_si

Unnamed: 0,0,1,2
0,0.0,2.0,3.0
1,1.5,3.0,3.0
2,3.0,1.0,3.0


## Describing and Summarizing data

In [46]:
df_unique_count = pd.DataFrame({"A": ["A","A","A","D","E"], "B": ["C","D","E","E","F"], "C":["G","C","F","B","E"]})
df_unique_count.describe(exclude='number')[:2].T.sort_values('unique')

Unnamed: 0,count,unique
A,5,3
B,5,4
C,5,5


## Pivoting DataFrames

No aggregation required

In [None]:
df_piv = pd.DataFrame([[0,2,3],[1,3,4],[3,1,2]], columns=["A", "B", "C"])
print(df_piv)
df_piv_p = df_piv.pivot(index="A", columns="B", values="C")
print("\n")
print(df_piv_p)

Aggregation required, due to multiple combinations of new index with target columns

In [None]:
df_piv2 = pd.DataFrame([[0,2,3],[1,3,4],[0,1,2]], columns=["A", "B", "C"])
print(df_piv2)
df_piv2_p = df_piv2.pivot_table(index="A", columns="B", values="C", aggfunc=np.sum)
print("\n")
print(df_piv2_p)