In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import warnings
# The seaborn version in use emits a lot of warnings; silence them to keep cell output readable.
# NOTE(review): no category or module is passed, so this hides every warning raised later
# in the notebook (pandas warnings included), not only seaborn's.
warnings.filterwarnings('ignore')
# Use seaborn's 'whitegrid' style for plots.
sns.set_style('whitegrid')

import missingno as msno # missing data visualization module for Python
import pandas_profiling

import gc
import datetime

%matplotlib inline
# Keep seaborn's current color palette in `color`; none of the cells shown here read it.
color = sns.color_palette()

### idxmin() and idxmax()

In [None]:
# Small integer series: the minimum (1) appears twice, the maximum (124) once.
x = pd.Series(
    [1, 3, 2, 8, 124, 4, 2, 1]
)

In [None]:
# Index label of the smallest value. The value 1 sits at labels 0 and 7; the first one is returned.
x.idxmin()

0

In [None]:
# Index label of the largest value (124).
x.idxmax()

4

### ne()

In [None]:
# Single-column frame: a run of leading zeros followed by non-zero values.
df = pd.DataFrame({'X': [0, 0, 0, 0, 0, 0, 1, 3, 2, 4, 3, 12, 7]})

In [None]:
# Element-wise "not equal": True wherever X differs from 0.
df['X'].ne(0)

0     False
1     False
2     False
3     False
4     False
5     False
6      True
7      True
8      True
9      True
10     True
11     True
12     True
Name: X, dtype: bool

In [None]:
# idxmax() on the boolean mask gives the label of the first True, i.e. the first non-zero X.
df['X'].ne(0).idxmax()

6

In [None]:
# Drop the leading zeros: slice from the first non-zero row through the end.
first_nonzero = df['X'].ne(0).idxmax()
df.loc[first_nonzero:]

Unnamed: 0,X
6,1
7,3
8,2
9,4
10,3
11,12
12,7


### nsmallest() and nlargest()

In [None]:
# Example scoreboard: one row per person with their points.
df = pd.DataFrame(dict(
    Name=['Bob', 'Mark', 'Steph', 'Jess', 'Becky'],
    Points=[55, 98, 46, 77, 81],
))

In [None]:
# Show the whole frame (five rows).
df

Unnamed: 0,Name,Points
0,Bob,55
1,Mark,98
2,Steph,46
3,Jess,77
4,Becky,81


In [None]:
# The three rows with the lowest Points, in ascending order.
df.nsmallest(n=3, columns='Points')

Unnamed: 0,Name,Points
2,Steph,46
0,Bob,55
3,Jess,77


In [None]:
# The three rows with the highest Points, in descending order.
df.nlargest(n=3, columns='Points')

Unnamed: 0,Name,Points
1,Mark,98
4,Becky,81
3,Jess,77


### cut()

In [None]:
# Bin Points into 5 equal-width intervals covering the observed range.
pd.cut(x=df['Points'], bins=5)

0    (45.948, 56.4]
1      (87.6, 98.0]
2    (45.948, 56.4]
3      (66.8, 77.2]
4      (77.2, 87.6]
Name: Points, dtype: category
Categories (5, interval[float64]): [(45.948, 56.4] < (56.4, 66.8] < (66.8, 77.2] < (77.2, 87.6] < (87.6, 98.0]]

### pivot_table()

In [None]:
# Name is used for both rows and columns, so only the diagonal holds a value
# (that person's Points); every other cell is NaN.
df.pivot_table(
    values='Points',
    index='Name',
    columns='Name',
)

Name,Becky,Bob,Jess,Mark,Steph
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Becky,81.0,,,,
Bob,,55.0,,,
Jess,,,77.0,,
Mark,,,,98.0,
Steph,,,,,46.0


https://www.kaggle.com/mengwangk/kernelb53fef5f6e/edit

In [10]:
# Mixed ints, strings and a None give an object-dtype array, so all three
# columns arrive as dtype `object` and column 'c' has one missing value.
df2 = pd.DataFrame(
    np.array([[1, 2, 'x'], [4, 5, 'y'], [7, 8, None]], dtype=object),
    columns=list('abc'),
)
df2.head()

Unnamed: 0,a,b,c
0,1,2,x
1,4,5,y
2,7,8,


In [11]:
# Column dtypes and non-null counts; all three columns come through as object.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
a    3 non-null object
b    3 non-null object
c    2 non-null object
dtypes: object(3)
memory usage: 200.0+ bytes


## Check missing values

In [12]:
# Count missing values per column, most-affected columns first.
missing_per_column = df2.isnull().sum()
missing_per_column.sort_values(ascending=False)

c    1
b    0
a    0
dtype: int64

In [13]:
# Rows that contain at least one missing value.
has_missing = df2.isnull().any(axis=1)
df2[has_missing].head()

Unnamed: 0,a,b,c
2,7,8,


In [14]:
# Drop every row that has at least one missing value.
df_new = df2.dropna(axis=0, how='any')
df_new

Unnamed: 0,a,b,c
0,1,2,x
1,4,5,y


In [15]:
# Confirm that no column has missing values after the drop.
remaining_missing = df_new.isnull().sum()
remaining_missing.sort_values(ascending=False)

c    0
b    0
a    0
dtype: int64

In [16]:
# Change column type: cast column 'a' from object to int64.
# astype() with a column->dtype mapping returns a new frame, so nothing is
# assigned into the frame that dropna() returned (column assignment there can
# raise SettingWithCopyWarning, which the global warnings filter would hide).
df_new = df_new.astype({'a': 'int64'})
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 1
Data columns (total 3 columns):
a    2 non-null int64
b    2 non-null object
c    2 non-null object
dtypes: int64(1), object(2)
memory usage: 64.0+ bytes


In [17]:
# Summary statistics rounded to two decimals.
df_new.describe().round(decimals=2)

Unnamed: 0,a
count,2.0
mean,2.5
std,2.12
min,1.0
25%,1.75
50%,2.5
75%,3.25
max,4.0


In [18]:
# Keep only the rows where column 'a' is greater than 1.
rows_above_one = df_new['a'] > 1
df_new = df_new.loc[rows_above_one]
df_new

Unnamed: 0,a,b,c
1,4,5,y
