# Find chunks of missing data inside a DataFrame (only for sequences of more than one missing value):

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample data: a leading valid reading, then for n = 2..6 a gap of n NaNs
# followed by n valid readings (41 rows in total, five gaps of growing length).
gap_pattern = [3] + [v for n in range(2, 7) for v in [np.nan] * n + [3] * n]
df = pd.DataFrame({'values': gap_pattern})

# Working frame, separate from `df`: reset_index() returns a new frame in
# which the row labels are exposed as a leading 'index' column.
miss_df = df.reset_index()

print(miss_df)

    index  values
0       0     3.0
1       1     NaN
2       2     NaN
3       3     3.0
4       4     3.0
5       5     NaN
6       6     NaN
7       7     NaN
8       8     3.0
9       9     3.0
10     10     3.0
11     11     NaN
12     12     NaN
13     13     NaN
14     14     NaN
15     15     3.0
16     16     3.0
17     17     3.0
18     18     3.0
19     19     NaN
20     20     NaN
21     21     NaN
22     22     NaN
23     23     NaN
24     24     3.0
25     25     3.0
26     26     3.0
27     27     3.0
28     28     3.0
29     29     NaN
30     30     NaN
31     31     NaN
32     32     NaN
33     33     NaN
34     34     NaN
35     35     3.0
36     36     3.0
37     37     3.0
38     38     3.0
39     39     3.0
40     40     3.0


In [3]:
# Marker column: 1 where 'values' holds data, NaN where it is missing.
has_value = miss_df['values'].notnull()
miss_df['bool'] = np.where(has_value, 1, np.nan)

# Running length of each gap. The cumulative count of valid rows serves as the
# group key, so every valid row opens a new group; summing the "is missing"
# flag inside each group yields 0 on valid rows and 1, 2, 3, ... along the
# run of NaNs that follows.
gap_flag = miss_df['bool'].isnull().astype(int)
run_key = miss_df['bool'].notnull().astype(int).cumsum()
miss_df['cumsum'] = gap_flag.groupby(run_key).cumsum()

# check:
print(miss_df)

    index  values  bool  cumsum
0       0     3.0   1.0       0
1       1     NaN   NaN       1
2       2     NaN   NaN       2
3       3     3.0   1.0       0
4       4     3.0   1.0       0
5       5     NaN   NaN       1
6       6     NaN   NaN       2
7       7     NaN   NaN       3
8       8     3.0   1.0       0
9       9     3.0   1.0       0
10     10     3.0   1.0       0
11     11     NaN   NaN       1
12     12     NaN   NaN       2
13     13     NaN   NaN       3
14     14     NaN   NaN       4
15     15     3.0   1.0       0
16     16     3.0   1.0       0
17     17     3.0   1.0       0
18     18     3.0   1.0       0
19     19     NaN   NaN       1
20     20     NaN   NaN       2
21     21     NaN   NaN       3
22     22     NaN   NaN       4
23     23     NaN   NaN       5
24     24     3.0   1.0       0
25     25     3.0   1.0       0
26     26     3.0   1.0       0
27     27     3.0   1.0       0
28     28     3.0   1.0       0
29     29     NaN   NaN       1
30     3

In [4]:
# 'plus' is the gap counter offset by 0.1 on missing rows (1.1, 2.1, ...)
# and exactly 0 on valid rows, where the counter itself is 0.
gap_len = miss_df['cumsum']
miss_df['plus'] = (gap_len + 0.1).where(gap_len.ne(0), 0.0)

# check:
print(miss_df)

    index  values  bool  cumsum  plus
0       0     3.0   1.0       0   0.0
1       1     NaN   NaN       1   1.1
2       2     NaN   NaN       2   2.1
3       3     3.0   1.0       0   0.0
4       4     3.0   1.0       0   0.0
5       5     NaN   NaN       1   1.1
6       6     NaN   NaN       2   2.1
7       7     NaN   NaN       3   3.1
8       8     3.0   1.0       0   0.0
9       9     3.0   1.0       0   0.0
10     10     3.0   1.0       0   0.0
11     11     NaN   NaN       1   1.1
12     12     NaN   NaN       2   2.1
13     13     NaN   NaN       3   3.1
14     14     NaN   NaN       4   4.1
15     15     3.0   1.0       0   0.0
16     16     3.0   1.0       0   0.0
17     17     3.0   1.0       0   0.0
18     18     3.0   1.0       0   0.0
19     19     NaN   NaN       1   1.1
20     20     NaN   NaN       2   2.1
21     21     NaN   NaN       3   3.1
22     22     NaN   NaN       4   4.1
23     23     NaN   NaN       5   5.1
24     24     3.0   1.0       0   0.0
25     25   

In [6]:
# Row-to-row change of 'plus', kept as a column for inspection.
# diff() is always NaN on the first row because there is no previous row.
miss_df['diff'] = miss_df['plus'].diff()

# A gap starts on the row where the running gap counter 'cumsum' equals 1.
# The integer counter is tested instead of `diff == 1.1` for two reasons:
#   * a gap that begins on the very first row has diff == NaN and would
#     otherwise never be marked as a start;
#   * it avoids an exact equality test on a computed floating-point difference.
# 'start_index' holds the row's 'index' value on gap-start rows and NaN elsewhere.
miss_df['start_index'] = miss_df['index'].where(miss_df['cumsum'].eq(1))

# check:
print(miss_df)

    index  values  bool  cumsum  plus  diff  start_index
0       0     3.0   1.0       0   0.0   NaN          NaN
1       1     NaN   NaN       1   1.1   1.1          1.0
2       2     NaN   NaN       2   2.1   1.0          NaN
3       3     3.0   1.0       0   0.0  -2.1          NaN
4       4     3.0   1.0       0   0.0   0.0          NaN
5       5     NaN   NaN       1   1.1   1.1          5.0
6       6     NaN   NaN       2   2.1   1.0          NaN
7       7     NaN   NaN       3   3.1   1.0          NaN
8       8     3.0   1.0       0   0.0  -3.1          NaN
9       9     3.0   1.0       0   0.0   0.0          NaN
10     10     3.0   1.0       0   0.0   0.0          NaN
11     11     NaN   NaN       1   1.1   1.1         11.0
12     12     NaN   NaN       2   2.1   1.0          NaN
13     13     NaN   NaN       3   3.1   1.0          NaN
14     14     NaN   NaN       4   4.1   1.0          NaN
15     15     3.0   1.0       0   0.0  -4.1          NaN
16     16     3.0   1.0       0

In [7]:
# Change of 'plus' from each row to the NEXT row (diff shifted one row up),
# kept as a column for inspection. It is NaN on the last row because there
# is no following row.
miss_df['diff_up'] = miss_df['plus'].diff().shift(-1)

# A gap ends on a missing row whose successor is not missing. The last row has
# no successor, so the shift is filled with False: a gap that runs to the end
# of the data is then closed on the final row instead of being left without an
# end marker (which would misalign the start/end pairing done afterwards).
# Gaps of length 1 are marked as well, matching the previous `<= -1.1` test.
in_gap = miss_df['cumsum'].gt(0)
next_in_gap = in_gap.shift(-1, fill_value=False).astype(bool)

# 'end_index' holds the row's 'index' value on gap-end rows and NaN elsewhere.
miss_df['end_index'] = miss_df['index'].where(in_gap & ~next_in_gap)

# check:
print(miss_df)

    index  values  bool  cumsum  plus  diff  start_index  diff_up  end_index
0       0     3.0   1.0       0   0.0   NaN          NaN      1.1        NaN
1       1     NaN   NaN       1   1.1   1.1          1.0      1.0        NaN
2       2     NaN   NaN       2   2.1   1.0          NaN     -2.1        2.0
3       3     3.0   1.0       0   0.0  -2.1          NaN      0.0        NaN
4       4     3.0   1.0       0   0.0   0.0          NaN      1.1        NaN
5       5     NaN   NaN       1   1.1   1.1          5.0      1.0        NaN
6       6     NaN   NaN       2   2.1   1.0          NaN      1.0        NaN
7       7     NaN   NaN       3   3.1   1.0          NaN     -3.1        7.0
8       8     3.0   1.0       0   0.0  -3.1          NaN      0.0        NaN
9       9     3.0   1.0       0   0.0   0.0          NaN      0.0        NaN
10     10     3.0   1.0       0   0.0   0.0          NaN      1.1        NaN
11     11     NaN   NaN       1   1.1   1.1         11.0      1.0        NaN

In [8]:
# Collapse the sparse marker columns into dense Series renumbered from 0,
# so that the k-th start lines up with the k-th end.
start_df = miss_df['start_index'].dropna().reset_index(drop=True)
end_df = miss_df['end_index'].dropna().reset_index(drop=True)

# One row per gap: first missing index, last missing index, and the number of
# missing rows (bounds are inclusive, hence the + 1). The frame is keyed on the
# starts; ends are aligned onto that index.
missing_data_index_count = start_df.to_frame(name='start_df').assign(
    end_df=end_df,
    missing_count=lambda gaps: gaps['end_df'] - gaps['start_df'] + 1,
)

# check:
print(missing_data_index_count)

   start_df  end_df  missing_count
0       1.0     2.0            2.0
1       5.0     7.0            3.0
2      11.0    14.0            4.0
3      19.0    23.0            5.0
4      29.0    34.0            6.0
