#### part 01 in 03: import csv

In [1]:
import pandas as pd
import numpy as np

# read melted_data.csv as missing_data with first column as index:
missing_data = pd.read_csv('melted_data.csv', index_col=0)

# shape:
print(missing_data.shape)

# check:
missing_data.head()


(252879, 3)


Unnamed: 0,datetime,client_id,KwH
512164,2013-06-04 14:00:00,15,13.946281
512165,2013-06-04 15:00:00,15,193.698347
512166,2013-06-04 16:00:00,15,193.698347
512167,2013-06-04 17:00:00,15,196.280992
512168,2013-06-04 18:00:00,15,181.301653


#### part 02 of 03: full df

In [2]:
# get cols 'client_id' and 'KwH':
miss_df = missing_data[['client_id', 'KwH']]

# rename column to 'values':
miss_df = miss_df.rename(columns={'KwH': 'values'})

# rename column id_col to 'id':
miss_df = miss_df.rename(columns={'client_id': 'id'})

# change 0 to NaN:
miss_df = miss_df.replace(0, np.nan)

# insert index starting at 1:
miss_df.insert(0, 'index', miss_df.index + 1)

# get length of df:
length = len(miss_df)
    
# create column with 1 and np.nan:
miss_df['bool'] = np.where(miss_df['values'].notnull(), 1, np.nan)

# create column with cumulative sum of chunks of missing data that restarts to 1 at each non-missing value:
miss_df['cumsum'] = miss_df['bool'].isnull().astype(int).groupby(miss_df['bool'].notnull().astype(int).cumsum()).cumsum()

# create col 'plus' with values from 'cumsum' added by 0.1 skiping 0s:
miss_df['plus'] = miss_df['cumsum'].where(miss_df['cumsum'] != 0, np.nan) + 0.1

# fill NaN with 0s:
miss_df['plus'] = miss_df['plus'].fillna(0)

# create col 'diff' with diff for 'plus' column:
miss_df['diff'] = miss_df['plus'].diff()

# create col 'start_index' that use .eq() for 1.1 in column 'diff' and return index values from 'index' column added by 1:
miss_df['start_index'] = miss_df[miss_df['diff'].eq(1.1)]['index']

# get diff of 'plus' column shifted one row up:
miss_df['diff_up'] = miss_df['plus'].diff().shift(-1)

# create col 'end_index' with index value if .lt(-1.1):
miss_df['end_index'] = miss_df[miss_df['diff_up'].le(-1.1)]['index']

# create column 'sum_1' with value for .eq(1) in 'cumsum' column:
miss_df['sum_1'] = miss_df[miss_df['cumsum'].eq(1)]['cumsum']

# create column 'sum_2' with value for .eq(2) in 'cumsum' column:
miss_df['sum_2'] = miss_df[miss_df['cumsum'].eq(2)]['cumsum']

# change column sum 2 one row up:
miss_df['sum_2'] = miss_df['sum_2'].shift(-1)

# fill NaN for cols 'sum_1' and 'sum_2' with 0s:
miss_df['sum_1'] = miss_df['sum_1'].fillna(0)
miss_df['sum_2'] = miss_df['sum_2'].fillna(0)

# create column 'diff_12' with diff for 'sum_1' and 'sum_2' columns:
miss_df['diff_12'] = miss_df['sum_1'] - miss_df['sum_2']

# create column 'start_index_one' with index value if .eq(1) in 'diff_12' column:
miss_df['start_index_one'] = miss_df[miss_df['diff_12'].eq(1)]['index']

# create column 'end_index_one' with same values as 'start_index_one' column:
miss_df['end_index_one'] = miss_df['start_index_one']

# fill 'end_index' with values from 'end_index_one' column olny when .eq(1) for 'end_index_one' column:
miss_df['end_index'] = np.where(miss_df['end_index_one'].eq(1), miss_df['end_index_one'], miss_df['end_index'])

# check:
miss_df.head(20)

Unnamed: 0,index,id,values,bool,cumsum,plus,diff,start_index,diff_up,end_index,sum_1,sum_2,diff_12,start_index_one,end_index_one
512164,512165,15,13.946281,1.0,0,0.0,,,0.0,,0.0,0.0,0.0,,
512165,512166,15,193.698347,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512166,512167,15,193.698347,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512167,512168,15,196.280992,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512168,512169,15,181.301653,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512169,512170,15,166.838843,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512170,512171,15,169.421488,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512171,512172,15,182.334711,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512172,512173,15,191.632231,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,
512173,512174,15,161.157025,1.0,0,0.0,0.0,,0.0,,0.0,0.0,0.0,,


#### part 03 of 03: final product: start_end_df

In [3]:
# rename miss_df to start_end_df:
start_end_df = miss_df

# drop columns except 'id', 'start_index' and 'end_index':
start_end_df = start_end_df.drop(['values', 'bool', 'cumsum', 'plus', 'diff', 'diff_up', 'sum_1', 'sum_2', 'diff_12', 'start_index_one', 'end_index_one'], axis=1)

# name index 'idx':
start_end_df.index.name = 'idx'

# check:
start_end_df.head(20)

Unnamed: 0_level_0,index,id,start_index,end_index
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
512164,512165,15,,
512165,512166,15,,
512166,512167,15,,
512167,512168,15,,
512168,512169,15,,
512169,512170,15,,
512170,512171,15,,
512171,512172,15,,
512172,512173,15,,
512173,512174,15,,


In [4]:
# fill 0 with NaN except for col 'id': ***PROBLEM HERE!!! START_INDEX FOR ID 348 BECOMES 348!!!
start_end_df = start_end_df.replace(0, np.nan)

# drop rows with NaN on both columns:
start_end_df = start_end_df.dropna(how='all')

# save rows with equal values in new df 'one_missing_df', including 'id' column:
one_missing_df = start_end_df[start_end_df['start_index'] == start_end_df['end_index']]

# slice df 'start_end_df' to remove rows with equal values:
start_end_df = start_end_df[start_end_df['start_index'] != start_end_df['end_index']]

# split 'start_end_df' in two dfs including 'id' column:
start_df = start_end_df[['id', 'start_index']].copy()
end_df = start_end_df[['id', 'end_index']].copy()

# rename col 'id' from 'start_df' to 'id_start':
start_df = start_df.rename(columns={'id': 'id_start'})

# rename col 'id' from 'end_df' to 'id_end':
end_df = end_df.rename(columns={'id': 'id_end'})

# remove rows of 'start_df' with NaN in 'start_index' column, without considering column 'id_start':
start_df = start_df.dropna(subset=['start_index']).reset_index(drop=True)

# remove rows of 'end_df' with NaN in 'end_index' column, without considering column 'id_end':
end_df = end_df.dropna(subset=['end_index']).reset_index(drop=True)

# rejoin 'start_df' and 'end_df' in 'start_end_df':
start_end_df = pd.concat([start_df, end_df], axis=1)

# drop 'id_end' column:
start_end_df = start_end_df.drop(columns=['id_end'])

# rename 'id_start' column to 'id':
start_end_df = start_end_df.rename(columns={'id_start': 'id'})

# concat values from 'one_missing_df' to 'start_end_df' without columns names:
start_end_df = pd.concat([start_end_df, one_missing_df], axis=0, ignore_index=True)

# sort values by 'start_index' column:
start_end_df = start_end_df.sort_values(by='start_index')

# reset index:
start_end_df = start_end_df.reset_index(drop=True)

# add column 'count' with diff 'end_index' - 'start_index' + 1:
start_end_df['count'] = start_end_df['end_index'] - start_end_df['start_index'] + 1

# create col 'pct' using count div by length mul by 100 to get percentage:
start_end_df['pct'] = (start_end_df['count'] / length * 100).round(2)

# if exists, drop 'index' column:
start_end_df = start_end_df.drop(columns=['index'], errors='ignore')

# check:
start_end_df.head(20)

Unnamed: 0,id,start_index,end_index,count,pct
0,15,513544.0,516182.0,2639.0,1.04
1,15,519328.0,519328.0,1.0,0.0
2,66,2290003.0,2290003.0,1.0,0.0
3,66,2298907.0,2298907.0,1.0,0.0
4,66,2303635.0,2303641.0,7.0,0.0
5,66,2303647.0,2303647.0,1.0,0.0
6,66,2303658.0,2303665.0,8.0,0.0
7,66,2303671.0,2303671.0,1.0,0.0
8,66,2303683.0,2303689.0,7.0,0.0
9,66,2303695.0,2303695.0,1.0,0.0


In [5]:

# concat values from 'one_missing_df' to 'start_end_df' without columns names:
start_end_df = pd.concat([start_end_df, one_missing_df], axis=0, ignore_index=True)

# sort values by 'start_index' column:
start_end_df = start_end_df.sort_values(by='start_index')

# reset index:
start_end_df = start_end_df.reset_index(drop=True)

# add column 'count' with diff 'end_index' - 'start_index' + 1:
start_end_df['count'] = start_end_df['end_index'] - start_end_df['start_index'] + 1

# create col 'pct' using count div by length mul by 100 to get percentage:
start_end_df['pct'] = (start_end_df['count'] / length * 100).round(2)

# subtract 1 from 'start_index' column and 'end_index' column:
start_end_df['start_index'] = start_end_df['start_index'] - 1
start_end_df['end_index'] = start_end_df['end_index'] - 1

# sort by 'count' grouped by 'id':
start_end_df = start_end_df.groupby('id').apply(lambda x: x.sort_values(['count'], ascending=False)).reset_index(drop=True)

# check:
start_end_df.head()

Unnamed: 0,id,start_index,end_index,count,pct,index
0,15,513543.0,516181.0,2639.0,1.04,
1,15,519327.0,519327.0,1.0,0.0,519328.0
2,15,519327.0,519327.0,1.0,0.0,
3,66,2308505.0,2308523.0,19.0,0.01,
4,66,2307906.0,2307922.0,17.0,0.01,


In [12]:
# insert column 'datetime_start' after 'start_index' column:
try:
    start_end_df.insert(2, 'datetime_start', '')
except:
    pass

# insert column 'datetime_end' after 'end_index' column:
try:
    start_end_df.insert(4, 'datetime_end', '')
except:
    pass

# using 'start_index' column from 'start_end_df' and index from 'melted_data.csv', get values from 'datetime' column and insert in 'datetime_start' column:
start_end_df['datetime_start'] = missing_data.loc[start_end_df['start_index'], 'datetime'].values

# same as above but for 'end_index' column:
start_end_df['datetime_end'] = missing_data.loc[start_end_df['end_index'], 'datetime'].values

# if exists, drop 'index' column:
start_end_df = start_end_df.drop(columns=['index'], errors='ignore')

# remove decimals from 'start_index', 'end_index' and 'count' columns:
start_end_df['start_index'] = start_end_df['start_index'].astype(int)
start_end_df['end_index'] = start_end_df['end_index'].astype(int)
start_end_df['count'] = start_end_df['count'].astype(int)

# transform columns 'datetime_start' and 'datetime_end' to datetime:
start_end_df['datetime_start'] = pd.to_datetime(start_end_df['datetime_start'])
start_end_df['datetime_end'] = pd.to_datetime(start_end_df['datetime_end'])

# save to csv 'missing_data.csv':
start_end_df.to_csv('missing_data.csv')

# check:
start_end_df.head(50)

Unnamed: 0,id,start_index,datetime_start,end_index,datetime_end,count,pct
0,15,513543,2013-08-01 01:00:00,516181,2013-11-18 23:00:00,2639,1.04
1,15,519327,2014-03-30 01:00:00,519327,2014-03-30 01:00:00,1,0.0
2,15,519327,2014-03-30 01:00:00,519327,2014-03-30 01:00:00,1,0.0
3,66,2308505,2014-05-05 00:00:00,2308523,2014-05-05 18:00:00,19,0.01
4,66,2307906,2014-04-10 01:00:00,2307922,2014-04-10 17:00:00,17,0.01
5,66,2308674,2014-05-12 01:00:00,2308690,2014-05-12 17:00:00,17,0.01
6,66,2308794,2014-05-17 01:00:00,2308810,2014-05-17 17:00:00,17,0.01
7,66,2308578,2014-05-08 01:00:00,2308594,2014-05-08 17:00:00,17,0.01
8,66,2308530,2014-05-06 01:00:00,2308546,2014-05-06 17:00:00,17,0.01
9,66,2307882,2014-04-09 01:00:00,2307897,2014-04-09 16:00:00,16,0.01


# compilation of code from cells above:

In [15]:
import pandas as pd
import numpy as np

# read melted_data.csv as missing_data with first column as index:
missing_data = pd.read_csv('melted_data.csv', index_col=0)

# get cols 'client_id' and 'KwH':
miss_df = missing_data[['client_id', 'KwH']]

# rename column to 'values':
miss_df = miss_df.rename(columns={'KwH': 'values'})

# rename column id_col to 'id':
miss_df = miss_df.rename(columns={'client_id': 'id'})

# change 0 to NaN:
miss_df = miss_df.replace(0, np.nan)

# insert index starting at 1:
miss_df.insert(0, 'index', miss_df.index + 1)

# get length of df:
length = len(miss_df)
    
# create column with 1 and np.nan:
miss_df['bool'] = np.where(miss_df['values'].notnull(), 1, np.nan)

# create column with cumulative sum of chunks of missing data that restarts to 1 at each non-missing value:
miss_df['cumsum'] = miss_df['bool'].isnull().astype(int).groupby(miss_df['bool'].notnull().astype(int).cumsum()).cumsum()

# create col 'plus' with values from 'cumsum' added by 0.1 skiping 0s:
miss_df['plus'] = miss_df['cumsum'].where(miss_df['cumsum'] != 0, np.nan) + 0.1

# fill NaN with 0s:
miss_df['plus'] = miss_df['plus'].fillna(0)

# create col 'diff' with diff for 'plus' column:
miss_df['diff'] = miss_df['plus'].diff()

# create col 'start_index' that use .eq() for 1.1 in column 'diff' and return index values from 'index' column added by 1:
miss_df['start_index'] = miss_df[miss_df['diff'].eq(1.1)]['index']

# get diff of 'plus' column shifted one row up:
miss_df['diff_up'] = miss_df['plus'].diff().shift(-1)

# create col 'end_index' with index value if .lt(-1.1):
miss_df['end_index'] = miss_df[miss_df['diff_up'].le(-1.1)]['index']

# create column 'sum_1' with value for .eq(1) in 'cumsum' column:
miss_df['sum_1'] = miss_df[miss_df['cumsum'].eq(1)]['cumsum']

# create column 'sum_2' with value for .eq(2) in 'cumsum' column:
miss_df['sum_2'] = miss_df[miss_df['cumsum'].eq(2)]['cumsum']

# change column sum 2 one row up:
miss_df['sum_2'] = miss_df['sum_2'].shift(-1)

# fill NaN for cols 'sum_1' and 'sum_2' with 0s:
miss_df['sum_1'] = miss_df['sum_1'].fillna(0)
miss_df['sum_2'] = miss_df['sum_2'].fillna(0)

# create column 'diff_12' with diff for 'sum_1' and 'sum_2' columns:
miss_df['diff_12'] = miss_df['sum_1'] - miss_df['sum_2']

# create column 'start_index_one' with index value if .eq(1) in 'diff_12' column:
miss_df['start_index_one'] = miss_df[miss_df['diff_12'].eq(1)]['index']

# create column 'end_index_one' with same values as 'start_index_one' column:
miss_df['end_index_one'] = miss_df['start_index_one']

# fill 'end_index' with values from 'end_index_one' column olny when .eq(1) for 'end_index_one' column:
miss_df['end_index'] = np.where(miss_df['end_index_one'].eq(1), miss_df['end_index_one'], miss_df['end_index'])

# rename miss_df to start_end_df:
start_end_df = miss_df

# drop columns except 'id', 'start_index' and 'end_index':
start_end_df = start_end_df.drop(['values', 'bool', 'cumsum', 'plus', 'diff', 'diff_up', 'sum_1', 'sum_2', 'diff_12', 'start_index_one', 'end_index_one'], axis=1)

# name index 'idx':
start_end_df.index.name = 'idx'

# fill 0 with NaN except for col 'id': ***PROBLEM HERE!!! START_INDEX FOR ID 348 BECOMES 348!!!
start_end_df = start_end_df.replace(0, np.nan)

# drop rows with NaN on both columns:
start_end_df = start_end_df.dropna(how='all')

# save rows with equal values in new df 'one_missing_df', including 'id' column:
one_missing_df = start_end_df[start_end_df['start_index'] == start_end_df['end_index']]

# slice df 'start_end_df' to remove rows with equal values:
start_end_df = start_end_df[start_end_df['start_index'] != start_end_df['end_index']]

# split 'start_end_df' in two dfs including 'id' column:
start_df = start_end_df[['id', 'start_index']].copy()
end_df = start_end_df[['id', 'end_index']].copy()

# rename col 'id' from 'start_df' to 'id_start':
start_df = start_df.rename(columns={'id': 'id_start'})

# rename col 'id' from 'end_df' to 'id_end':
end_df = end_df.rename(columns={'id': 'id_end'})

# remove rows of 'start_df' with NaN in 'start_index' column, without considering column 'id_start':
start_df = start_df.dropna(subset=['start_index']).reset_index(drop=True)

# remove rows of 'end_df' with NaN in 'end_index' column, without considering column 'id_end':
end_df = end_df.dropna(subset=['end_index']).reset_index(drop=True)

# rejoin 'start_df' and 'end_df' in 'start_end_df':
start_end_df = pd.concat([start_df, end_df], axis=1)

# drop 'id_end' column:
start_end_df = start_end_df.drop(columns=['id_end'])

# rename 'id_start' column to 'id':
start_end_df = start_end_df.rename(columns={'id_start': 'id'})

# concat values from 'one_missing_df' to 'start_end_df' without columns names:
start_end_df = pd.concat([start_end_df, one_missing_df], axis=0, ignore_index=True)

# sort values by 'start_index' column:
start_end_df = start_end_df.sort_values(by='start_index')

# reset index:
start_end_df = start_end_df.reset_index(drop=True)

# add column 'count' with diff 'end_index' - 'start_index' + 1:
start_end_df['count'] = start_end_df['end_index'] - start_end_df['start_index'] + 1

# create col 'pct' using count div by length mul by 100 to get percentage:
start_end_df['pct'] = (start_end_df['count'] / length * 100).round(2)

# if exists, drop 'index' column:
start_end_df = start_end_df.drop(columns=['index'], errors='ignore')

# check:
start_end_df.head(20)

# concat values from 'one_missing_df' to 'start_end_df' without columns names:
start_end_df = pd.concat([start_end_df, one_missing_df], axis=0, ignore_index=True)

# sort values by 'start_index' column:
start_end_df = start_end_df.sort_values(by='start_index')

# reset index:
start_end_df = start_end_df.reset_index(drop=True)

# add column 'count' with diff 'end_index' - 'start_index' + 1:
start_end_df['count'] = start_end_df['end_index'] - start_end_df['start_index'] + 1

# create col 'pct' using count div by length mul by 100 to get percentage:
start_end_df['pct'] = (start_end_df['count'] / length * 100).round(2)

# subtract 1 from 'start_index' column and 'end_index' column:
start_end_df['start_index'] = start_end_df['start_index'] - 1
start_end_df['end_index'] = start_end_df['end_index'] - 1

# sort by 'count' grouped by 'id':
start_end_df = start_end_df.groupby('id').apply(lambda x: x.sort_values(['count'], ascending=False)).reset_index(drop=True)

# check:
start_end_df.head()

# insert column 'datetime_start' after 'start_index' column:
try:
    start_end_df.insert(2, 'datetime_start', '')
except:
    pass

# insert column 'datetime_end' after 'end_index' column:
try:
    start_end_df.insert(4, 'datetime_end', '')
except:
    pass

# using 'start_index' column from 'start_end_df' and index from 'melted_data.csv', get values from 'datetime' column and insert in 'datetime_start' column:
start_end_df['datetime_start'] = missing_data.loc[start_end_df['start_index'], 'datetime'].values

# same as above but for 'end_index' column:
start_end_df['datetime_end'] = missing_data.loc[start_end_df['end_index'], 'datetime'].values

# if exists, drop 'index' column:
start_end_df = start_end_df.drop(columns=['index'], errors='ignore')

# remove decimals from 'start_index', 'end_index' and 'count' columns:
start_end_df['start_index'] = start_end_df['start_index'].astype(int)
start_end_df['end_index'] = start_end_df['end_index'].astype(int)
start_end_df['count'] = start_end_df['count'].astype(int)

# transform columns 'datetime_start' and 'datetime_end' to datetime:
start_end_df['datetime_start'] = pd.to_datetime(start_end_df['datetime_start'])
start_end_df['datetime_end'] = pd.to_datetime(start_end_df['datetime_end'])

# remove duplicate rows:
start_end_df = start_end_df.drop_duplicates()

# save to csv 'missing_data.csv':
start_end_df.to_csv('missing_data.csv')

# check:
start_end_df.head(50)

Unnamed: 0,id,start_index,datetime_start,end_index,datetime_end,count,pct
0,15,513543,2013-08-01 01:00:00,516181,2013-11-18 23:00:00,2639,1.04
1,15,519327,2014-03-30 01:00:00,519327,2014-03-30 01:00:00,1,0.0
3,66,2308505,2014-05-05 00:00:00,2308523,2014-05-05 18:00:00,19,0.01
4,66,2307906,2014-04-10 01:00:00,2307922,2014-04-10 17:00:00,17,0.01
5,66,2308674,2014-05-12 01:00:00,2308690,2014-05-12 17:00:00,17,0.01
6,66,2308794,2014-05-17 01:00:00,2308810,2014-05-17 17:00:00,17,0.01
7,66,2308578,2014-05-08 01:00:00,2308594,2014-05-08 17:00:00,17,0.01
8,66,2308530,2014-05-06 01:00:00,2308546,2014-05-06 17:00:00,17,0.01
9,66,2307882,2014-04-09 01:00:00,2307897,2014-04-09 16:00:00,16,0.01
10,66,2307329,2014-03-17 00:00:00,2307340,2014-03-17 11:00:00,12,0.0


# transform function bellow to do what cell above do:

In [None]:
def find_start_end_df(df, id_col, column_name):
    
    # create a copy of id_col and column_name with its index:
    miss_df = df[[id_col, column_name]].copy()
    
    # rename column to 'values':
    miss_df = miss_df.rename(columns={column_name: 'values'})

    # rename column id_col to 'id':
    miss_df = miss_df.rename(columns={id_col: 'id'})

    # change 0 to NaN:
    miss_df = miss_df.replace(0, np.nan)

    # insert index starting at 1:
    miss_df.insert(0, 'index', miss_df.index + 1)

    # get length of df:
    length = len(miss_df)
    
    # create column with 1 and np.nan:
    miss_df['bool'] = np.where(miss_df['values'].notnull(), 1, np.nan)

    # create column with cumulative sum of chunks of missing data that restarts to 1 at each non-missing value:
    miss_df['cumsum'] = miss_df['bool'].isnull().astype(int).groupby(miss_df['bool'].notnull().astype(int).cumsum()).cumsum()


    # create col 'plus' with values from 'cumsum' added by 0.1 skiping 0s:
    miss_df['plus'] = miss_df['cumsum'].where(miss_df['cumsum'] != 0, np.nan) + 0.1

    # fill NaN with 0s:
    miss_df['plus'] = miss_df['plus'].fillna(0)


    # create col 'diff' with diff for 'plus' column:
    miss_df['diff'] = miss_df['plus'].diff()

    # create col 'start_index' that use .eq() for 1.1 in column 'diff' and return index values from 'index' column added by 1:
    miss_df['start_index'] = miss_df[miss_df['diff'].eq(1.1)]['index']


    # get diff of 'plus' column shifted one row up:
    miss_df['diff_up'] = miss_df['plus'].diff().shift(-1)

    # create col 'end_index' with index value if .lt(-1.1):
    miss_df['end_index'] = miss_df[miss_df['diff_up'].le(-1.1)]['index']


    # create column 'sum_1' with value for .eq(1) in 'cumsum' column:
    miss_df['sum_1'] = miss_df[miss_df['cumsum'].eq(1)]['cumsum']

    # create column 'sum_2' with value for .eq(2) in 'cumsum' column:
    miss_df['sum_2'] = miss_df[miss_df['cumsum'].eq(2)]['cumsum']

    # change column sum 2 one row up:
    miss_df['sum_2'] = miss_df['sum_2'].shift(-1)

    # fill NaN for cols 'sum_1' and 'sum_2' with 0s:
    miss_df['sum_1'] = miss_df['sum_1'].fillna(0)
    miss_df['sum_2'] = miss_df['sum_2'].fillna(0)

    # create column 'diff_12' with diff for 'sum_1' and 'sum_2' columns:
    miss_df['diff_12'] = miss_df['sum_1'] - miss_df['sum_2']

    # create column 'start_index_one' with index value if .eq(1) in 'diff_12' column:
    miss_df['start_index_one'] = miss_df[miss_df['diff_12'].eq(1)]['index']

    # create column 'end_index_one' with same values as 'start_index_one' column:
    miss_df['end_index_one'] = miss_df['start_index_one']

    # fill 'end_index' with values from 'end_index_one' column olny when .eq(1) for 'end_index_one' column:
    miss_df['end_index'] = np.where(miss_df['end_index_one'].eq(1), miss_df['end_index_one'], miss_df['end_index'])

    # copy 'start_index' and 'id' to df 'start_end_df':
    start_end_df = miss_df[['start_index', 'id']].copy() 
    
    # add column 'end_index' to df 'start_end_df':
    start_end_df = pd.concat([start_end_df, miss_df['end_index']], axis=1)

    # fill NaN with 0s:
    start_end_df = start_end_df.fillna(0)

    # change type to int:
    start_end_df = start_end_df.astype(int)

    # first row of 'start_index' and 'end_index' columns are equal the max of the two columns:
    start_end_df.iloc[0, 0] = max(start_end_df.iloc[0, 0], start_end_df.iloc[0, 1])

    # last row of 'start_index' and 'end_index' columns are equal the max of the two columns:
    start_end_df.iloc[-1, 1] = max(start_end_df.iloc[-1, 0], start_end_df.iloc[-1, 1])

    # fill 0 with NaN:
    start_end_df = start_end_df.replace(0, np.nan)

    # drop rows with NaN on both columns:
    start_end_df = start_end_df.dropna(how='all')

    # save rows with equal values in new df 'one_missing_df':
    one_missing_df = start_end_df[start_end_df['start_index'] == start_end_df['end_index']]

    # slice df 'start_end_df' to remove rows with equal values:
    start_end_df = start_end_df[start_end_df['start_index'] != start_end_df['end_index']]

    # split 'start_end_df' in two dfs:
    start_df = start_end_df[['start_index']]
    end_df = start_end_df[['end_index']]

    # remove NaN from 'start_df' and reset index:
    start_df = start_df.dropna().reset_index(drop=True)

    # remove NaN from 'end_df' and reset index:
    end_df = end_df.dropna().reset_index(drop=True)

    # rejoin 'start_df' and 'end_df' in 'start_end_df':
    start_end_df = pd.concat([start_df, end_df], axis=1)

    # concat values from 'one_missing_df' to 'start_end_df' without columns names:
    start_end_df = pd.concat([start_end_df, one_missing_df], axis=0, ignore_index=True)

    # sort values by 'start_index' column:
    start_end_df = start_end_df.sort_values(by='start_index')

    # reset index:
    start_end_df = start_end_df.reset_index(drop=True)

    # add column 'count' with diff 'end_index' - 'start_index' + 1:
    start_end_df['count'] = start_end_df['end_index'] - start_end_df['start_index'] + 1

    # create col 'pct' using count div by length mul by 100 to get percentage:
    start_end_df['pct'] = (start_end_df['count'] / length * 100).round(2)

    # sort df by 'count' column descending and reset index:
    # start_end_df = start_end_df.sort_values(by='count', ascending=False).reset_index(drop=True)

    return start_end_df