# KPI data preparation #

The purpose of this notebook is to prepare the KPI data (together with the accompanying response and interaction datasets) for model training. We do the following:

1. handle missing values,
2. create a few labels for semi-supervised training.

In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [248]:
# Load the responses file: pipe-separated, with the single column named 'score'.
df_responses = pd.read_csv(
    '19-01-2021-datasets/dataset-responses-15680x1.csv',
    names=['score'],
    sep='|',
)
df_responses.shape

(15680, 1)

In [249]:
# Every response starts unlabelled (NaN). Scores above 8 get label 0 and
# scores of 6 or below get label 1; anything in between stays NaN.
df_responses['label'] = np.nan
df_responses.loc[df_responses['score'] > 8, 'label'] = 0
df_responses.loc[df_responses['score'] <= 6, 'label'] = 1

In [250]:
# Shapes of the label-0 rows, the label-1 rows, and all labelled rows.
label_col = df_responses['label']
(
    df_responses[label_col == 0].shape,
    df_responses[label_col == 1].shape,
    df_responses[label_col.notnull()].shape,
)


((5665, 2), (7110, 2), (12775, 2))

In [261]:
# Fraction of labelled responses that carry label 1. Computed from the data
# rather than from hand-copied counts, so it stays correct if the labelling
# thresholds change.
n_positive = int((df_responses['label'] == 1).sum())
n_labelled = int(df_responses['label'].notnull().sum())
n_positive / n_labelled

0.5565557729941292

In [251]:
# Write only the labelled rows, cast to int, as a pipe-separated file
# without header or index.
labelled_responses = df_responses[df_responses['label'].notnull()]
labelled_responses.astype(int).to_csv("responses.csv", header=None, index=None, sep='|')

In [252]:
# Index values of the labelled responses; used below to select the matching
# rows of the interactions table.
ids = df_responses.index[df_responses['label'].notnull()].values

In [253]:
# Read the interactions with 5000 integer-named columns, then drop every
# column that is entirely NaN.
# NOTE(review): presumably the rows have varying lengths and 5000 is an upper
# bound on the row length -- confirm against the data.
df_interactions = pd.read_csv(
    '19-01-2021-datasets/dataset-interactions-15680x1.csv',
    sep=',',
    names=range(5000),
).dropna(axis='columns', how='all')
df_interactions.shape

(15680, 1831)

In [254]:
# Keep only the interaction rows whose index belongs to a labelled response.
df_interactions = df_interactions.loc[ids]
df_interactions.shape

(12775, 1831)

In [257]:
def join_ids(row):
    """Return the non-NaN values of ``row`` as one comma-separated string.

    Each remaining value is truncated to ``int`` before being rendered, so
    ``3.0`` becomes ``'3'``. A row containing only NaN yields ``''``.
    """
    present = (value for value in row.values if not np.isnan(value))
    return ','.join(str(int(value)) for value in present)

In [258]:
# Collapse each row into a single comma-separated string. After this cell
# df_interactions is a Series of strings, not a DataFrame.
df_interactions = df_interactions.apply(join_ids, axis='columns')

In [259]:
# Write one joined-id string per line, without header or index.
# NOTE(review): values that contain commas are wrapped in double quotes by the
# default CSV quoting -- confirm the consumer of this file expects that.
df_interactions.to_csv(
    'interactions.csv',
    header=None,
    index=False,
)

In [2]:
# Load the pipe-separated KPI table.
df_raw = pd.read_csv(
    "19-01-2021-datasets/dataset-kpis-417208x4.csv",
    sep='|',
)
df_raw.shape

(417208, 4)

In [3]:
# Mark missing KPI values with the sentinel 100000.
df_raw = df_raw.fillna(100000)

In [4]:
# Keep only the rows in which no column holds the missing-value sentinel
# (100000). `axis` is passed by keyword because pandas 2.0 no longer accepts
# it positionally in DataFrame.any.
has_missing = (df_raw == 100000).any(axis=1)
df = df_raw[~has_missing]
df.shape

(171403, 4)

In [5]:
# Column names of the complete-rows frame.
df.columns.to_numpy()

array(['avg_signal_power_dBm_3g', 'avg_signal_quality_dB_3g',
       'avg_signal_power_dBm_4g', 'avg_signal_quality_dB_4g'],
      dtype=object)

The threshold rules below pick out clearly bad and clearly good KPI rows; adjust them to change which rows can become labelled data:

In [20]:
# Rule-based selection: a row is "bad" only if all four KPIs fall below the
# lower thresholds, and "good" only if all four exceed the upper thresholds.
power_4g = df['avg_signal_power_dBm_4g']
quality_4g = df['avg_signal_quality_dB_4g']
power_3g = df['avg_signal_power_dBm_3g']
quality_3g = df['avg_signal_quality_dB_3g']

is_bad = (power_4g < -110) & (quality_4g < -13) & (power_3g < -100) & (quality_3g < -10)
is_good = (power_4g > -90) & (quality_4g > -10) & (power_3g > -80) & (quality_3g > -7)

df_bad = df[is_bad]
df_good = df[is_good]

df_bad.shape, df_good.shape

((86, 4), (3896, 4))

In [21]:
# Draw 20 rows from each rule-based subset to serve as labelled examples.
# A fixed random_state makes the draw repeatable across kernel restarts;
# without it the labelled set changes on every run.
SAMPLE_SEED = 42
good_ids = df_good.sample(n=20, random_state=SAMPLE_SEED).index.values
bad_ids = df_bad.sample(n=20, random_state=SAMPLE_SEED).index.values

In [22]:
# Every row starts with the sentinel 100000 in 'label'; the sampled good rows
# are then set to 0 and the sampled bad rows to 1.
df_raw['label'] = 100000
for label_value, row_ids in ((0, good_ids), (1, bad_ids)):
    df_raw.loc[row_ids, 'label'] = label_value

In [23]:
# Replace any remaining NaN with the sentinel 100000.
# NOTE(review): NaNs were already filled with 100000 in an earlier cell, so
# this looks like a no-op safeguard -- confirm before removing.
df_raw = df_raw.replace(to_replace=np.nan, value=100000)

In [29]:
# Write the labels and the KPI columns to separate ';'-separated files,
# both without header or index.
export_options = dict(header=None, index=None, sep=';')
df_raw['label'].to_csv("labels.csv", **export_options)
df_raw[df.columns.values].to_csv("dataset-kpis.csv", **export_options)

  """Entry point for launching an IPython kernel.
