# Telco indicent management record




In [1]:
%%html
<style>
table {float:left}
</style>

In [2]:
%%html
<style>
div.output_area pre {
    white-space: pre;
}
</style>

In [3]:
import os
import sys
import gc
import json
import numpy as np
import pandas as pd

# Constant

In [4]:
USER = !whoami
USER = USER[0]

PYSPARK_PYTHON_PATH = f"/home/{USER}/venv/ml/bin/python3"
HADOOP_HOME = "/opt/hadoop/hadoop-3.2.2"
SPARK_HOME = "/opt/spark/spark-3.1.2"

#  Environemnt Variables

## Hadoop

In [5]:
os.environ['HADOOP_CONF_DIR'] = f"{HADOOP_HOME}/etc/hadoop"
os.environ['HADOOP_CONF_DIR']

'/opt/hadoop/hadoop-3.2.2/etc/hadoop'

In [6]:
%%bash -s "$HADOOP_HOME"
export HADOOP_CONF_DIR="$1/etc/hadoop"
ls $HADOOP_CONF_DIR | head -n 3

capacity-scheduler.xml
configuration.xsl
container-executor.cfg


## PYTHONPATH

Refer to the **pyspark** modules to load from the ```$SPARK_HOME/python/lib``` in the Spark installation.

* [PySpark Getting Started](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)

> Ensure the SPARK_HOME environment variable points to the directory where the tar file has been extracted. Update PYTHONPATH environment variable such that it can find the PySpark and Py4J under SPARK_HOME/python/lib. One example of doing this is shown below:

```
export PYTHONPATH=$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH
```

Alternatively install **pyspark** with pip or conda locally which installs the Spark runtime libararies (for standalone).

* [Can PySpark work without Spark?](https://stackoverflow.com/questions/51728177/can-pyspark-work-without-spark)

> As of v2.2, executing pip install pyspark will install Spark. If you're going to use Pyspark it's clearly the simplest way to get started. On my system Spark is installed inside my virtual environment (miniconda) at lib/python3.6/site-packages/pyspark/jars  
> PySpark has a Spark installation installed. If installed through pip3, you can find it with pip3 show pyspark. Ex. for me it is at ~/.local/lib/python3.8/site-packages/pyspark. This is a standalone configuration so it can't be used for managing clusters like a full Spark installation.

In [7]:
# os.environ['PYTHONPATH'] = "/opt/spark/spark-3.1.2/python/lib/py4j-0.10.9-src.zip:/opt/spark/spark-3.1.2/python/lib/pyspark.zip"
sys.path.extend([
    f"{SPARK_HOME}/python/lib/py4j-0.10.9-src.zip",
    f"{SPARK_HOME}/python/lib/pyspark.zip"
])

## PYSPARK_PYTHON

In [8]:
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON_PATH

## JAVA_HOME

In [9]:
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"

## PySpark packages

Execute after the PYTHONPATH setup.

In [10]:
import pyspark.sql 
from pyspark.sql.types import *
from pyspark.sql.functions import (
    col,
    lit,
    when,
    isnan,
    isnull,
    lower,
    upper,
    regexp_replace,
    regexp_extract,
    concat,
    udf,
    array,
    avg,
    stddev,
    to_date,
    to_timestamp,
    from_unixtime,
    year, 
    month,
    months_between,
    add_months,
)

---
# Spark Session


In [11]:
from pyspark.sql import SparkSession

In [12]:
# For YARN cluster
#    .config('spark.yarn.appMasterEnv.PYSPARK_PYTHON', f"/home/{USER}/venv/ml/bin/python3")\

spark = SparkSession.builder\
    .master('yarn') \
    .config('spark.submit.deployMode', 'client') \
    .config('spark.debug.maxToStringFields', 100) \
    .config('spark.executor.memory', '2g') \
    .config('spark.yarn.executorEnv.PYSPARK_PYTHON', f"/home/{USER}/venv/ml/bin/python3")\
    .getOrCreate()

2022-02-27 22:44:18,949 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-02-27 22:44:21,777 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [13]:
NUM_CORES = 4
NUM_PARTITIONS = 3

spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY')

---
# Data

## alarms.json

Each tower has sensors to monitor and report issues. When sensor detects an issue, it generates an alarm.

#### ALARM_SRC 

Column ALARM_SRC contains three elements ```(Network, SubNetwork, MeContext)``` of the tower 

1. Network shows the name of the network and it is a constant i.e. UK_TEL_LON
2. SubNetwork shows the tower on which alarm is raised i.e. LON001
3. MeContext shows the part of the tower having issue i.e. LON001LTE4813

Type of network - ALARM_SRC.MeContext also tells us if the network is 2G, 3G or 4G.
* If MeContext contains LTE it is 4G network
* If MeContext contains UTM it is 3G network
* If MeContext contains GSM it is 2G network

Example:
```{"Network":"UK_TEL_LON","SubNetwork":"LON001","MeContext":"LON001LTE4813"}```


## tickets.dat

If an alarm requires attention or some type of fix, a ticket is raised for Operations. Then the "Operation" team would either resolve it remotely or by visiting the site.

## team.csv
Details about the team who works on solving the ticket.


In [35]:
%%bash
cd ./data
unzip -o data.zip
ls -l 

hdfs dfs -mkdir -p mc
hdfs dfs -put -f team.csv tickets.dat alarms.json mc/

Archive:  data.zip
  inflating: alarms.json             
  inflating: team.csv                
  inflating: tickets.dat             
total 13288
-rw-r--r-- 1 oonisim oonisim 9785106 Feb 27 22:51 alarms.json
-rw-rw-r-- 1 oonisim oonisim 1649258 Feb 27 22:52 data.zip
-rw-r--r-- 1 oonisim oonisim     159 Feb 27 22:51 team.csv
-rw-r--r-- 1 oonisim oonisim 2163571 Feb 27 22:51 tickets.dat


---

# Team

## team_id column
The team.csv has the ```team_id``` column that has **team0NN** format whereas the tickets.csv has the corresponding ```team``` that has **teamNN** format. For instance **team028** in team.csv and **team28** in tickets.csv. Convert the team_id to **teamNN** format.

In [15]:
path_to_csv = "mc/team.csv"
team = spark.read\
    .option("compression", "none")\
    .option("header", True)\
    .option("sep", ',')\
    .option("nullValue", np.nan)\
    .option("inferSchema", True)\
    .csv(path_to_csv)\
    .withColumn("team", concat(lit("team"), regexp_extract(col("team_id"), r'^team[0]*(.*)$', 1)))\
    .drop("team_id")\
    .orderBy(col("team").asc())

team.printSchema()
team.show(5, truncate=False)
team.createOrReplaceTempView("team")
team.cache()

                                                                                

root
 |-- team_size: integer (nullable = true)
 |-- avg_experience_years: integer (nullable = true)
 |-- team: string (nullable = true)



[Stage 2:>                                                          (0 + 1) / 1]                                                                                

+---------+--------------------+------+
|team_size|avg_experience_years|team  |
+---------+--------------------+------+
|6        |6                   |team28|
|6        |6                   |team30|
|6        |7                   |team33|
|3        |6                   |team34|
|4        |5                   |team40|
+---------+--------------------+------+
only showing top 5 rows



DataFrame[team_size: int, avg_experience_years: int, team: string]

# Tickets


In [16]:
path_to_csv = "mc/tickets.dat"
date_format = "yyyy-MM-dd"

all_tickets = spark.read\
    .option("compression", "none")\
    .option("header", True)\
    .option("sep", '|')\
    .option("nullValue", np.nan)\
    .option("inferSchema", True)\
    .csv(path_to_csv)\

num_tickets_total = all_tickets.count()
print("Total tickets {}".format(num_tickets_total))
all_tickets.printSchema()
all_tickets.show(5, truncate=False)

[Stage 4:>                                                          (0 + 1) / 1]                                                                                

Total tickets 30000
root
 |-- ticket_id: string (nullable = true)
 |-- alarm: integer (nullable = true)
 |-- started_ts: string (nullable = true)
 |-- ended_ts: string (nullable = true)
 |-- solved_by: string (nullable = true)
 |-- src_system: string (nullable = true)
 |-- site_visit: integer (nullable = true)
 |-- priority: integer (nullable = true)

+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
|ticket_id|alarm   |started_ts         |ended_ts           |solved_by|src_system|site_visit|priority|
+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
|T_0000001|50025222|2018-09-14 03:29:10|14-09-2018 04.06.13|team78   |OP2       |1         |3       |
|T_0000002|50021238|2018-10-29 03:39:49|29/10/2018 04:25:25|team30   |OP1       |0         |2       |
|T_0000003|50034089|2018-03-31 07:25:48|31/03/2018 07:58:45|team68   |OP1       |0         |3       |
|T_0000004|50021918|2018-08-08 19:

## Tickets - rows whose column X has null

In [17]:
num_tickets_with_null = {}
for column in all_tickets.columns:
    nulls = all_tickets.where(col(column).isNull())
    if nulls.count() > 0:
        num_nulls = all_tickets.where(col("ended_ts").isNull()).count()
        num_tickets_with_null[column] = num_nulls
        print("Column {}: there are {} rows where column is null:".format(
            column,
            num_nulls
        ))
        nulls.show(truncate=False)
    else:
        print("Column {}: all rows are valid.".format(column))

                                                                                

Column ticket_id: all rows are valid.


[Stage 10:>                                                         (0 + 1) / 1]                                                                                

Column alarm: all rows are valid.
Column started_ts: all rows are valid.
Column ended_ts: there are 2974 rows where column is null:
+---------+--------+-------------------+--------+---------+----------+----------+--------+
|ticket_id|alarm   |started_ts         |ended_ts|solved_by|src_system|site_visit|priority|
+---------+--------+-------------------+--------+---------+----------+----------+--------+
|T_0000019|50010378|2018-01-04 06:22:13|null    |team28   |OP2       |0         |3       |
|T_0000020|50037567|2018-12-13 05:30:25|null    |team56   |OP1       |1         |3       |
|T_0000036|50038539|2018-11-24 19:15:01|null    |team48   |OP2       |0         |1       |
|T_0000038|50009171|2018-02-19 02:56:04|null    |team68   |OP2       |1         |3       |
|T_0000049|50028391|2018-09-15 03:52:11|null    |team28   |OP2       |0         |1       |
|T_0000062|50046516|2018-04-12 21:23:58|null    |team48   |OP2       |0         |2       |
|T_0000063|50019490|2018-08-10 00:10:55|null    |

## Tickets with no null

In [18]:
path_to_csv = "mc/tickets.dat"
date_format = "yyyy-MM-dd"

tickets_cleaned = spark.read\
    .option("compression", "none")\
    .option("header", True)\
    .option("sep", '|')\
    .option("nullValue", np.nan)\
    .option("inferSchema", True)\
    .csv(path_to_csv)\
    .where(col("ended_ts").isNotNull())

num_tickets_no_null = tickets_cleaned.count()
print("Number of tickets with no null {}".format(num_tickets_no_null))
tickets_cleaned.show(5, truncate=False)

                                                                                

Number of tickets with no null 27026
+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
|ticket_id|alarm   |started_ts         |ended_ts           |solved_by|src_system|site_visit|priority|
+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
|T_0000001|50025222|2018-09-14 03:29:10|14-09-2018 04.06.13|team78   |OP2       |1         |3       |
|T_0000002|50021238|2018-10-29 03:39:49|29/10/2018 04:25:25|team30   |OP1       |0         |2       |
|T_0000003|50034089|2018-03-31 07:25:48|31/03/2018 07:58:45|team68   |OP1       |0         |3       |
|T_0000004|50021918|2018-08-08 19:57:56|08/08/2018 20:42:40|team56   |OP1       |0         |2       |
|T_0000005|50046096|2018-10-10 17:47:00|10/10/2018 18:31:05|team34   |OP1       |0         |3       |
+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
only showing top 5 rows



In [19]:
assert num_tickets_total == (num_tickets_no_null + sum(num_tickets_with_null.values()))

## Clean the ended_ts date

The format of the timestamp in ended_ts is not consistent. Some have ```14-09-2018``` and others have ```29/10/2018```. Make the timeformat consistent with ```dd-MM-yyyy HH:mm:ss```.

In [20]:
tickets = tickets_cleaned\
    .withColumn("ended_ts", regexp_replace("ended_ts", "/", "-"))\
    .withColumn("ended_ts", regexp_replace("ended_ts", "\.", ":"))\
    .withColumn("ended_ts", to_timestamp(col('ended_ts'), "dd-MM-yyyy HH:mm:ss"))\

tickets.printSchema()
tickets.show(5, truncate=False)

root
 |-- ticket_id: string (nullable = true)
 |-- alarm: integer (nullable = true)
 |-- started_ts: string (nullable = true)
 |-- ended_ts: timestamp (nullable = true)
 |-- solved_by: string (nullable = true)
 |-- src_system: string (nullable = true)
 |-- site_visit: integer (nullable = true)
 |-- priority: integer (nullable = true)



[Stage 32:>                                                         (0 + 1) / 1]

+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
|ticket_id|alarm   |started_ts         |ended_ts           |solved_by|src_system|site_visit|priority|
+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
|T_0000001|50025222|2018-09-14 03:29:10|2018-09-14 04:06:13|team78   |OP2       |1         |3       |
|T_0000002|50021238|2018-10-29 03:39:49|2018-10-29 04:25:25|team30   |OP1       |0         |2       |
|T_0000003|50034089|2018-03-31 07:25:48|2018-03-31 07:58:45|team68   |OP1       |0         |3       |
|T_0000004|50021918|2018-08-08 19:57:56|2018-08-08 20:42:40|team56   |OP1       |0         |2       |
|T_0000005|50046096|2018-10-10 17:47:00|2018-10-10 18:31:05|team34   |OP1       |0         |3       |
+---------+--------+-------------------+-------------------+---------+----------+----------+--------+
only showing top 5 rows



                                                                                

## Extract year/month

In [21]:
tickets = tickets_cleaned\
    .withColumnRenamed("solved_by", "team")\
    .withColumn("ended_ts", regexp_replace("ended_ts", "/", "-"))\
    .withColumn("ended_ts", regexp_replace("ended_ts", "\.", ":"))\
    .withColumn("ended_ts", to_timestamp(col('ended_ts'), "dd-MM-yyyy HH:mm:ss"))\
    .withColumn("year", year(col("ended_ts")))\
    .withColumn("month", month(col("ended_ts")))

tickets.printSchema()
tickets.show(5, truncate=False)
tickets.cache()
tickets.createOrReplaceTempView("tickets")

root
 |-- ticket_id: string (nullable = true)
 |-- alarm: integer (nullable = true)
 |-- started_ts: string (nullable = true)
 |-- ended_ts: timestamp (nullable = true)
 |-- team: string (nullable = true)
 |-- src_system: string (nullable = true)
 |-- site_visit: integer (nullable = true)
 |-- priority: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



[Stage 33:>                                                         (0 + 1) / 1]

+---------+--------+-------------------+-------------------+------+----------+----------+--------+----+-----+
|ticket_id|alarm   |started_ts         |ended_ts           |team  |src_system|site_visit|priority|year|month|
+---------+--------+-------------------+-------------------+------+----------+----------+--------+----+-----+
|T_0000001|50025222|2018-09-14 03:29:10|2018-09-14 04:06:13|team78|OP2       |1         |3       |2018|9    |
|T_0000002|50021238|2018-10-29 03:39:49|2018-10-29 04:25:25|team30|OP1       |0         |2       |2018|10   |
|T_0000003|50034089|2018-03-31 07:25:48|2018-03-31 07:58:45|team68|OP1       |0         |3       |2018|3    |
|T_0000004|50021918|2018-08-08 19:57:56|2018-08-08 20:42:40|team56|OP1       |0         |2       |2018|8    |
|T_0000005|50046096|2018-10-10 17:47:00|2018-10-10 18:31:05|team34|OP1       |0         |3       |2018|10   |
+---------+--------+-------------------+-------------------+------+----------+----------+--------+----+-----+
only showi

                                                                                

# Alarms


In [22]:
preview = spark.read\
    .option("compression", "none")\
    .option("inferSchema", True)\
    .json("mc/alarms.json")

preview.printSchema()
preview.show(truncate=False)
del preview

                                                                                

root
 |-- alarm_id: long (nullable = true)
 |-- alarm_src: struct (nullable = true)
 |    |-- MeContext: string (nullable = true)
 |    |-- Network: string (nullable = true)
 |    |-- SubNetwork: string (nullable = true)
 |-- event_end_ts: long (nullable = true)
 |-- event_start_ts: long (nullable = true)
 |-- issue_type: string (nullable = true)



[Stage 35:>                                                         (0 + 1) / 1]

+--------+-----------------------------------+-------------+--------------+----------------+
|alarm_id|alarm_src                          |event_end_ts |event_start_ts|issue_type      |
+--------+-----------------------------------+-------------+--------------+----------------+
|50000001|{LON75UTM3265, UK_TEL_LON, LON75}  |1525685613000|1525683343000 |                |
|50000002|{LON47UTM3159, UK_TEL_LON, LON47}  |1533618469000|1533610843000 |                |
|50000003|{LON75GSM2884, UK_TEL_LON, LON75}  |1541645693000|1541637429000 |3g_failure      |
|50000004|{LON187GSM2391, UK_TEL_LON, LON187}|1525604682000|1525599471000 |power_failure   |
|50000005|{LON111LTE4170, UK_TEL_LON, LON111}|1541342376000|1541335278000 |power_failure   |
|50000006|{LON75GSM2884, UK_TEL_LON, LON75}  |1533844097000|1533841992000 |                |
|50000007|{LON33GSM2601, UK_TEL_LON, LON33}  |1530752968000|1530743892000 |speed_issues    |
|50000008|{LON187LTE4202, UK_TEL_LON, LON187}|1532234526000|1532227123

                                                                                

---
# Question 1

Produce a table showing the nubmer of tickets each team solved.

* team: Team name
* tickets_priority_1_count: Number of priority 1 tickets the team solved
* tickets_priority_2_count
* tickets_priority_3_count

## Sample format

In [23]:
# Example output schema (using generated mock data)
import pandas as pd
import numpy as np
teams = ['team78', 'team30', 'team68', 'team56', 'team34', 'team48',
       'team28', 'team33', 'team90']
example_output_q3 = pd.DataFrame(
    {'team': np.random.choice(teams, 24),
     'tickets_priority_1_count': np.random.randint(0,100,24),
     'tickets_priorityb_2_count': np.random.randint(0,100,24),
     'tickets_priority_3_count': np.random.randint(0,100,24)}
)
example_output_q3.head()

Unnamed: 0,team,tickets_priority_1_count,tickets_priorityb_2_count,tickets_priority_3_count
0,team90,40,25,34
1,team90,18,51,24
2,team30,82,5,45
3,team33,97,40,0
4,team28,93,52,36


## Solution

First, create the long format (stacked) where (attribute,value) = (priority, num_tickets). 

In [24]:
query = """
WITH team_priority_tickets AS (
    SELECT
        team,
        priority as priority,
        count(*) AS num_tickets
    FROM
        tickets
    GROUP BY
        team,
        priority
    ORDER BY
        team
)
SELECT
    *
FROM
    team_priority_tickets
"""
spark.sql(query).show(truncate=False)



+------+--------+-----------+
|team  |priority|num_tickets|
+------+--------+-----------+
|team28|1       |386        |
|team28|3       |1620       |
|team28|2       |964        |
|team30|3       |1674       |
|team30|1       |318        |
|team30|2       |1021       |
|team33|1       |326        |
|team33|2       |1033       |
|team33|3       |1605       |
|team34|2       |963        |
|team34|1       |355        |
|team34|3       |1578       |
|team48|1       |365        |
|team48|3       |1696       |
|team48|2       |981        |
|team56|1       |323        |
|team56|2       |1019       |
|team56|3       |1696       |
|team68|2       |1067       |
|team68|1       |287        |
+------+--------+-----------+
only showing top 20 rows



                                                                                

Then PIVOT the (attribute, value) to Wide Format (Unstacked)

In [25]:
query = """
WITH team_priority_tickets AS (
    SELECT
        team,
        priority as priority,
        count(*) AS num_tickets
    FROM
        tickets
    GROUP BY
        team,
        priority
    ORDER BY
        team
)
SELECT
    *
FROM
    team_priority_tickets
PIVOT (
    SUM(num_tickets)
    FOR priority IN (
        1 AS tickets_priority_1_count,
        2 AS tickets_priority_2_count,
        3 AS tickets_priority_3_count
    )
)
ORDER BY
    team
"""
spark.sql(query).show(truncate=False)



+------+------------------------+------------------------+------------------------+
|team  |tickets_priority_1_count|tickets_priority_2_count|tickets_priority_3_count|
+------+------------------------+------------------------+------------------------+
|team28|386                     |964                     |1620                    |
|team30|318                     |1021                    |1674                    |
|team33|326                     |1033                    |1605                    |
|team34|355                     |963                     |1578                    |
|team48|365                     |981                     |1696                    |
|team56|323                     |1019                    |1696                    |
|team68|287                     |1067                    |1677                    |
|team78|353                     |1023                    |1711                    |
|team90|325                     |999                     |1661              

                                                                                

---
# Question 2

Create a dataframe with the columns:

- year
- month
- team
- team size
- tickets solved in the current month
- tickets solved in the previous year

## Sample format

In [26]:
# Example output schema (using generated mock data)
import pandas as pd
import numpy as np
teams = ['team78', 'team30', 'team68', 'team56', 'team34', 'team48',
       'team28', 'team33', 'team90']
example_output_q4 = pd.DataFrame(
    {'year': [2018]*12 + [2019]*12,
     'month': list(range(1,13))*2,
     'team': np.random.choice(teams, 24),
     'team_size': np.random.randint(1,6,24),
     'tickets_solved_in_the_current_month': np.random.randint(0,100,24),
     'tickets_solved_in_the_previous_year': np.random.randint(0,100,24)}
)
example_output_q4.head()

Unnamed: 0,year,month,team,team_size,tickets_solved_in_the_current_month,tickets_solved_in_the_previous_year
0,2018,1,team34,2,71,38
1,2018,2,team56,3,57,60
2,2018,3,team30,2,62,30
3,2018,4,team30,5,6,40
4,2018,5,team33,4,14,56


## Solution

First, extract year and month as integers.

In [27]:
query = """
WITH year_month_team AS (
    SELECT
        year,
        month,
        team,
        count(*) AS tickets_solved_in_the_current_month
    FROM
        tickets
    GROUP BY
        year,
        month,
        team        
    ORDER BY
        year,
        month,
        team        
),
year_team AS (
    SELECT
        year,
        team,
        count(*) AS tickets_solved_in_the_year
    FROM
        tickets
    GROUP BY
        year,
        team        
    ORDER BY
        year,
        team       
)
SELECT DISTINCT
    o.year,
    o.month,
    o.team,
    t.team_size,
    o.tickets_solved_in_the_current_month,
    i.tickets_solved_in_the_year AS tickets_solved_in_the_previous_year
FROM
    year_month_team o
    INNER JOIN team t
        ON o.team = t.team
    LEFT OUTER JOIN year_team i 
        ON i.team = o.team AND i.year = (o.year - 1)
ORDER BY
    year desc,
    month,
    team
"""
spark.sql(query).show(truncate=False)



+----+-----+------+---------+-----------------------------------+-----------------------------------+
|year|month|team  |team_size|tickets_solved_in_the_current_month|tickets_solved_in_the_previous_year|
+----+-----+------+---------+-----------------------------------+-----------------------------------+
|2019|1    |team34|3        |1                                  |2895                               |
|2019|1    |team48|6        |1                                  |3041                               |
|2019|1    |team56|5        |2                                  |3035                               |
|2019|1    |team78|5        |1                                  |3086                               |
|2018|1    |team28|6        |253                                |null                               |
|2018|1    |team30|6        |266                                |null                               |
|2018|1    |team33|6        |262                                |null             



# Question 3

Create a function that accepts alarms records (part) and returns the alarm network type.

In [28]:
alarms = spark\
    .read\
    .option("inferSchema", True)\
    .json("mc/alarms.json")\
    .select(
        "alarm_id",
        col("alarm_src.MeContext").alias("part"),    
        col("alarm_src.Network").alias("network"),    
        col("alarm_src.SubNetwork").alias("subnet"),    
        col("event_start_ts"),    
        col("event_end_ts"),    
        to_date(
            from_unixtime(col("event_start_ts") / 1000),
            "yyyy-MM-dd"
        ).alias("start_date"),
        to_date(
            from_unixtime(col("event_end_ts") / 1000),
            "yyyy-MM-dd"
        ).alias("end_date"),
        col("issue_type"),    
    )
alarms.printSchema()
alarms.show(truncate=False)
alarms.cache()
alarms.createOrReplaceTempView("alarms")

                                                                                

root
 |-- alarm_id: long (nullable = true)
 |-- part: string (nullable = true)
 |-- network: string (nullable = true)
 |-- subnet: string (nullable = true)
 |-- event_start_ts: long (nullable = true)
 |-- event_end_ts: long (nullable = true)
 |-- start_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- issue_type: string (nullable = true)



[Stage 49:>                                                         (0 + 1) / 1]                                                                                

+--------+-------------+----------+------+--------------+-------------+----------+----------+----------------+
|alarm_id|part         |network   |subnet|event_start_ts|event_end_ts |start_date|end_date  |issue_type      |
+--------+-------------+----------+------+--------------+-------------+----------+----------+----------------+
|50000001|LON75UTM3265 |UK_TEL_LON|LON75 |1525683343000 |1525685613000|2018-05-07|2018-05-07|                |
|50000002|LON47UTM3159 |UK_TEL_LON|LON47 |1533610843000 |1533618469000|2018-08-07|2018-08-07|                |
|50000003|LON75GSM2884 |UK_TEL_LON|LON75 |1541637429000 |1541645693000|2018-11-08|2018-11-08|3g_failure      |
|50000004|LON187GSM2391|UK_TEL_LON|LON187|1525599471000 |1525604682000|2018-05-06|2018-05-06|power_failure   |
|50000005|LON111LTE4170|UK_TEL_LON|LON111|1541335278000 |1541342376000|2018-11-04|2018-11-05|power_failure   |
|50000006|LON75GSM2884 |UK_TEL_LON|LON75 |1533841992000 |1533844097000|2018-08-10|2018-08-10|                |
|

### UDF to extract the network type

In [29]:
@udf(returnType=StringType())
def get_networ_type(part: str) -> str:
    """Get the Network Type for the tower
    Args:
        part: Part of the tower having the problem.
    Returns:
        Network Type which is either LTE or UTM, or GSM.
    """
    if (part is not None) and (part != np.nan) and isinstance(part, str):
        part = part.lower()
        for kind in ["gsm", "lte", "utm"]:
            if kind in part:
                return kind.upper()

    return None

In [30]:
alarms.select(
    "alarm_id",
    get_networ_type("part").alias("network_type")
).where(col("network_type").isNotNull()).show()

[Stage 50:>                                                         (0 + 1) / 1]

+--------+------------+
|alarm_id|network_type|
+--------+------------+
|50000001|         UTM|
|50000002|         UTM|
|50000003|         GSM|
|50000004|         GSM|
|50000005|         LTE|
|50000006|         GSM|
|50000007|         GSM|
|50000008|         LTE|
|50000009|         UTM|
|50000010|         GSM|
|50000011|         GSM|
|50000012|         LTE|
|50000013|         UTM|
|50000014|         UTM|
|50000015|         GSM|
|50000016|         LTE|
|50000017|         UTM|
|50000018|         UTM|
|50000019|         UTM|
|50000020|         UTM|
+--------+------------+
only showing top 20 rows



                                                                                

In [31]:
alarms.withColumn("network_type", get_networ_type("part")).show()

+--------+-------------+----------+------+--------------+-------------+----------+----------+----------------+------------+
|alarm_id|         part|   network|subnet|event_start_ts| event_end_ts|start_date|  end_date|      issue_type|network_type|
+--------+-------------+----------+------+--------------+-------------+----------+----------+----------------+------------+
|50000001| LON75UTM3265|UK_TEL_LON| LON75| 1525683343000|1525685613000|2018-05-07|2018-05-07|                |         UTM|
|50000002| LON47UTM3159|UK_TEL_LON| LON47| 1533610843000|1533618469000|2018-08-07|2018-08-07|                |         UTM|
|50000003| LON75GSM2884|UK_TEL_LON| LON75| 1541637429000|1541645693000|2018-11-08|2018-11-08|      3g_failure|         GSM|
|50000004|LON187GSM2391|UK_TEL_LON|LON187| 1525599471000|1525604682000|2018-05-06|2018-05-06|   power_failure|         GSM|
|50000005|LON111LTE4170|UK_TEL_LON|LON111| 1541335278000|1541342376000|2018-11-04|2018-11-05|   power_failure|         LTE|
|5000000

---
# Stop Spark Session

In [32]:
spark.stop()



# Cleanup

In [33]:
del spark
gc.collect()

38