In [1]:
import os
import re

import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import *
from pyspark.sql.functions import struct
from pyspark.sql.functions import monotonically_increasing_id
import pyspark.sql.functions as f

# Start without a Spark cluster: reuse the active session or create a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [2]:
# Connection to MinIO through the Hadoop s3a connector.
# Credentials and endpoint are taken from the environment (MINIO_ACCESS_KEY,
# MINIO_SECRET_KEY, MINIO_ENDPOINT) so that real secrets do not have to be
# committed with the notebook. The fallbacks are the values this cell used
# to hard-code, so an unchanged environment behaves exactly as before.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio123")
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio1:9000")

spark.sparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

# Fetch the Hadoop configuration once instead of once per setting.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
s3a_settings = {
    "fs.s3a.access.key": minio_access_key,
    "fs.s3a.secret.key": minio_secret_key,
    "com.amazonaws.services.s3.enableV4": "true",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.endpoint": minio_endpoint,
}
for setting_name, setting_value in s3a_settings.items():
    hadoop_conf.set(setting_name, setting_value)

In [3]:
# All four CSV columns are read as nullable strings.
column_names = [
    "Top level domain",
    "MX-Record from the name server",
    "A-Record of the specific domain",
    "Timestamp",
]
schema = StructType([StructField(name, StringType(), True) for name in column_names])

# The file has no header row and uses ";" as the field separator.
df = spark.read.csv("s3a://bucket/real_new.csv", schema=schema, sep=";", header=False)

### Datenbeschreibung / Data Cleaning

#### Allgemein

In [4]:
# Preview the first five rows of the loaded data.
df.show(5)

+----------------+------------------------------+-------------------------------+-------------------+
|Top level domain|MX-Record from the name server|A-Record of the specific domain|          Timestamp|
+----------------+------------------------------+-------------------------------+-------------------+
|         0--1.de|          "[""mail.0--1.de"...|           "[""46.38.249.145...|2020-12-13 15:36:05|
|         0--2.de|          "[""mxf993.netcup...|           "[""212.227.212.1...|2020-12-13 15:36:05|
|    0-0-0-0-0.de|          "[""smtp-02.tld.t...|           "[""80.150.6.143""]"|2020-12-13 15:36:05|
|      0-0-0-1.de|          "[""smtp-02.tld.t...|           "[""80.150.6.143""]"|2020-12-13 15:36:05|
|          0-0.de|                            []|           "[""185.53.178.13...|2020-12-13 15:36:05|
+----------------+------------------------------+-------------------------------+-------------------+
only showing top 5 rows



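* Top level domain: trotz des Spaltennamens steht hier der Domainname (z. B. `0--1.de`), also das, was Personen sich merken können – nicht der Nameserver; die eigentliche Top-Level-Domain ist in den gezeigten Beispielen `.de`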
* Ein MX Resource Record weist einem Namen einen Mailserver zu. Er stellt eine Besonderheit dar, da er sich auf einen speziellen Dienst im Internet, nämlich die E-Mailzustellung mittels SMTP, bezieht. Alle anderen Dienste nutzen CNAME, A und AAAA Resource Records für die Namensauflösung.
* Ein A Resource Record weist einem Namen eine IPv4-Adresse zu.

In [5]:
# Add a generated "id" column (monotonically increasing, unique values).
# Note: the select in the following cell does not list "id", so the column
# is gone again after that cell.
df = df.withColumn("id", f.monotonically_increasing_id())

In [6]:
# Keep only the three record columns; every column not listed here
# (Timestamp and the previously added "id") is dropped.
kept_columns = [
    "Top level domain",
    "MX-Record from the name server",
    "A-Record of the specific domain",
]
df = df.select(*kept_columns)

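* die Spalte Timestamp wird nicht benötigt; da nur die drei Record-Spalten ausgewählt werden, entfällt dabei auch die zuvor angelegte Spalte `id`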

In [7]:
# Show the Python type of the loaded object.
type(df)

pyspark.sql.dataframe.DataFrame

In [8]:
# Total number of rows in the data set.
df.count()

4860885

In [9]:
# Print column names, types and nullability after the column selection.
df.printSchema()

root
 |-- Top level domain: string (nullable = true)
 |-- MX-Record from the name server: string (nullable = true)
 |-- A-Record of the specific domain: string (nullable = true)



#### Top Level Domain

In [10]:
# Count rows with a missing domain. Two representations are checked:
#   * a real SQL NULL (e.g. an empty CSV field) - a plain `== "null"`
#     comparison evaluates to NULL for these rows and silently skips them;
#   * the literal string "null" as written in the source file.
domain_col = df["Top level domain"]
df.filter(domain_col.isNull() | (domain_col == "null")).count()

0

* keine Null Values

In [11]:
# Occurrences per domain, most frequent first; a top count of 1 means
# that no domain appears more than once.
dfTCount = df.groupBy("Top level domain").count()
dfTCount.sort(f.desc("count")).show(n=5)

+----------------+-----+
|Top level domain|count|
+----------------+-----+
|     007kaess.de|    1|
|        01307.de|    1|
|        01800.de|    1|
|     030pizza.de|    1|
|        04179.de|    1|
+----------------+-----+
only showing top 5 rows



* es gibt keine doppelte TLD: selbst die häufigsten Einträge kommen jeweils nur einmal vor

#### MX Records

In [12]:
# Count rows with a missing MX record. Both a real SQL NULL (which a plain
# `== "null"` comparison would silently skip) and the literal string "null"
# are treated as missing.
mx_col = df["MX-Record from the name server"]
df.filter(mx_col.isNull() | (mx_col == "null")).count()

0

* keine Null Values

In [14]:
# Occurrences per distinct MX-record value, most frequent first.
dfMxCount = df.groupBy("MX-Record from the name server").count()
dfMxCount.sort(f.desc("count")).show(n=5)

+------------------------------+------+
|MX-Record from the name server| count|
+------------------------------+------+
|                            []|633040|
|          "[""smtpin.rzone....|509127|
|             "[""localhost""]"|303839|
|          "[""mx01.kundense...|176491|
|          "[""mx00.kundense...|176182|
+------------------------------+------+
only showing top 5 rows



#### A Records

In [15]:
# Count rows with a missing A record. Both a real SQL NULL (which a plain
# `== "null"` comparison would silently skip) and the literal string "null"
# are treated as missing.
a_col = df["A-Record of the specific domain"]
df.filter(a_col.isNull() | (a_col == "null")).count()

10732

* es gibt 10.732 Null Values
* diese können mit [] ersetzt werden

In [16]:
# Normalise missing A records to the empty-list marker "[]".
# The replacement is limited to the A-Record column via `subset`, so a value
# "null" in any other string column is left untouched. `na.replace` matches
# whole cell values only (unlike a regex replace, which would also rewrite
# "null" inside longer strings). Real SQL NULLs are filled with "[]" as well.
A_RECORD_COLUMN = "A-Record of the specific domain"
df = (
    df.na.replace("null", "[]", subset=[A_RECORD_COLUMN])
    .na.fill("[]", subset=[A_RECORD_COLUMN])
)

In [17]:
# Occurrences per distinct A-record value, most frequent first.
dfACount = df.groupBy("A-Record of the specific domain").count()
dfACount.sort(f.desc("count")).show(n=5)

+-------------------------------+------+
|A-Record of the specific domain| count|
+-------------------------------+------+
|           "[""91.195.241.13...|303204|
|                             []|172019|
|           "[""80.150.6.143""]"|150156|
|           "[""23.236.62.147...| 59034|
|            "[""52.58.78.16""]"| 49200|
+-------------------------------+------+
only showing top 5 rows



### Entfernen von Sonderzeichen im Datensatz

#### MX Record

In [None]:
# Strip every double quote from the raw MX-record string and store the result
# in a new column "MX-Record".
# The source column is "MX-Record from the name server"; `df` has no column
# named "MX-Record", so reading from that name fails. The built-in
# regexp_replace is used instead of a Python UDF: it passes NULL through
# (re.sub raises on None) and does not depend on the `re` module.
df2 = df.withColumn(
    "MX-Record",
    f.regexp_replace(df["MX-Record from the name server"], '"', ""),
)
df2.show(3, False)

In [None]:
# Strip every closing bracket "]" from the cleaned "MX-Record" column of df2.
# The built-in regexp_replace replaces the Python UDF: it passes NULL through
# (re.sub raises on None) and does not depend on the `re` module. The bracket
# is escaped so that it is always treated as a literal character.
df3 = df2.withColumn("MX-Record", f.regexp_replace(df2["MX-Record"], r"\]", ""))
df3.show(3, False)

In [None]:
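# Removing "[" still raises an error: in a regular expression "[" opens a character class, so it has to be escaped (e.g. re.sub(r'\[', '', x)).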

# commaRep3 = udf(lambda x: re.sub('[','', x))
# df3=df2.withColumn('MX-Record',commaRep3('MX-Record'))
# df3.show(3, False)

#### A Record

In [None]:
# Strip every double quote from the raw A-record string and store the result
# in a new column "A-Record".
# The source column is "A-Record of the specific domain"; `df` has no column
# named "A-Record", which is why the earlier version of this cell raised an
# error. The built-in regexp_replace is used instead of a Python UDF: it
# passes NULL through and does not depend on the `re` module.
df5 = df.withColumn(
    "A-Record",
    f.regexp_replace(df["A-Record of the specific domain"], '"', ""),
)
df5.show(3, False)

 ### Splitten der Spalten

#### MX Record

In [None]:
# Split the raw MX-record string on the separator between two quoted entries
# and expose the first three parts as separate columns.
# The column is addressed by its real name; `df` has no column "MX-Record".
split_cols = f.split(df["MX-Record from the name server"], '"",""')
df11 = (
    df.withColumn('mx-record1', split_cols.getItem(0))
    .withColumn('mx-record2', split_cols.getItem(1))
    .withColumn('mx-record3', split_cols.getItem(2))
)
df11.show(3, False)

#### A Record

In [None]:
# Split the raw A-record string on commas. The column is addressed by its
# real name; `df` has no column "A-Record".
# Note: this rebinds `split_cols`, replacing the MX-record split expression.
split_cols = f.split(df["A-Record of the specific domain"], ',')

In [None]:
# Expose the first seven parts of the split A-record string as the columns
# a-record1 ... a-record7, appended after the existing columns.
a_record_parts = [
    split_cols.getItem(position).alias("a-record" + str(position + 1))
    for position in range(7)
]
df10 = df.select("*", *a_record_parts)

df10.show(20)

In [None]:
# Check whether there are rows with up to three A records: yes.
df10.select(df10['a-record3']).show(n=200)

In [None]:
# Check whether there are rows with up to four A records: none within the
# first 200 rows.
df10.select(df10['a-record4']).show(n=200)

In [None]:
# What is the maximum number of IPs in the A-Record column?
# TODO: not implemented yet

