In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import *
import pyspark
from pyspark.sql.functions import struct
from pyspark.sql.functions import monotonically_increasing_id
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
import re
#import geoip2.database

#Starten ohne Spark-Cluster
spark = SparkSession.builder.getOrCreate()

In [None]:
# Verbindung zu Minio
spark.sparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", "minio")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "minio123")
spark.sparkContext._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://minio1:9000")

In [2]:
schema = StructType([
    StructField("Top level domain", StringType(), True),
    StructField("MX-Record from the name server", StringType(), True),
    StructField("A-Record of the specific domain", StringType(), True),
    StructField("Timestamp", StringType(), True)])

df = spark.read.csv("s3a://bucket/real_new.csv",header=False,sep=";",schema=schema)

# Datenbeschreibung

## real_domains

### Allgemein

In [3]:
df.show(5)

+----------------+------------------------------+-------------------------------+-------------------+
|Top level domain|MX-Record from the name server|A-Record of the specific domain|          Timestamp|
+----------------+------------------------------+-------------------------------+-------------------+
|         0--1.de|          "[""mail.0--1.de"...|           "[""46.38.249.145...|2020-12-13 15:36:05|
|         0--2.de|          "[""mxf993.netcup...|           "[""212.227.212.1...|2020-12-13 15:36:05|
|    0-0-0-0-0.de|          "[""smtp-02.tld.t...|           "[""80.150.6.143""]"|2020-12-13 15:36:05|
|      0-0-0-1.de|          "[""smtp-02.tld.t...|           "[""80.150.6.143""]"|2020-12-13 15:36:05|
|          0-0.de|                            []|           "[""185.53.178.13...|2020-12-13 15:36:05|
+----------------+------------------------------+-------------------------------+-------------------+
only showing top 5 rows



* Top level domain: Namensserver; das, was Personen sich merken können
* Ein MX Resource Record weist einem Namen einen Mailserver zu. Er stellt eine Besonderheit dar, da er sich auf einen speziellen Dienst im Internet, nämlich die E-Mailzustellung mittels SMTP, bezieht. Alle anderen Dienste nutzen CNAME, A und AAAA Resource Records für die Namensauflösung.
* Ein A Resource Record weist einem Namen eine IPv4-Adresse zu.

In [4]:
df= df.withColumn("id", monotonically_increasing_id())

In [5]:
df=df.select("Top level domain","MX-Record from the name server","A-Record of the specific domain")

* die Spalte Timestamp wird nicht benötigt

In [6]:
df=df.withColumnRenamed("MX-Record from the name server", "MX-Record")

In [7]:
df=df.withColumnRenamed("A-Record of the specific domain", "A-Record")

In [8]:
type(df)

pyspark.sql.dataframe.DataFrame

In [9]:
df.count()

4860885

In [10]:
df.dropDuplicates().count()

4860885

* es gibt keine Duplikate in dem Datensatz

In [None]:
#df.filter(df["MX-Record"].contains(']')).count()

* es gibt 3 Zeilen, wo die Aufzeichnung von MX Records abgebrochen wurde

In [None]:
#df=df.filter(df["MX-Record"].contains(']'))

* diese Zeilen werden entfernt, da sie sich nicht reparieren lassen

In [9]:
df.printSchema()

root
 |-- Top level domain: string (nullable = true)
 |-- MX-Record: string (nullable = true)
 |-- A-Record: string (nullable = true)



### Top Level Domain

In [12]:
df.filter(df["Top level domain"]=="null").count()

0

* keine Null Values

In [13]:
dfTCount=df.groupBy("Top level domain").count()
dfTCount.orderBy(col("count").desc()).show(5)

+----------------+-----+
|Top level domain|count|
+----------------+-----+
|         000o.de|    1|
| 015119155752.de|    1|
|       020618.de|    1|
|    030berlin.de|    1|
|        07980.de|    1|
+----------------+-----+
only showing top 5 rows



* es gibt keine doppelte TLD

### MX Records

In [14]:
df.filter(df["MX-Record"]=="null").count()

0

* keine Null Values

In [15]:
dfMxCount=df.groupBy("MX-Record").count()
dfMxCount.orderBy(col("count").desc()).show(5,False)

+-----------------------------------------------------+------+
|MX-Record                                            |count |
+-----------------------------------------------------+------+
|[]                                                   |633040|
|"[""smtpin.rzone.de""]"                              |509127|
|"[""localhost""]"                                    |303839|
|"[""mx01.kundenserver.de"",""mx00.kundenserver.de""]"|176491|
|"[""mx00.kundenserver.de"",""mx01.kundenserver.de""]"|176182|
+-----------------------------------------------------+------+
only showing top 5 rows



### A Records

In [16]:
df.filter(df["A-Record"]=="null").count()

10732

* es gibt 10.732 Null Values
* diese können mit [] ersetzt werde

In [10]:
df=df.na.replace({"null": "[]"})

In [18]:
dfACount=df.groupBy("A-Record").count()
dfACount.orderBy(col("count").desc()).show(5, False)

+----------------------+------+
|A-Record              |count |
+----------------------+------+
|"[""91.195.241.137""]"|303204|
|[]                    |172019|
|"[""80.150.6.143""]"  |150156|
|"[""23.236.62.147""]" |59034 |
|"[""52.58.78.16""]"   |49200 |
+----------------------+------+
only showing top 5 rows



## asn ip4 Blocks

### Allgemein

In [11]:
schema_blocks = StructType([
    StructField("network", StringType(), True),
    StructField("autonomous_system_number", IntegerType(), True),
    StructField("autonomous_system_organization", StringType(), True)
])

df_asn_blocks_ipv4 = spark.read.csv("s3a://bucket/GeoLite2-ASN-Blocks-IPv4.csv", header=True, schema=schema_blocks).cache()

In [12]:
df_asn_blocks_ipv4.show(5)

+------------+------------------------+------------------------------+
|     network|autonomous_system_number|autonomous_system_organization|
+------------+------------------------+------------------------------+
|  1.0.0.0/24|                   13335|                 CLOUDFLARENET|
|  1.0.4.0/22|                   38803|          Wirefreebroadband...|
| 1.0.64.0/18|                   18144|          Energia Communica...|
|1.0.128.0/17|                   23969|          TOT Public Compan...|
|  1.1.1.0/24|                   13335|                 CLOUDFLARENET|
+------------+------------------------+------------------------------+
only showing top 5 rows



In [103]:
df_asn_blocks_ipv4_2 = df_asn_blocks_ipv4.select(col("network")).collect()

In [104]:
df_asn_blocks_ipv4_2

[Row(network='1.0.0.0/24'),
 Row(network='1.0.4.0/22'),
 Row(network='1.0.64.0/18'),
 Row(network='1.0.128.0/17'),
 Row(network='1.1.1.0/24'),
 Row(network='1.1.8.0/24'),
 Row(network='1.1.64.0/19'),
 Row(network='1.1.96.0/20'),
 Row(network='1.1.112.0/21'),
 Row(network='1.1.120.0/22'),
 Row(network='1.1.124.0/23'),
 Row(network='1.1.126.0/24'),
 Row(network='1.1.128.0/17'),
 Row(network='1.2.4.0/24'),
 Row(network='1.2.128.0/17'),
 Row(network='1.4.128.0/17'),
 Row(network='1.5.0.0/16'),
 Row(network='1.6.0.0/17'),
 Row(network='1.6.128.0/21'),
 Row(network='1.6.136.0/24'),
 Row(network='1.6.137.0/24'),
 Row(network='1.6.138.0/23'),
 Row(network='1.6.140.0/22'),
 Row(network='1.6.144.0/20'),
 Row(network='1.6.160.0/19'),
 Row(network='1.6.192.0/20'),
 Row(network='1.6.208.0/21'),
 Row(network='1.6.216.0/23'),
 Row(network='1.6.218.0/24'),
 Row(network='1.6.219.0/24'),
 Row(network='1.6.220.0/22'),
 Row(network='1.6.224.0/23'),
 Row(network='1.6.226.0/23'),
 Row(network='1.6.228.0/24'

In [105]:
df_asn_blocks_ipv4_2_array = [str(row.network) for row in df_asn_blocks_ipv4_2]

In [106]:
df_asn_blocks_ipv4_2_array

['1.0.0.0/24',
 '1.0.4.0/22',
 '1.0.64.0/18',
 '1.0.128.0/17',
 '1.1.1.0/24',
 '1.1.8.0/24',
 '1.1.64.0/19',
 '1.1.96.0/20',
 '1.1.112.0/21',
 '1.1.120.0/22',
 '1.1.124.0/23',
 '1.1.126.0/24',
 '1.1.128.0/17',
 '1.2.4.0/24',
 '1.2.128.0/17',
 '1.4.128.0/17',
 '1.5.0.0/16',
 '1.6.0.0/17',
 '1.6.128.0/21',
 '1.6.136.0/24',
 '1.6.137.0/24',
 '1.6.138.0/23',
 '1.6.140.0/22',
 '1.6.144.0/20',
 '1.6.160.0/19',
 '1.6.192.0/20',
 '1.6.208.0/21',
 '1.6.216.0/23',
 '1.6.218.0/24',
 '1.6.219.0/24',
 '1.6.220.0/22',
 '1.6.224.0/23',
 '1.6.226.0/23',
 '1.6.228.0/24',
 '1.6.229.0/24',
 '1.6.230.0/24',
 '1.6.231.0/24',
 '1.6.232.0/21',
 '1.6.240.0/20',
 '1.7.128.0/21',
 '1.7.136.0/22',
 '1.7.140.0/23',
 '1.7.142.0/24',
 '1.7.143.0/24',
 '1.7.144.0/22',
 '1.7.148.0/23',
 '1.7.150.0/24',
 '1.7.151.0/24',
 '1.7.152.0/21',
 '1.7.160.0/24',
 '1.7.161.0/24',
 '1.7.162.0/24',
 '1.7.163.0/24',
 '1.7.164.0/22',
 '1.7.168.0/21',
 '1.7.176.0/23',
 '1.7.178.0/24',
 '1.7.179.0/24',
 '1.7.180.0/24',
 '1.7.181.0/24

In [21]:
df_asn_blocks_ipv4.count()

461902

In [22]:
df_asn_blocks_ipv4.dropDuplicates().count()

461902

* es gibt keine Duplikate in dem Datensatz

In [13]:
df_asn_blocks_ipv4.printSchema()

root
 |-- network: string (nullable = true)
 |-- autonomous_system_number: integer (nullable = true)
 |-- autonomous_system_organization: string (nullable = true)



### Anlayse der Spalten

In [24]:
df_asn_blocks_ipv4.filter(df_asn_blocks_ipv4["network"]=="null").count()

0

* keine Null Values

In [25]:
df_asn_blocks_ipv4.filter(df_asn_blocks_ipv4["autonomous_system_number"]=="null").count()

0

* keine Null Values

In [26]:
df_asn_blocks_ipv4.filter(df_asn_blocks_ipv4["autonomous_system_organization"]=="null").count()

0

* keine Null Values

## GeoLite2-Country-Locations-en

In [14]:
schema_country = StructType([
    StructField("geoname_id", IntegerType(), True),
    StructField("locale_code", StringType(), True),
    StructField("continent_code", StringType(), True),
    StructField("continent_name", StringType(), True),
    StructField("country_iso_code", StringType(), True),
    StructField("country_name", StringType(), True),
    StructField("is_in_european_union", StringType(), True)
])

df_country = spark.read.csv("s3a://bucket/GeoLite2-Country-Locations-en.csv", header=True, schema=schema_country).cache()

In [15]:
df_country.show(5)

+----------+-----------+--------------+--------------+----------------+------------+--------------------+
|geoname_id|locale_code|continent_code|continent_name|country_iso_code|country_name|is_in_european_union|
+----------+-----------+--------------+--------------+----------------+------------+--------------------+
|     49518|         en|            AF|        Africa|              RW|      Rwanda|                   0|
|     51537|         en|            AF|        Africa|              SO|     Somalia|                   0|
|     69543|         en|            AS|          Asia|              YE|       Yemen|                   0|
|     99237|         en|            AS|          Asia|              IQ|        Iraq|                   0|
|    102358|         en|            AS|          Asia|              SA|Saudi Arabia|                   0|
+----------+-----------+--------------+--------------+----------------+------------+--------------------+
only showing top 5 rows



In [16]:
df_country.printSchema()

root
 |-- geoname_id: integer (nullable = true)
 |-- locale_code: string (nullable = true)
 |-- continent_code: string (nullable = true)
 |-- continent_name: string (nullable = true)
 |-- country_iso_code: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- is_in_european_union: string (nullable = true)



In [30]:
df_country.count()

252

In [31]:
df_country.dropDuplicates().count()

252

* es gibt keine Duplikate in dem Datensatz

In [32]:
df_country.filter(df_country["country_name"]=="null").count()

0

* keine Null Values

## GeoLite2-City-Locations-en

In [17]:
schema_city = StructType([
    StructField("geoname_id", IntegerType(), True),
    StructField("locale_code", StringType(), True),
    StructField("continent_code", StringType(), True),
    StructField("continent_name", StringType(), True),
    StructField("country_iso_code", StringType(), True),
    StructField("country_name", StringType(), True),
    StructField("subdivision_1_iso_code", StringType(), True),
    StructField("subdivision_1_name", StringType(), True),
    StructField("subdivision_2_iso_code", StringType(), True),
    StructField("subdivision_2_name", StringType(), True),
    StructField("city_name", StringType(), True),
    StructField("metro_code", StringType(), True),
    StructField("time_zone", StringType(), True),
    StructField("is_in_european_union", StringType(), True)
])

df_city = spark.read.csv("s3a://bucket/GeoLite2-City-Locations-en.csv", header=True, schema = schema_city).cache()

In [18]:
df_city.show(5)

+----------+-----------+--------------+--------------+----------------+------------+----------------------+------------------+----------------------+------------------+---------+----------+----------------+--------------------+
|geoname_id|locale_code|continent_code|continent_name|country_iso_code|country_name|subdivision_1_iso_code|subdivision_1_name|subdivision_2_iso_code|subdivision_2_name|city_name|metro_code|       time_zone|is_in_european_union|
+----------+-----------+--------------+--------------+----------------+------------+----------------------+------------------+----------------------+------------------+---------+----------+----------------+--------------------+
|     49518|         en|            AF|        Africa|              RW|      Rwanda|                  null|              null|                  null|              null|     null|      null|   Africa/Kigali|                   0|
|     49747|         en|            AF|        Africa|              SO|     Somalia|    

In [19]:
df_city.printSchema()

root
 |-- geoname_id: integer (nullable = true)
 |-- locale_code: string (nullable = true)
 |-- continent_code: string (nullable = true)
 |-- continent_name: string (nullable = true)
 |-- country_iso_code: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- subdivision_1_iso_code: string (nullable = true)
 |-- subdivision_1_name: string (nullable = true)
 |-- subdivision_2_iso_code: string (nullable = true)
 |-- subdivision_2_name: string (nullable = true)
 |-- city_name: string (nullable = true)
 |-- metro_code: string (nullable = true)
 |-- time_zone: string (nullable = true)
 |-- is_in_european_union: string (nullable = true)



In [20]:
df_city.count()

121754

In [37]:
df_city.dropDuplicates().count()

121754

* es gibt keine Duplikate in dem Datensatz

In [38]:
df_city.filter(df_city["city_name"]=="null").count()

0

* Keine Null Values

# Data Cleaning

## Entfernen von Sonderzeichen im Datensatz real Domains

In [21]:
commaRep = udf(lambda x: re.sub('"','', x))
commaRep2 = udf(lambda x: re.sub(']','', x))
commaRep3 = udf(lambda x: re.sub('\[','', x))
commaRep5 = udf(lambda x: re.sub('\.','', x))

In [22]:
df_MXClean=df.withColumn('MX-Record',commaRep('MX-Record'))
df_MXClean=df_MXClean.withColumn('MX-Record',commaRep2('MX-Record'))
df_MXClean=df_MXClean.withColumn('MX-Record',commaRep3('MX-Record'))
df_MXClean.show(3, False)

+----------------+-----------------------------------------------+-----------------------+
|Top level domain|MX-Record                                      |A-Record               |
+----------------+-----------------------------------------------+-----------------------+
|0--1.de         |mail.0--1.de,mxf993.netcup.net                 |"[""46.38.249.145""]"  |
|0--2.de         |mxf993.netcup.net,mail.0--2.de                 |"[""212.227.212.163""]"|
|0-0-0-0-0.de    |smtp-02.tld.t-online.de,smtp-01.tld.t-online.de|"[""80.150.6.143""]"   |
+----------------+-----------------------------------------------+-----------------------+
only showing top 3 rows



In [23]:
df_AClean=df.withColumn('A-Record',commaRep('A-Record'))
df_AClean=df_AClean.withColumn('A-Record',commaRep2('A-Record'))
df_AClean=df_AClean.withColumn('A-Record',commaRep3('A-Record'))
df_AClean.show(3, False)

+----------------+-----------------------------------------------------------+---------------+
|Top level domain|MX-Record                                                  |A-Record       |
+----------------+-----------------------------------------------------------+---------------+
|0--1.de         |"[""mail.0--1.de"",""mxf993.netcup.net""]"                 |46.38.249.145  |
|0--2.de         |"[""mxf993.netcup.net"",""mail.0--2.de""]"                 |212.227.212.163|
|0-0-0-0-0.de    |"[""smtp-02.tld.t-online.de"",""smtp-01.tld.t-online.de""]"|80.150.6.143   |
+----------------+-----------------------------------------------------------+---------------+
only showing top 3 rows



 ## Splitten der Spalten im Datensatz real_domains

In [24]:
df_MXSplit=df_MXClean.select(
        "Top level domain", "A-Record",
        f.split("MX-Record", ",").alias("MX-Record"),
        f.posexplode(f.split("MX-Record", ",")).alias("pos_MX", "val_MX")
    )
df_MXSplit.show(5)

+----------------+--------------------+--------------------+------+--------------------+
|Top level domain|            A-Record|           MX-Record|pos_MX|              val_MX|
+----------------+--------------------+--------------------+------+--------------------+
|         0--1.de|"[""46.38.249.145...|[mail.0--1.de, mx...|     0|        mail.0--1.de|
|         0--1.de|"[""46.38.249.145...|[mail.0--1.de, mx...|     1|   mxf993.netcup.net|
|         0--2.de|"[""212.227.212.1...|[mxf993.netcup.ne...|     0|   mxf993.netcup.net|
|         0--2.de|"[""212.227.212.1...|[mxf993.netcup.ne...|     1|        mail.0--2.de|
|    0-0-0-0-0.de|"[""80.150.6.143""]"|[smtp-02.tld.t-on...|     0|smtp-02.tld.t-onl...|
+----------------+--------------------+--------------------+------+--------------------+
only showing top 5 rows



In [25]:
df_ASplit=df_AClean.select(
        "Top level domain", "MX-Record",
        f.split("A-Record", ", ").alias("A-Record"),
        f.posexplode(f.split("A-Record", ",")).alias("pos_A", "val_A")
    )
df_ASplit.show(20)

+-------------------+--------------------+-----------------+-----+---------------+
|   Top level domain|           MX-Record|         A-Record|pos_A|          val_A|
+-------------------+--------------------+-----------------+-----+---------------+
|            0--1.de|"[""mail.0--1.de"...|  [46.38.249.145]|    0|  46.38.249.145|
|            0--2.de|"[""mxf993.netcup...|[212.227.212.163]|    0|212.227.212.163|
|       0-0-0-0-0.de|"[""smtp-02.tld.t...|   [80.150.6.143]|    0|   80.150.6.143|
|         0-0-0-1.de|"[""smtp-02.tld.t...|   [80.150.6.143]|    0|   80.150.6.143|
|             0-0.de|                  []|  [185.53.178.13]|    0|  185.53.178.13|
|            0-01.de|"[""mail.0-01.de""]"| [193.34.145.200]|    0| 193.34.145.200|
|             0-1.de|   "[""localhost""]"| [91.195.241.137]|    0| 91.195.241.137|
|            0-10.de|                  []|               []|    0|               |
|          0-1000.de|   "[""localhost""]"| [91.195.241.137]|    0| 91.195.241.137|
|   

In [33]:
#dataFrame.select("ColumnName").rdd.map(r => r(0)).collect()
#df_ASplit.select("val_A").as[Synonym].collect

df_ASplit2 = df_ASplit.select(col("val_A")).collect()

In [81]:
df_ASplit2

[Row(val_A='46.38.249.145'),
 Row(val_A='212.227.212.163'),
 Row(val_A='80.150.6.143'),
 Row(val_A='80.150.6.143'),
 Row(val_A='185.53.178.13'),
 Row(val_A='193.34.145.200'),
 Row(val_A='91.195.241.137'),
 Row(val_A=''),
 Row(val_A='91.195.241.137'),
 Row(val_A='81.169.145.95'),
 Row(val_A='91.195.241.137'),
 Row(val_A='91.195.241.137'),
 Row(val_A='127.0.0.1'),
 Row(val_A='194.38.104.220'),
 Row(val_A='134.209.177.247'),
 Row(val_A=''),
 Row(val_A=''),
 Row(val_A='176.9.76.101'),
 Row(val_A=''),
 Row(val_A='185.163.116.240'),
 Row(val_A='81.169.145.66'),
 Row(val_A='80.150.6.143'),
 Row(val_A='213.239.200.93'),
 Row(val_A='91.195.241.137'),
 Row(val_A='91.195.241.137'),
 Row(val_A=''),
 Row(val_A='104.24.103.72'),
 Row(val_A='104.24.102.72'),
 Row(val_A='172.67.208.86'),
 Row(val_A='85.214.69.88'),
 Row(val_A='85.17.4.21'),
 Row(val_A='80.190.158.32'),
 Row(val_A='46.38.249.161'),
 Row(val_A='85.13.133.233'),
 Row(val_A='91.195.241.137'),
 Row(val_A='109.237.140.53'),
 Row(val_A='188.

In [92]:
df_ASplit2_array = [str(row.val_A) for row in df_ASplit2]

In [131]:
df_ASplit2_array

['46.38.249.145',
 '212.227.212.163',
 '80.150.6.143',
 '80.150.6.143',
 '185.53.178.13',
 '193.34.145.200',
 '91.195.241.137',
 '',
 '91.195.241.137',
 '81.169.145.95',
 '91.195.241.137',
 '91.195.241.137',
 '127.0.0.1',
 '194.38.104.220',
 '134.209.177.247',
 '',
 '',
 '176.9.76.101',
 '',
 '185.163.116.240',
 '81.169.145.66',
 '80.150.6.143',
 '213.239.200.93',
 '91.195.241.137',
 '91.195.241.137',
 '',
 '104.24.103.72',
 '104.24.102.72',
 '172.67.208.86',
 '85.214.69.88',
 '85.17.4.21',
 '80.190.158.32',
 '46.38.249.161',
 '85.13.133.233',
 '91.195.241.137',
 '109.237.140.53',
 '188.68.47.135',
 '91.195.241.137',
 '91.195.241.137',
 '104.27.165.83',
 '104.27.164.83',
 '172.67.143.97',
 '185.230.63.107',
 '185.230.63.171',
 '185.230.63.186',
 '185.30.32.170',
 '185.53.178.11',
 '144.76.158.228',
 '91.195.241.137',
 '217.160.0.224',
 '213.133.107.131',
 '52.213.24.106',
 '85.13.131.16',
 '91.195.241.137',
 '85.214.69.88',
 '87.106.193.115',
 '212.3.79.90',
 '81.169.145.88',
 '91.195.

In [95]:
from ipaddress import ip_network, ip_address
#import ipaddress

# Test mit den 2 Array-Listen

In [127]:
#ipaddress.IPv4Address('192.0.2.1') in ipaddress.IPv4Network('192.0.2.0/29')
#ipaddress.IPv4Address('127.0.0.1') in ipaddress.IPv4Network('192.168.0.0/16')
#ipaddress.IPv4Address(df_ASplit2_array[2]) in ipaddress.IPv4Network('192.0.2.0/29')
ip_address(df_ASplit2_array[0]) in ip_network(df_asn_blocks_ipv4_2_array[0])

False

# Test mit Liste

In [102]:
a = ['1.0.0.0/24', '1.0.4.0/22',
     '1.0.64.0/18', '1.0.128.0/17']

for i in a:
    try:
        print()
        print(i)
        ip_network(i)
    except ValueError:
        print('invalid')
    else:
        print('valid')


1.0.0.0/24
valid

1.0.4.0/22
valid

1.0.64.0/18
valid

1.0.128.0/17
valid


In [47]:
a

['46.38.249.145',
 '255.255.255.0',
 '192.168.1.10',
 '999.999.999.999',
 '123.456.789',
 'foobar',
 '123456']

# Neue Spalte Network in df_ASplit hinzufügen

In [None]:
#df_ASplit = df_ASplit.withColumn('network2', when(ip_address(str(df_ASplit.val_A)) in ip_network(df_asn_blocks_ipv4.network), lit("True"))\
#                                .otherwise(lit("False")))

df_ASplit = df_ASplit.withColumn('network2', when(ip_address(df_ASplit2_array) in ip_network(df_asn_blocks_ipv4_2_array), lit("True"))\
                               .otherwise(lit("False")))

df_ASplit.show(3, False)

In [None]:
df_ASplit.count()

In [None]:
df_ASplit.printSchema()

## Entfernen von Sonderzeichen im Datensatz asn ip4 Blocks

In [None]:
#commaRep4 = udf(lambda x: re.sub('/','', x))

In [128]:
split_col = f.split(df_asn_blocks_ipv4['network'], '/')
df_asn_blocks_ipv4 = df_asn_blocks_ipv4.withColumn('network_adress', split_col.getItem(0))
df_asn_blocks_ipv4 = df_asn_blocks_ipv4.withColumn('subnetzmaske', split_col.getItem(1))

df_asn_blocks_ipv4.show()

+------------+------------------------+------------------------------+--------------+------------+
|     network|autonomous_system_number|autonomous_system_organization|network_adress|subnetzmaske|
+------------+------------------------+------------------------------+--------------+------------+
|  1.0.0.0/24|                   13335|                 CLOUDFLARENET|       1.0.0.0|          24|
|  1.0.4.0/22|                   38803|          Wirefreebroadband...|       1.0.4.0|          22|
| 1.0.64.0/18|                   18144|          Energia Communica...|      1.0.64.0|          18|
|1.0.128.0/17|                   23969|          TOT Public Compan...|     1.0.128.0|          17|
|  1.1.1.0/24|                   13335|                 CLOUDFLARENET|       1.1.1.0|          24|
|  1.1.8.0/24|                   58543|                     Guangdong|       1.1.8.0|          24|
| 1.1.64.0/19|                    2519|          ARTERIA Networks ...|      1.1.64.0|          19|
| 1.1.96.0

## Test

In [129]:
list(ip_network('192.0.2.0/29').hosts()) 

[IPv4Address('192.0.2.1'),
 IPv4Address('192.0.2.2'),
 IPv4Address('192.0.2.3'),
 IPv4Address('192.0.2.4'),
 IPv4Address('192.0.2.5'),
 IPv4Address('192.0.2.6')]

# Neue Spalte Ip-Adresse in df_asn_blocks_ipv4 hinzufügen

In [None]:
df_asn_blocks_ipv4 = df_asn_blocks_ipv4.withColumn('ip_adress', list(ip_network(df_asn_blocks_ipv4.network).hosts()))

#df_asn_blocks_ipv4 = df_asn_blocks_ipv4.withColumn('ip_adress', list(ipaddress.IPv4Network(df_asn_blocks_ipv4.network).hosts()))

In [None]:
#df_networkClean=df_asn_blocks_ipv4.withColumn('network',commaRep4('network'))
#df_networkClean.show(3, False)

In [None]:
#df_networkClean.printSchema()

In [None]:
df_asn_blocks_ipv4.printSchema()

## Unbenötigte Spalten im Datensatz asn ip4 country entfernen 

In [None]:
df_country = df_country.drop("locale_code", "continent_code", "continent_name", "country_iso_code", "is_in_european_union")

In [None]:
df_country.show()

## Unbenötigte Spalten im Datensatz asn ip4 city entfernen

In [None]:
df_city = df_city.drop("locale_code", "continent_code", "continent_name", "country_iso_code", "country_name", 
                             "subdivision_1_iso_code", "subdivision_1_name", "subdivision_2_iso_code", "subdivision_2_name",
                             "metro_code", "time_zone", "is_in_european_union")

In [None]:
df_city.show()

# Analyse

## Joining von Tabelle real_new und asn_blocks_ipv4

In [None]:
df_gejoint = df_ASplit.join(df_asn_blocks_ipv4, df_ASplit["val_A"] == df_asn_blocks_ipv4["network_adress"],"inner")

In [None]:
df_gejoint.count()

In [None]:
df_gejoint.show()

## Joining von Tabelle df_country und df_city

In [None]:
df_gejoint2 = df_country.join(df_city, df_country["geoname_id"] == df_city["geoname_id"])

In [None]:
df_gejoint2.show()

#### Joining von all Tabellen

In [None]:
df_gejoint_all = df_networkClean2\
            .join(df_AClean2, df_networkClean2["network2"] == df_AClean2["A-Record2"])\
            .join(df_country, df_networkClean2["autonomous_system_number"] == df_country["geoname_id"])\
            .join(df_city, df_networkClean2["autonomous_system_number"] == df_city["geoname_id"])

In [None]:
df_gejoint_all.show()