<a href="https://colab.research.google.com/github/mybox-lab/de_test14/blob/main/de_test14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faker

Collecting faker
  Downloading Faker-28.4.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-28.4.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-28.4.1


In [2]:
import csv
from faker import Faker
import random

fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [3]:
!pip install pyspark py4j

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=dd648e1c12489d681258ae59c1263a70528d7833cae1f984e5c08eada53dc584
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum as spark_sum, to_date

# Создаем Spark-сессию
spark = SparkSession.builder \
    .appName("Web Server Log Analysis") \
    .getOrCreate()

# Загружаем CSV-файл в Spark DataFrame
file_path = "web_server_logs.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Просмотр структуры данных
df.printSchema()

root
 |-- ip: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- method: string (nullable = true)
 |-- url: string (nullable = true)
 |-- response_code: integer (nullable = true)
 |-- response_size: integer (nullable = true)



In [6]:
# Группируем по IP и считаем количество запросов для каждого IP
ip_group = df.groupBy("ip").agg(count("*").alias("request_count"))

# Отсортировать по количеству запросов и вывести топ-10
top_10_ips = ip_group.orderBy(col("request_count").desc()).limit(10)

# Выводим результат
print("Top 10 active IP addresses")
top_10_ips.show()

Top 10 active IP addresses
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
|   106.5.41.138|            1|
|  16.125.110.90|            1|
| 199.32.201.251|            1|
| 134.133.115.95|            1|
|130.194.241.114|            1|
|147.114.117.155|            1|
|  75.191.248.93|            1|
| 155.153.198.39|            1|
| 147.244.173.43|            1|
|  25.130.207.69|            1|
+---------------+-------------+



In [11]:
# Группировка по HTTP-методу и подсчет количества запросов
method_group = df.groupBy("method").agg(count("*").alias("method_count"))

# Выводим результат
print("Request count by HTTP method")
method_group.show()

Request count by HTTP method
+------+------------+
|method|method_count|
+------+------------+
|  POST|       24752|
|DELETE|       25166|
|   PUT|       25222|
|   GET|       24860|
+------+------------+



In [9]:
# Фильтруем по коду ответа 404 и считаем количество запросов
not_found_count = df.filter(col("response_code") == 404).count()

# Выводим результат
print(f"Number of 404 response codes: {not_found_count}")

Number of 404 response codes: 24968


In [10]:
# Преобразуем столбец timestamp в дату
df = df.withColumn("date", to_date(col("timestamp")))

# Группировка по дате и суммирование размера ответов
date_group = df.groupBy("date").agg(spark_sum("response_size").alias("total_response_size"))

# Сортировка по дате
date_group_sorted = date_group.orderBy("date")

# Выводим результат
print("Total response size by day")
date_group_sorted.show()

Total response size by day
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2024-01-01|            1806975|
|2024-01-02|            2218508|
|2024-01-03|            2108598|
|2024-01-04|            1960409|
|2024-01-05|            1891728|
|2024-01-06|            1965383|
|2024-01-07|            1991990|
|2024-01-08|            2041413|
|2024-01-09|            1960275|
|2024-01-10|            2030183|
|2024-01-11|            2139162|
|2024-01-12|            2069330|
|2024-01-13|            2140219|
|2024-01-14|            2024670|
|2024-01-15|            1874887|
|2024-01-16|            2262220|
|2024-01-17|            1979441|
|2024-01-18|            2248879|
|2024-01-19|            2122195|
|2024-01-20|            2076936|
+----------+-------------------+
only showing top 20 rows

