In [28]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StringType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import concat,concat_ws, lit,col, count,udf,collect_list,array, array_union, explode,collect_set
from pyspark.sql.window import Window
import findspark
import csv
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import CountVectorizer

# PART 2:
## 1. Grouping similar processes by their Jaccard similarity
## 2. Creating the new data

In [29]:
# findspark.init() runs before the session is requested.
findspark.init()

# Session settings, applied to the builder in the order listed here.
spark_settings = {
    "spark.executor.memory": "8g",
    "spark.executor.cores": "2",
    "spark.executor.instances": "3",
    "spark.driver.memory": "4g",
    "spark.driver.cores": "2",
    "spark.sql.shuffle.partitions": "200",
    "spark.executor.heartbeatInterval": "100s",
    "spark.sql.broadcastTimeout": "3600s",
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
    "spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored": "true",
}

# Name the application and point it at the standalone master.
session_builder = SparkSession.builder.appName("part2Grouping").master("spark://192.168.1.81:7077")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an existing session if one is active, otherwise create it.
spark = session_builder.getOrCreate()

In [30]:
input_file = 'part1Output.txt'
output_file = 'part1Output.csv'
headers = ['FromServer', 'ToServer', 'time', 'action', 'processId']


def _parse_record(raw_line):
    """Turn one `<from,to,time,action,processId>` line into a CSV row dict.

    Surrounding whitespace and the enclosing angle brackets are removed before
    the line is split on commas. Returns None (after printing a message) when
    the line does not have exactly five fields or when `time` / `processId`
    cannot be converted to int.
    """
    body = raw_line.strip().strip('<>')
    fields = body.split(',')
    if len(fields) != 5:
        print(f"Skipping malformed line: {body}")
        return None

    from_server, to_server, time_text, action, process_text = fields
    try:
        return {
            'FromServer': from_server.strip("'"),
            'ToServer': to_server.strip(),
            'time': int(time_text.strip()),
            'action': action.strip(),
            'processId': int(process_text.strip()),
        }
    except ValueError as e:
        print(f"Error processing line: {body}. Error: {e}")
        return None


# Read every line of the custom-format file and keep the ones that parse.
with open(input_file, 'r') as file:
    parsed_records = [_parse_record(raw_line) for raw_line in file.readlines()]
processed_lines = [record for record in parsed_records if record is not None]

# Export the parsed records to CSV, header row first.
with open(output_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    writer.writerows(processed_lines)

print(f"Data has been successfully written to {output_file}")

Data has been successfully written to part1Output.csv


In [31]:
data_path = "reverse_cases.csv"
# Load the CSV with its header row and let Spark infer the column types.
dataForPart2 = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True,
)

# For each processId, collect the distinct FromServer values into one array.
# Only FromServer is aggregated here; ToServer is not part of servers_array.
distinct_from_servers = collect_set("FromServer").alias("servers_array")
agg_df = dataForPart2.groupBy("processId").agg(distinct_from_servers)


In [32]:
# Encode each process's server array as a term-count vector:
# the vocabulary is learned from agg_df and then applied to the same frame.
cv = CountVectorizer(
    inputCol="servers_array",
    outputCol="features",
)
cv_model = cv.fit(agg_df)
cv_df = cv_model.transform(agg_df)


In [33]:

# List every vocabulary term next to its position in the fitted vocabulary.
vocabulary_terms = cv_model.vocabulary
print("Distinct Attributes (Vocabulary):")
for position in range(len(vocabulary_terms)):
    print(f"{position}. {vocabulary_terms[position]}")


Distinct Attributes (Vocabulary):
0. null
1. s5
2. ps1
3. s6
4. ps3
5. s9
6. ps10
7. p9
8. s8
9. ps4
10. s4
11. p1
12. p8
13. s2
14. s7
15. p10
16. s10
17. ps8
18. p6
19. p7
20. ps9
21. ps6
22. s3
23. p2
24. ps5
25. ps7
26. ps2
27. p3
28. s1
29. p5
30. p4


In [34]:
def jaccard_similarity(vec1, vec2):
    """Return the Jaccard similarity of two vectors based on their `indices`.

    Each argument must expose an `indices` attribute; the values it holds are
    treated as set members. The result is |A ∩ B| / |A ∪ B| as a float, or
    0.0 when both index collections are empty.
    """
    first = set(vec1.indices)
    second = set(vec2.indices)
    combined = first | second
    if not combined:
        # Nothing present on either side: avoid dividing by zero.
        return 0.0
    shared = first & second
    return float(len(shared)) / len(combined)
# Wrap jaccard_similarity as a Spark UDF whose declared return type is FloatType.
# NOTE(review): this name is assigned again at the end of this cell, so the
# later assignment is the one that is in effect for the following cells.
jaccard_similarity_udf = udf(jaccard_similarity, FloatType())
# Merge overlapping groups
def merge_groups(group_list):
    """Merge lists that share at least one element into disjoint groups.

    Args:
        group_list: iterable of iterables of hashable items (e.g. lists of
            process ids). None is treated as an empty input.

    Returns:
        A list of lists. Any two input lists that are connected, directly or
        through a chain of shared elements, end up in the same output list,
        and no element appears in more than one output list. Groups keep the
        position of the earliest input group they contain; the order of the
        elements inside a group is unspecified.
    """
    groups = []
    for group in group_list or []:
        members = set(group)
        # A new list can touch several existing groups at once; all of them
        # have to collapse into one, not just the first match.
        overlapping = [existing for existing in groups if not existing.isdisjoint(members)]
        if not overlapping:
            groups.append(members)
            continue

        survivor, absorbed = overlapping[0], overlapping[1:]
        survivor.update(members)
        for other in absorbed:
            survivor.update(other)
        if absorbed:
            # Drop the absorbed sets by identity so the survivor keeps its slot.
            groups = [
                existing for existing in groups
                if not any(existing is other for other in absorbed)
            ]
    return [list(group) for group in groups]
# Register the two helpers as Spark UDFs.
# The similarity is declared as "double" instead of FloatType: a float32 value
# of 0.7 is widened to ~0.69999999 when Spark compares it with the double
# literal 0.7, so pairs sitting exactly on a `>= 0.7` cut-off were dropped.
jaccard_similarity_udf = udf(jaccard_similarity, "double")
# merge_groups is passed directly; a lambda wrapper adds nothing here.
merge_groups_udf = udf(merge_groups, ArrayType(ArrayType(IntegerType())))

In [35]:

# Fit a MinHash LSH model on the feature vectors and attach the hash
# signatures to every row in a new "hashes" column.
numOftables = 10
minhash = MinHashLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=numOftables,
)
model = minhash.fit(cv_df)
transformed_df = model.transform(cv_df)


In [36]:

# Self-join the hashed frame to find pairs of processes within the distance threshold.
threshold = 0.5
pairwise_join = model.approxSimilarityJoin(
    transformed_df,
    transformed_df,
    threshold,
    distCol="JaccardDistance",
)

pair_columns = [
    col("datasetA.processId").alias("processIdA"),
    col("datasetB.processId").alias("processIdB"),
    col("JaccardDistance"),
    col("datasetA.features").alias("featuresA"),
    col("datasetB.features").alias("featuresB"),
]
# processIdA < processIdB removes self-pairs and keeps each pair only once.
candidates = pairwise_join.select(*pair_columns).filter(col("processIdA") < col("processIdB"))

print("Candidates")
candidates.show()

Candidates
+----------+----------+---------------+--------------------+--------------------+
|processIdA|processIdB|JaccardDistance|           featuresA|           featuresB|
+----------+----------+---------------+--------------------+--------------------+
|      1823|      1824|            0.0|(31,[0,7,11,12,15...|(31,[0,7,11,12,15...|
|      1825|      1826|            0.0|(31,[0,2,4,6,9,17...|(31,[0,2,4,6,9,17...|
|      1821|      1822|            0.0|(31,[0,1,3,5,8,10...|(31,[0,1,3,5,8,10...|
+----------+----------+---------------+--------------------+--------------------+



In [37]:
# Score every candidate pair with the Jaccard UDF.
similarity_df = candidates.withColumn(
    "JaccardSimilarity",
    jaccard_similarity_udf(col("featuresA"), col("featuresB")),
)

# Keep only the pairs whose similarity reaches the 0.7 cut-off.
similarity_df = similarity_df.filter(col("JaccardSimilarity") >= 0.7)

# For each processIdA: the ids it is similar to, plus processIdA itself.
grouped_df = similarity_df.groupBy("processIdA").agg(
    collect_list("processIdB").alias("similar_processIDs")
)
grouped_df = grouped_df.withColumn(
    "all_processIDs",
    array_union(array(col("processIdA")), col("similar_processIDs")),
)

# One row per (member, representative), then back to one list per representative.
exploded_df = grouped_df.select(
    explode(col("all_processIDs")).alias("processID"),
    col("processIdA").alias("group_representative"),
)
lists_per_representative = exploded_df.groupBy("group_representative").agg(
    collect_list("processID").alias("group_list")
)

# Gather all lists into a single row so merge_groups_udf can see them together.
grouped_lists = lists_per_representative.agg(
    collect_list("group_list").alias("group_lists")
)
merged_groups = (
    grouped_lists
    .withColumn("merged_groups", merge_groups_udf(col("group_lists")))
    .select(explode(col("merged_groups")).alias("final_group"))
)

# Label each merged group with its member ids joined by "_".
final_groups_df = merged_groups.select(
    concat_ws("_", col("final_group")).alias("Group"),
    col("final_group"),
)
print("the number of groups is ",final_groups_df.count())
output_path = "./output/part2Observations.txt"

# Distinct ids of all processes that ended up in some group.
processes_from_groups = final_groups_df.select(
    explode(col("final_group")).alias("processID")
).distinct()

# Semi join: keep only the original rows whose process was grouped.
df_with_groups = dataForPart2.join(processes_from_groups, "processID", "semi")
df_with_groups.show()

# Attach the group label to every remaining row.
exploded_final_groups_df = final_groups_df.select(
    "Group",
    explode("final_group").alias("processID"),
)
joined_df = df_with_groups.join(exploded_final_groups_df, "processID")
joined_df.show()

the number of groups is  3
+---------+----------+--------+----+--------+
|processId|FromServer|ToServer|time|  action|
+---------+----------+--------+----+--------+
|     1821|      null|      S1|   0| Request|
|     1821|        s1|      S2|   1| Request|
|     1821|        s2|      S3|   2| Request|
|     1821|        s3|      S4|   3| Request|
|     1821|        s4|      S5|   4| Request|
|     1821|        s5|      S6|   5| Request|
|     1821|        s6|      S7|   6| Request|
|     1821|        s7|      S8|   7| Request|
|     1821|        s8|      S9|   8| Request|
|     1821|        s9|     S10|   9| Request|
|     1821|       s10|      S9|  10|Response|
|     1821|        s9|      S8|  11|Response|
|     1821|        s8|      S7|  12|Response|
|     1821|        s7|      S6|  13|Response|
|     1821|        s6|      S5|  14|Response|
|     1821|        s5|      S4|  15|Response|
|     1821|        s4|      S3|  16|Response|
|     1821|        s3|      S2|  17|Response|
|     1

# Creating the txt files:
## The output files are written to the `output` folder

In [38]:
def write_groups_to_txt(grouped_df, output_path):
    """Write every group and the formatted rows of its processes to a text file.

    Args:
        grouped_df: object whose `collect()` returns rows that can be indexed
            by "processIDs" (iterable of process ids) and "formatted_rows"
            (iterable of strings shaped like `<...,processId>`).
        output_path: destination file; its parent folder is created if missing.

    For each group the file gets a `Group: {id, id, ...}` line (ids unique
    and sorted), then one `id:` section per process listing the formatted rows
    that end with `,id>`, or a placeholder line when there are none, and
    finally a blank line.
    """
    # Collect first, so a failing Spark job does not leave a truncated file behind.
    collected_rows = grouped_df.collect()

    # open() does not create missing folders, so create the parent on demand.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w") as file:
        for group_row in collected_rows:
            # Ensure process ids are unique and sorted.
            process_ids = sorted(set(group_row["processIDs"]))
            formatted_rows = group_row["formatted_rows"]

            file.write(f"Group: {{{', '.join(map(str, process_ids))}}}\n")

            for process_id in process_ids:
                file.write(f"{process_id}:\n")
                # The leading comma keeps id 12 from matching a row for id 112.
                suffix = f",{process_id}>"
                matching_rows = [line for line in formatted_rows if line.endswith(suffix)]
                if matching_rows:
                    file.writelines(f"{line}\n" for line in matching_rows)
                else:
                    file.write("<No corresponding formatted rows found>\n")

            file.write("\n")

In [39]:
# Render every joined row as "<FromServer,ToServer,time,action,processID>".
row_pieces = [
    lit("<"), col("FromServer"),
    lit(","), col("ToServer"),
    lit(","), col("time"),
    lit(","), col("action"),
    lit(","), col("processID"),
    lit(">"),
]
formatted_df = joined_df.withColumn("formatted_row", concat_ws("", *row_pieces))

# One row per group: the process ids seen and all of their formatted rows.
process_id_list = collect_list("processID").alias("processIDs")
formatted_row_list = collect_list("formatted_row").alias("formatted_rows")
grouped_df = formatted_df.groupBy("Group").agg(process_id_list, formatted_row_list)

output_path = "./output/part2Observations.txt"
write_groups_to_txt(grouped_df, output_path)

In [40]:
# Stop the SparkSession held in `spark` now that the notebook's work is done.
spark.stop()