# Testing

In [1]:
import os
import sys
import re
import pandas as pd
import pickle

In [2]:
# Test1 Processing of data to dictionary

# os.path.dirname("VINF") is the empty string, so this resolves to the parent
# of the current working directory.
D_RAW_DIR = os.path.realpath(os.path.join(os.path.dirname("VINF"), '..'))
test_tmp_dir = os.path.join(D_RAW_DIR, 'source_code', 'test_tmp')
symptom_index_path = os.path.realpath(os.path.join(test_tmp_dir, 'symptoms_index_test.pickle'))
main_document_test_path = os.path.realpath(os.path.join(test_tmp_dir, 'main_document_test.csv'))

# Tab-separated fixture; missing cells become empty strings.
main_df = pd.read_csv(main_document_test_path, sep='\t').fillna("")

# Inverted index: symptom name -> list of row numbers that mention it.
symptom_index = {}
for row_number, raw_symptoms in enumerate(main_df['Symptom']):
    if raw_symptoms == "":
        continue
    # Symptoms of one row are separated by ';'
    for token in raw_symptoms.split(";"):
        # setdefault creates the posting list on first sight of a symptom
        symptom_index.setdefault(token.strip(), []).append(row_number)

actual_dictionary = symptom_index
expected_dictionary = {'cough': [0], 'fatigue': [0], 'headache': [0, 1], 'fever': [0, 1], 'vomiting': [1]}

try:
    assert actual_dictionary == expected_dictionary
    print("Test succeeded")
except AssertionError:
    print("Test failed")

Test succeeded


In [3]:
# Test2 Writing and reading index - testing main file
# NOTE: D_RAW_DIR is defined in the Test1 cell; the test_tmp directory must already exist.
symptom_index_path = os.path.realpath(os.path.join(D_RAW_DIR, 'source_code', 'test_tmp', 'symptoms_index_test2.pickle'))

# Self-contained fixture: dump this dictionary (not the index left over from
# Test1) so the round-trip test does not depend on another test's result.
test_dictionary = {'cough': [0], 'fatigue': [0], 'headache': [0, 1], 'fever': [0, 1], 'vomiting': [1]}
with open(symptom_index_path, 'wb') as handle:
    pickle.dump(test_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

# The file was written just above, so unpickling it here is safe;
# never call pickle.load on files from an untrusted source.
with open(symptom_index_path, 'rb') as handle:
    index_dict = pickle.load(handle)

actual_index_symptom_keys = list(index_dict.keys())
expected_index_symptom_keys = ['cough', 'fatigue', 'headache', 'fever', 'vomiting']

try:
    # Key order must survive the round trip ...
    assert actual_index_symptom_keys == expected_index_symptom_keys
    # ... and so must the posting lists themselves.
    assert index_dict == test_dictionary
    print("Test succeeded")
except AssertionError:
    print("Test failed")

Test succeeded


In [4]:
# Test3 testing text searching and processing of spark dataframe
# Setting up spark session
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, ArrayType

# Make PySpark use the same interpreter that is running this notebook.
for env_name in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_name] = sys.executable

spark = (
    SparkSession.builder
    .appName("VINF_disease_searching_luley_michal")
    .master("local[*]")
    .getOrCreate()
)

# Input fixture: one page made of a title and its text.
input_schema = StructType([
    StructField("title", StringType(), True),
    StructField("_VALUE", StringType(), True),
])
sdf_values = [["Coronavirus", "Coronavirus is dangerous virus that is spread in China and has symptom like cough, fatigue, headache and fever."]]
dfs_valuable = spark.createDataFrame(data=sdf_values, schema=input_schema)

# Expected result of running the extraction pipeline on the input fixture.
expected_schema = StructType([
    StructField("disease", StringType(), True),
    StructField("type", StringType(), True),
    StructField("symptoms", ArrayType(StringType()), True),
    StructField("countries", ArrayType(StringType()), True),
    StructField("transmissions", ArrayType(StringType()), True),
])
sdf_values_expected = [["coronavirus", "virus", ["fever", "fatigue", "cough", "headache"], ["china"], None]]
sdf_expected = spark.createDataFrame(data=sdf_values_expected, schema=expected_schema)

# Defining regex patterns to be used for regex searching
# Disease names are passed through re.escape so that names containing regex
# metacharacters (e.g. parentheses) are matched literally.
diseases_pattern = re.compile('|'.join(re.escape(name) for name in ["Coronavirus", "Dengue fever"]), re.IGNORECASE)
diseases_type_pattern = re.compile('bacteria|virus|viral', re.IGNORECASE)
symptoms_pattern = re.compile('|'.join(["cough", "fatigue", "headache", "fever"]), re.IGNORECASE)
countries_pattern = re.compile('|'.join(['China', "India"]), re.IGNORECASE)
# Each transmission route is its own alternative; a single "a, b" string would
# only ever match that exact comma-separated text.
transmission_pattern = re.compile('|'.join(["mosquito", "blood"]), re.IGNORECASE)

from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import udf

# Defining udf for finding matches between page and item(disease, symptom, transmission, country)
def get_matching_string(line, regex):
    """Return the distinct, lower-cased matches of ``regex`` in ``line``.

    Args:
        line: Text to search. ``None`` (e.g. a null value in a nullable
            column) is treated as "nothing matched".
        regex: Pattern (string or compiled) handed to ``re.findall``.

    Returns:
        A list of the unique lower-cased matches, or ``None`` when there is
        no match. The list is built from a set, so its element order is
        arbitrary.
    """
    if line is None:
        # re.findall would raise TypeError on None
        return None
    matches = list({str(found).lower() for found in re.findall(regex, line)})
    return matches or None

# Defining udf for finding matches between page and disease type
def get_matching_string_type(line, regex):
    """Return the disease type mentioned most often in ``line``.

    Matches are lower-cased and "viral" is folded into "virus" *before*
    counting, so differently cased or worded mentions of the same type are
    tallied together.

    Args:
        line: Text to search. ``None`` is treated as "nothing matched".
        regex: Pattern (string or compiled) handed to ``re.findall``.

    Returns:
        The most frequent normalised type (the first one seen wins a tie),
        or ``None`` when nothing matches.
    """
    if line is None:
        # re.findall would raise TypeError on None
        return None
    counts = {}
    for found in re.findall(regex, line):
        disease_type = found.lower().replace("viral", "virus")
        counts[disease_type] = counts.get(disease_type, 0) + 1
    if not counts:
        # max() on an empty collection would raise ValueError
        return None
    # dicts keep insertion order and max() returns the first maximum,
    # so ties resolve to the type that appeared first in the text
    best = max(counts, key=counts.get)
    return best or None

# Declaring UDF functions
from pyspark.sql.functions import udf


def _list_match_udf(pattern):
    """Wrap get_matching_string, bound to ``pattern``, as a UDF returning an array of strings."""
    return udf(lambda line: get_matching_string(line, pattern), ArrayType(StringType()))


# The disease type is a single string, so it gets its own UDF with a StringType result.
udf_func_type = udf(lambda line: get_matching_string_type(line, diseases_type_pattern), StringType())
udf_func_diseases = _list_match_udf(diseases_pattern)
udf_func_symptoms = _list_match_udf(symptoms_pattern)
udf_func_countries = _list_match_udf(countries_pattern)
udf_func_transmissions = _list_match_udf(transmission_pattern)

# Creating columns disease, type, symptoms, countries, transmissions and dropping the
# no longer needed columns _VALUE and title
from pyspark.sql.functions import col, regexp_extract
dfs_final = (
    dfs_valuable
    # The first match found in the title is used as the disease name.
    .withColumn("disease", udf_func_diseases(col('title'))[0])
    # Rows whose title did not match any disease are discarded.
    .na.drop(subset=["disease"])
    .withColumn("type", udf_func_type('_VALUE'))
    .withColumn("symptoms", udf_func_symptoms(col('_VALUE')))
    .withColumn("countries", udf_func_countries(col('_VALUE')))
    .withColumn("transmissions", udf_func_transmissions(col('_VALUE')))
    .drop(col("_VALUE"))
    .drop(col("title"))
)

print(dfs_final.head(5))
print(sdf_expected.head(5))


def _rows_ignoring_array_order(df):
    """Collect ``df`` as a list of dicts with every array column sorted.

    get_matching_string builds its lists from a set, so the order of the
    elements inside the array columns is not guaranteed; sorting them makes
    the comparison below independent of that order.
    """
    return [
        {name: sorted(value) if isinstance(value, list) else value
         for name, value in row.asDict().items()}
        for row in df.collect()
    ]


try:
    assert _rows_ignoring_array_order(dfs_final) == _rows_ignoring_array_order(sdf_expected)
    print("Test succeeded")
except AssertionError:
    print("Test failed")



[Row(disease='coronavirus', type='virus', symptoms=['fever', 'fatigue', 'cough', 'headache'], countries=['china'], transmissions=None)]
[Row(disease='coronavirus', type='virus', symptoms=['fever', 'fatigue', 'cough', 'headache'], countries=['china'], transmissions=None)]
Test succeeded
