In [0]:
pip install tqdm

In [0]:
pip install faker

In [0]:
pip install pymongo

In [0]:
pip install dnspython

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import sklearn as sk
from sklearn.ensemble import RandomForestClassifier
from scipy import spatial
import collections
from os.path import join, exists


# Read and process gene data

In [0]:
def add_gene_location(df):
    """
    Select the gene rows of an annotation table and add gene name and location columns.
    Args:
        df: Dataframe with gene information. Must contain the columns
            "type", "attributes", "start" and "end".
    Return:
        A new dataframe holding only the rows whose "type" is "gene", with two
        extra columns: "gene_name" (value of the first attribute that starts
        with "Name") and "location" (a (start, end) tuple).
        The input dataframe is left untouched.
    Raises:
        IndexError: if a gene row has no attribute starting with "Name".
    """
    # Work on an explicit copy: adding columns to a filtered slice triggers
    # pandas' SettingWithCopyWarning and is not guaranteed to take effect.
    gene_data_df = df[df["type"] == "gene"].copy()

    def _gene_name(attributes):
        # Attributes are ";"-separated "key=value" pairs; keep the value of the first "Name..." pair.
        names = [field.split("=")[-1] for field in str(attributes).split(";") if field.startswith("Name")]
        return names[0]

    gene_data_df["gene_name"] = gene_data_df["attributes"].apply(_gene_name)
    gene_data_df["location"] = list(zip(gene_data_df["start"], gene_data_df["end"]))

    return gene_data_df

In [0]:
def process_gene_data_to_dict(gene_data_df):
    """
    Build a per-chromosome lookup of gene locations and names.
    Args:
        gene_data_df: Dataframe with gene information. Has to have the columns
            "seqid", "location" and "gene_name".
    Return:
        Dictionary keyed by seqid; each value is an OrderedDict mapping a
        location to its gene name, ordered by location.
    """
    genes_by_seqid = {}
    for seqid, seqid_rows in gene_data_df.groupby('seqid'):
        # A later row with the same location overwrites an earlier one.
        name_by_location = {}
        for location, gene_name in zip(seqid_rows.location, seqid_rows.gene_name):
            name_by_location[location] = gene_name

        genes_by_seqid[seqid] = collections.OrderedDict(
            (location, name_by_location[location]) for location in sorted(name_by_location)
        )

    return genes_by_seqid

#### Read the gene annotation file into a dataframe and parse every row to find the gene name and gene location. `ordered_gene_dict` is a dictionary with the basic gene data: the chromosome is the key, and the names and exact genome locations of the genes on that chromosome are the values.

In [0]:
# Column names assigned to the annotation table, in file order (the file has no header row).
column_names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
file_path = "/dbfs/FileStore/tables/Homo_sapiens_GRCh38_85_gff3.gz"
# Gzip-compressed, tab-separated input; anything after a '#' is ignored as a comment.
df = pd.read_csv(file_path, compression='gzip',
                 sep='\t', comment='#', low_memory=False,
                 header=None, names=column_names)
# Keep only the "gene" rows and attach the gene_name / location columns.
gene_data_df = add_gene_location(df)

# Per-seqid lookup of location -> gene name, used to annotate patient SNPs.
ordered_gene_dict = process_gene_data_to_dict(gene_data_df)

# Add gene name to patient snp and save to file

In [0]:
def find_gene_name_from_location(chromosome, position, ordered_gene_dict):
    """
    Find gene names for a chosen location in the genome.
    Args:
        chromosome: chromosome of the position; converted with str() before lookup.
        position: exact position to look up.
        ordered_gene_dict: dictionary with gene data: per chromosome, a mapping
            of (start, end) location to gene name.
    Return:
        List of names of all genes whose location contains the position.
        Both ends of a gene are inclusive, so a position that falls exactly on
        the first or last base of a gene is matched. An unknown chromosome
        yields an empty list.
    """
    # A chromosome missing from the annotation simply has no known genes.
    genes_on_chromosome = ordered_gene_dict.get(str(chromosome), {})

    return [
        gene_name
        for (start, end), gene_name in genes_on_chromosome.items()
        if start <= position <= end
    ]

In [0]:
# Directory that holds the patient genome CSV files, and the file names to process.
patient_data_path = "/dbfs/FileStore/tables"
patient_list = ["Child_1_Genome.csv", "Child_2_Genome.csv", "Child_3_Genome.csv", "Mother_Genome.csv", "Father_Genome.csv"]

#### For every patient in patient_list add new column to csv with gene_name for every snp and save to csv. Skipped if modified csv already exist in patient_data_path.

In [0]:
for patient in patient_list:
    patient_file = join(patient_data_path, patient)
    new_patient_file_name = join(patient_data_path, patient.split(".")[0]+"_geneNames.csv")

    # An annotated file from an earlier run already exists: nothing to do.
    if exists(new_patient_file_name):
        continue

    patient_df = pd.read_csv(patient_file)
    tqdm.pandas()  # registers progress_apply on pandas objects
    print("saving: ", new_patient_file_name)

    def _genes_for_row(row):
        # Look up every gene covering this SNP's chromosome/position.
        return find_gene_name_from_location(row['chromosome'], float(row['position']), ordered_gene_dict)

    patient_df['gene_name'] = patient_df.progress_apply(_genes_for_row, axis=1)

    patient_df.to_csv(new_patient_file_name, index=False)

# Processing disease data

In [0]:
# Tab-separated table of gene-disease associations (one row per gene/disease pair).
disease_path = "/dbfs/FileStore/tables/curated_gene_disease_associations.tsv"
disease_df = pd.read_csv(disease_path, sep='\t')

#### Encode gene name to numeric representation and add as new column

In [0]:
# Integer code per distinct gene symbol (pandas category codes).
# NOTE(review): this column does not appear to be read anywhere else in the notebook - confirm it is still needed.
disease_df["geneSymbol_encoded"] = disease_df["geneSymbol"].astype('category').cat.codes

In [0]:
disease_df

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source,geneSymbol_encoded
0,1,A1BG,0.700,0.538,C0019209,Hepatomegaly,phenotype,C23;C06,Finding,0.30,1.000,2017.0,2017.0,1,0,CTD_human,0
1,1,A1BG,0.700,0.538,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.30,1.000,2015.0,2015.0,1,0,CTD_human,0
2,2,A2M,0.529,0.769,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.50,0.769,1998.0,2018.0,3,0,CTD_human,1
3,2,A2M,0.529,0.769,C0007102,Malignant tumor of colon,disease,C06;C04,Neoplastic Process,0.31,1.000,2004.0,2019.0,1,0,CTD_human,1
4,2,A2M,0.529,0.769,C0009375,Colonic Neoplasms,group,C06;C04,Neoplastic Process,0.30,1.000,2004.0,2004.0,1,0,CTD_human,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84033,109580095,HBB-LCR,0.743,0.115,C0002875,Cooley's anemia,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human,3605
84034,109580095,HBB-LCR,0.743,0.115,C0005283,beta Thalassemia,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human,3605
84035,109580095,HBB-LCR,0.743,0.115,C0019025,Hemoglobin F Disease,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human,3605
84036,109580095,HBB-LCR,0.743,0.115,C0085578,Thalassemia Minor,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human,3605


### Group genes for each disease

In [0]:
# One row per disease; every other column becomes the list of its values for that disease.
disease_grouped_df = disease_df.groupby('diseaseName', as_index=False).agg(list)
# Keep only diseases associated with more than 400 gene entries.
genes_per_disease = disease_grouped_df['geneSymbol'].apply(len)
disease_grouped_df_filtered = disease_grouped_df[genes_per_disease > 400]
# Names of the diseases that survive the filter.
unique_disease_filtered = disease_grouped_df_filtered['diseaseName'].unique()

### One hot encode genes

In [0]:
def create_word_embedding_disease(disease_df, add_weight=False, disease_names=None):
    """
    Parse disease_df to create one gene vector per disease.
    Args:
        disease_df: Dataframe with disease and gene information. Must contain
            the columns "geneSymbol", "diseaseName" and "NofSnps".
        add_weight: Bool flag. If False, each gene column holds the number of
            rows linking that gene to the disease; if True, each row is first
            multiplied by its "NofSnps" value, so the column holds the summed
            SNP counts.
        disease_names: Optional collection of disease names to keep. Defaults
            to the notebook-level ``unique_disease_filtered``.
    Return:
        Dataframe with one row per kept disease (sorted by name): a
        "diseaseName" column followed by one column per gene symbol found in
        disease_df. Both modes return exactly the same columns.
    """
    if disease_names is None:
        # Backward-compatible default: the filter computed earlier in the notebook.
        disease_names = unique_disease_filtered

    # Encode against the full table so every gene symbol gets a column,
    # even if it only occurs in diseases that are filtered out below.
    gene_vectors = pd.get_dummies(disease_df["geneSymbol"])

    keep_rows = disease_df["diseaseName"].isin(disease_names)
    gene_vectors = gene_vectors[keep_rows]

    if add_weight:
        # Weight each (still ungrouped) association row by its SNP count.
        gene_vectors = gene_vectors.multiply(disease_df.loc[keep_rows, "NofSnps"], axis="index")

    # Only gene columns are aggregated. The SNP count is deliberately NOT
    # carried along: it used to be summed into an extra "NofSnp" column in the
    # unweighted mode, where it was then mistaken for a gene feature.
    gene_vectors = gene_vectors.assign(diseaseName=disease_df.loc[keep_rows, "diseaseName"])
    return gene_vectors.groupby("diseaseName", as_index=False).sum()

#### Creating vectors with embedded genes for every disease. 1 if gene is responsible for a disease, 0 if it is not. Weighted version calculates also nr of snp: for every gene has nr_of_snp if gene is responsible for a disease, 0 if it is not.

In [0]:
# Unweighted per-disease gene vectors; these feed the model training below.
embedded_genes = create_word_embedding_disease(disease_df, False)
# SNP-count-weighted variant.
# NOTE(review): not referenced by the later cells visible in this notebook - confirm whether it is still needed.
embedded_genes_weight = create_word_embedding_disease(disease_df, True)

#### A list of gene names that are left after filtering out gene dataframe

In [0]:
# Every column except "diseaseName" is treated as a gene feature from here on.
# NOTE(review): any non-gene column still present in embedded_genes would be picked up as a gene too.
available_genes = embedded_genes.columns
available_genes = available_genes.drop(['diseaseName'])

#### Preparing data for later use in model training

In [0]:
# y: disease names (one label per row); X: the matching feature matrix,
# i.e. every column of embedded_genes except "diseaseName", in column order.
y = embedded_genes.diseaseName.tolist()
X = embedded_genes.loc[:, embedded_genes.columns != 'diseaseName'].to_numpy()

# Processing patient data

In [0]:
def do_embedding_patient_dict(patient_data, gene_data, add_weight=False):
    """
    Encode a patient's mutated genes against the list of available genes.
    Args:
        patient_data: Dictionary with SNP genes and the number of times they occurred.
        gene_data: Iterable with the available genes; defines the keys and their order.
        add_weight: Bool flag; if True the occurrence count is used instead of 1.
    Return:
        Dictionary with one entry per available gene: 0 when the patient has no
        SNP in that gene, otherwise 1 (or the count when add_weight is True).
    """
    encoded = {}
    for gene in gene_data:
        if gene not in patient_data:
            encoded[gene] = 0
            continue
        encoded[gene] = patient_data[gene] if add_weight else 1
    return encoded

def flatten(t):
    """
    Flatten a list of lists of strings into one list, skipping empty strings
    and stripping single quotes from both ends of every item.
    """
    cleaned = []
    for sublist in t:
        for item in sublist:
            if item:
                cleaned.append(item.strip("''"))
    return cleaned

def count_genes(gene_data):
    """
    Count how often each gene name occurs in gene_data.
    Returns a plain dict with gene names as keys and their number of occurrences
    (i.e. the number of SNPs found in that gene) as values.
    """
    occurrences = collections.Counter()
    occurrences.update(gene_data)
    return dict(occurrences)

#### List of files to process with patient data - csv files with added gene names to every snp

In [0]:
# From here on patient_list refers to the annotated "*_geneNames.csv" files.
# NOTE: this rebinds the name patient_list, so cells that expect the raw file names
# should not be re-run after this one without restoring it.
patient_data_path = "/dbfs/FileStore/tables"
patient_list = ["Child_1_Genome_geneNames.csv", "Child_2_Genome_geneNames.csv", "Child_3_Genome_geneNames.csv", "Mother_Genome_geneNames.csv", "Father_Genome_geneNames.csv"]

#### Processing genes for every patient. Returns a dictionary with every patient file name and a list of genes that have mutations. Based on available genes from gene data csv. If patient has a mutation in a gene that we don't know the name of it is not considered.

In [0]:
# Maps each patient file name to its encoded gene vector (ordered like available_genes).
patient_data_for_prediction = {}
for patient in patient_list:
    named_patient_df = pd.read_csv(join(patient_data_path, patient))

    # Each gene_name cell is the text form of a list; strip the brackets and split it up.
    gene_lists = []
    for cell in named_patient_df.gene_name.tolist():
        gene_lists.append(cell.strip("[]").split(", "))

    counted_patient_gene_data = count_genes(flatten(gene_lists))

    encoded_patient_data = do_embedding_patient_dict(counted_patient_gene_data, available_genes)
    patient_data_for_prediction[patient] = list(encoded_patient_data.values())

# Random Forest

#### Creating and training Random Forest

In [0]:
# A fixed random_state makes the forest - and therefore the predicted
# probabilities below - reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=1000, random_state=42)
clf.fit(X, y)

#### Predicting probability of disease for every patient from patient_list

In [0]:
# For every patient: disease name -> predicted probability in percent,
# ordered from the most to the least probable disease.
predicted_disease = {}
for patient_path in patient_list:
    encoded_patient_data_values = patient_data_for_prediction[patient_path]
    y_pred_prob = clf.predict_proba([encoded_patient_data_values])[0]

    # Column i of predict_proba belongs to clf.classes_[i]; label the
    # probabilities through classes_ rather than through the training list y,
    # which only lines up when y happens to be sorted and free of duplicates.
    ranked = np.argsort(y_pred_prob)[::-1]
    predicted_disease[patient_path] = {
        str(clf.classes_[idx]): round(y_pred_prob[idx] * 100, 2) for idx in ranked
    }

In [0]:
predicted_disease

# Vector similarity

#### Calculating cosine vector similarity for every disease and every patient from patient_list

In [0]:
# For every patient: disease name -> cosine similarity (in percent) between the
# patient's gene vector and the disease's gene vector.
predicted_disease = {}
for patient_path in patient_list:
    encoded_patient_data_values = patient_data_for_prediction[patient_path]
    predicted_disease[patient_path] = {
        disease: round((1 - spatial.distance.cosine(encoded_patient_data_values, disease_vector)) * 100, 2)
        for disease, disease_vector in zip(y, X)
    }

In [0]:
predicted_disease

# Product matcher

In [0]:

num = "001000000010000000100000001000000010000000100000001000000010000000100000001000000010000000100" \
      "0000010000000100011001000110010001100100011001000110010001100100011001000110010001100100011001000" \
      "110010001100100011001000110010001100100011001000110000101000100000001000000010000000100000001000000" \
      "01000000010000000100000001000000010000000100000001000110010001100100011001000000010000000100000001000" \
      "1100100011001000110010001100100011001000110010001100100011001000110010001100100011001000110010001100" \
      "1000110010001100001010001000000010000000100000001000000010000000100000001000000010000000100000001" \
      "000000010000000100011001000110010001100100000001000000010000000100011001000110010001100100011001000" \
      "1100100011001000110010001100100011001000110010001100100011001000110010001100100011000010100010000000100" \
      "0000010000000100000001000000010000000100000001000000010000000100000001000000010001100100011001000110010001" \
      "10010001100100011001000110010001100100011001000110010001100100011001000110010001100100011001000" \
      "1100100011001000110010001100100011001000110000101000100000001000000010000000100000001000000010000" \
      "000100000001000000010000000100000001000000010000000100000001000000010000000100000001000000010000000" \
      "1000000010000000100000001000110010001100100011001000110010001100100011001000110010001100100011001000110" \
      "01000110000101000100000001000000010000000100000001000110010001100100011001000110010001100100011001000110010" \
      "001100100011001000110010001100100011001000110010001100100011001000110010001100100011001000110" \
      "0100011001000110010001100100011001000110010001100100011001000110010001100100000001011000010110000" \
      "1011000010110000101100001011000000101000100000001000000010001100100011001000110010001100100011" \
      "001000110010001100100011001000110010001100100011001000110010001100100011001000110010001100" \
      "100011001000110010001100100011001000110010001100100011001000110010001100100011001000110" \
      "0100011001000110010001100100000001011000010110000101100001011000010110000101100001011000010110" \
      "0001011100000101000100000001000110010001100100011001000110010001100100011001000110010001100100011001" \
      "0001100100011001000110010001100100011001000110010001100100011001000110010001100100011001000110010001100100" \
      "01100100011001000110010001100100011001000110010001100100011001000110010000000101100001011000010110000101" \
      "1000010110000101100001011000010110000101100001011100000101000100011001000110010001100100011001000110010" \
      "0011001000110010001100100011001000110010001100100011001000110010001100100011001000110010001100100011001" \
      "0001100100011001000110010001100100011001000110010001100100011001000110010001100100011001000110010001100" \
      "100011001000000010110000101100001011000010110000101100001011000010110000101100001011000010110000001010" \
      "001000110010001100100011001000110010001100100011001000110010001100100011001000110010001100100011001000" \
      "1100100011001000110010001100100011001000110010001100100011001000110010001100100011001000110010001100100" \
      "011001000110010001100100011001000110010001100100000001000000010110000101100001011000010110000101100001011" \
      "000010110000101100001011000010110000001010001000110010001100100011001000110010001100100011001000110010001" \
      "1001000110010001100100011001000110010001100100011001000000010000000100000001000000010000000100000001000" \
      "000010000000100000001000000010000000100000001000000010000000100000001000000010000000101100001011000010110" \
      "00010110000101100001011000010110000101100001011000010110000101100001011000000101000100011001000110010001100" \
      "10001100100011001000110010001100100011001000110010001100100011001000000010000000101110001011000010110000101" \
      "1000010110000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000" \
      "10110000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000001" \
      "01000100011001000110010001100100011001000110010001100100011001000110010001100100011001000000010000000101100" \
      "00101100001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000101100" \
      "001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000" \
      "0101100001011000010110000101100000010100010001100100011001000110010001100100011001000110010001100100011" \
      "00100011001000110010000000101100001011000010110000101100001011000010110000101100001011000010110000101" \
      "100001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000101100001" \
      "011000010110000101100001011000010110000101100001011000010110000101100001011100000101000100000001000110010" \
      "0011001000110010001100100011001000110010001100100011001000110010000000101100001011000010110000101100001011" \
      "0000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000101100" \
      "0010110000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000010111000" \
      "001010001000000010000000100000001000110010001100100011001000110010001100100011001000110010000000101100001011" \
      "00001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000" \
      "01011000010110000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000000" \
      "101000100000001000000010000000100000001000000010000000100000001000000010000000100000001000000010110000101100" \
      "00101100001011000010110000101100001011000010110000101100001011000010110000001010001000000010000000100000" \
      "0010000000100000001000000010000000100000001000000010000000100000001011000010110000101100001011000010110" \
      "00010110000101100001011000010110000101100001011000010110000101100001011000010110000101100001011000010110" \
      "0001011000010110000101100000010100010000000100000001000000010000000100000001000000010000000100000001000000" \
      "010000000100000001011000010110000101100001011000010110000101100001011000010110000101100001011000010110000" \
      "1011000010110000101100001011000010000000100000001000000010110000101100001011000000101000100000001000000010000" \
      "00010000000100000001000000010000000100000001000000010000000100000001011100010110000101100001011000010110000101" \
      "1000010110000101100001011000010110000101100001011000010110000101100001011000010000000100000001000000010" \
      "11000010110000101100000010100010000000100000001000000010000000100000001000000010000000100000001000000" \
      "010000000100000001000000010000000101100001011000010110000101100001011000010110000101100001011000010110000" \
      "1011000010110000101100001011000010110000101100001011000010110000101100"
def f(x):
    """Return the character whose code point is given by the binary digit string x."""
    # int(x, 2) parses the digits directly; no need to eval() generated source.
    return chr(int(x, 2))


# Decode num eight bits at a time. Indexing by offset avoids re-slicing (and
# thereby consuming) num, so the string is still intact after the cell has run.
st = "".join(f(num[offset:offset + 8]) for offset in range(0, len(num), 8))
print(st)


#### Creating random products and disorder report template

In [0]:
from faker import Faker
import numpy as np
import pandas as pd
import random
import json
from bson.json_util import dumps
from pymongo import MongoClient


#### patient report

In [0]:
# Faker instance used to generate the mock patient data.
fake = Faker()

# Gender codes that create_rows_faker draws from with random.choice().
gender = "M", "F"

def create_rows_faker(num=1):
    """
    Create medical report templates filled with random patient personal data.
    Args:
        num: number of patient rows to generate.
    Return:
        List of dicts with the keys "surname", "gender", "firstname", "age"
        and "data badania" (examination date). The first name is drawn to
        match the generated gender.
    """
    rows = []
    for _ in range(num):
        # Draw the gender first so the first name can actually depend on it
        # (the previous check compared the literal "gender" with "M", which is
        # never true, so every patient got a female first name).
        patient_gender = str(random.choice(gender))
        if patient_gender == "M":
            first_name = fake.first_name_male()
        else:
            first_name = fake.first_name_female()

        rows.append({
            "surname": fake.last_name(),
            "gender": patient_gender,
            "firstname": first_name,
            "age": fake.pyint(min_value=18, max_value=77, step=1),
            "data badania": fake.date_time(),
        })
    return rows

# One mock patient report; id is a 1-based row number.
df1 = pd.DataFrame(create_rows_faker(1))
df1["id"] = df1.index + 1
# Fix the column order: later cells read the report rows by position.
df1 = df1[['id','data badania','firstname','surname', 'age', 'gender' ]]
display(df1)

id,data badania,firstname,surname,age,gender
1,1997-12-09T21:24:26.000+0000,Monica,Patton,47,F


In [0]:
# Attach the predicted diseases to the medical report template, keeping only
# the more serious ones: diseases whose predicted value is over 15 %.
inside2 = predicted_disease['Child_3_Genome_geneNames.csv']
newDict = {}
for key, value in inside2.items():
    if value > 15:
        newDict[key] = value
# Every report row gets the same filtered dict in its 'dis' column.
df1['dis'] = pd.Series([newDict] * len(df1), index=df1.index)
print('Medical report dis:')
list(df1.dis)


#### pro medical products

In [0]:
from faker import Faker
import numpy as np
import pandas as pd
import random
fake = Faker()

# Product types ("typ") the mock products are drawn from.
typ=  "Cream", "Oil", "Lotion"
# "choroby" (Polish: diseases): disease names a mock product can be assigned to.
choroby = ('Intellectual Disability', 'Liver Cirrhosis, Experimental', 'Mammary Carcinoma, Human', 'Mammary Neoplasms, Human', 'Prostatic Neoplasms', 
           'Chemical and Drug Induced Liver Injury', 'Chemically-Induced Liver Toxicity', 'Drug-Induced Liver Disease', 'Hepatitis, Drug-Induced', 'Hepatitis, Toxic', 'Breast Carcinoma')


def create_rows_faker(num=1):
    """
    Create mock product data; every product is a cure for two diseases drawn from `choroby`.
    Args:
        num: number of products to generate.
    Return:
        List of dicts with the keys "nazwa" (name), "typ" (type, a one-element
        list), "ocena" (rating), "kupionyx" (purchase count), "dis" (two
        diseases) and "desc" (description).
    """
    # NOTE: this definition shares its name with the patient generator defined
    # earlier in the notebook and replaces it once this cell has run.
    products = []
    for _ in range(num):
        product = {}
        product["nazwa"] = "Product sample " + fake.word()
        product["typ"] = random.sample(typ, 1)
        product["ocena"] = fake.pyfloat(left_digits=None, right_digits=None, positive=False, min_value=5, max_value=10)
        product["kupionyx"] = fake.pyint(min_value=66, max_value=1000, step=1)
        product["dis"] = random.sample(choroby, 2)
        product["desc"] = "Sample product describsion: " + fake.sentence(4)
        products.append(product)
    return products

# Ten mock products with the rating rounded to two decimals.
df2 = pd.DataFrame(create_rows_faker(10))
df2['ocena'] = round(df2['ocena'],2)
# 'typ' holds one-element lists, so exploding it unwraps the value.
# NOTE(review): 'nazwa' already holds plain strings, so its explode looks like a no-op - confirm.
df2 = df2.explode('nazwa')
df2 = df2.explode('typ')
# NOTE(review): 'id' is added here but not selected on the next line, so it is dropped again.
df2["id"] = df2.index + 1
df2 = df2[[ 'nazwa','desc','typ', 'ocena', 'kupionyx','dis' ]]

display(df2)

nazwa,desc,typ,ocena,kupionyx,dis
Product sample take,Sample product describsion: Rock wear figure air.,Lotion,5.2,761,"List(Drug-Induced Liver Disease, Prostatic Neoplasms)"
Product sample kind,Sample product describsion: Soldier material people.,Oil,5.0,534,"List(Drug-Induced Liver Disease, Mammary Neoplasms, Human)"
Product sample to,Sample product describsion: Current include wear sister bar.,Lotion,6.5,825,"List(Drug-Induced Liver Disease, Breast Carcinoma)"
Product sample feel,Sample product describsion: Billion yes dream leader.,Cream,7.31,432,"List(Hepatitis, Toxic, Hepatitis, Drug-Induced)"
Product sample especially,Sample product describsion: Professor company within.,Cream,5.74,285,"List(Hepatitis, Drug-Induced, Liver Cirrhosis, Experimental)"
Product sample notice,Sample product describsion: Standard event her.,Oil,9.13,156,"List(Chemical and Drug Induced Liver Injury, Prostatic Neoplasms)"
Product sample pressure,Sample product describsion: Stuff three weight.,Oil,9.32,865,"List(Liver Cirrhosis, Experimental, Hepatitis, Drug-Induced)"
Product sample budget,Sample product describsion: Increase make reveal sort.,Oil,7.6,762,"List(Drug-Induced Liver Disease, Mammary Neoplasms, Human)"
Product sample shoulder,Sample product describsion: Indeed method sit local.,Cream,9.11,804,"List(Liver Cirrhosis, Experimental, Drug-Induced Liver Disease)"
Product sample before,Sample product describsion: Statement front evening.,Cream,6.4,316,"List(Drug-Induced Liver Disease, Intellectual Disability)"


### Adding a weighted-rating recommendation score to the product list

In [0]:

# Products:
# C is the mean rating ('ocena') across all products.
C = df2['ocena'].mean()


# m is the minimum number of purchases ('kupionyx') required to be listed.
# The product list is short, so the quantile is set to 0 (the minimum): every product qualifies.
m = df2['kupionyx'].quantile(0)

# Products that meet the threshold (a copy, so df2 itself is not modified here).
q_product = df2.copy().loc[df2['kupionyx'] >= m]

# Weighted rating: blends a product's own rating with the overall mean rating,
# trusting the product's own rating more the more often it was purchased.
def weighted_rating(x, m=m, C=C):
    """
    Calculates the weighted score of a product
    Args:
        x: product row with 'kupionyx' (number of purchases - mocked data)
           and 'ocena' (product review rating - mocked data)
        m: purchase threshold used as the weight of the mean rating
        C: mean rating across all products
    Return:
        weighted score
    """
    purchases = x['kupionyx']
    rating = x['ocena']
    own_weight = purchases / (purchases + m)
    mean_weight = m / (m + purchases)
    return (own_weight * rating) + (mean_weight * C)


# Add the weighted rating of every product as the 'score' column.
q_product['score'] = q_product.apply(weighted_rating, axis=1)

# Sort the products by score (best first) and fix the column order.
# Later cells read product rows by position, so this order matters.
q_product = q_product.sort_values('score', ascending=False)
df2 = q_product[[ 'nazwa','typ','desc', 'ocena', 'kupionyx','dis','score' ]]

print('Product list:')
display(df2)


nazwa,typ,desc,ocena,kupionyx,dis,score
Product sample pressure,Oil,Sample product describsion: Stuff three weight.,9.32,865,"List(Liver Cirrhosis, Experimental, Hepatitis, Drug-Induced)",8.985539666993144
Product sample shoulder,Cream,Sample product describsion: Indeed method sit local.,9.11,804,"List(Liver Cirrhosis, Experimental, Drug-Induced Liver Disease)",8.7884125
Product sample notice,Oil,Sample product describsion: Standard event her.,9.13,156,"List(Chemical and Drug Induced Liver Injury, Prostatic Neoplasms)",8.130500000000001
Product sample budget,Oil,Sample product describsion: Increase make reveal sort.,7.6,762,"List(Drug-Induced Liver Disease, Mammary Neoplasms, Human)",7.520300653594771
Product sample feel,Cream,Sample product describsion: Billion yes dream leader.,7.31,432,"List(Hepatitis, Toxic, Hepatitis, Drug-Induced)",7.262510204081633
Product sample before,Cream,Sample product describsion: Statement front evening.,6.4,316,"List(Drug-Induced Liver Disease, Intellectual Disability)",6.641601694915254
Product sample to,Lotion,Sample product describsion: Current include wear sister bar.,6.5,825,"List(Drug-Induced Liver Disease, Breast Carcinoma)",6.60034250764526
Product sample especially,Cream,Sample product describsion: Professor company within.,5.74,285,"List(Hepatitis, Drug-Induced, Liver Cirrhosis, Experimental)",6.232054421768709
Product sample take,Lotion,Sample product describsion: Rock wear figure air.,5.2,761,"List(Drug-Induced Liver Disease, Prostatic Neoplasms)",5.528501635768811
Product sample kind,Oil,Sample product describsion: Soldier material people.,5.0,534,"List(Drug-Induced Liver Disease, Mammary Neoplasms, Human)",5.481791304347826


### Matching the product list with the medical report

In [0]:
# Patient reports as plain lists, one per row of df1; values are in df1's column order.
diagnosis = df1.values.tolist()

# Products as plain lists, one per row of df2; values are in df2's column order.
meds = df2.values.tolist()

In [0]:

def compare_scores(ncures, disorders):
    """
    Calculates the final result as the sum of cure score times disorder score
    Args:
        ncures: dict mapping a disease name to the accumulated cure score of a product set
        disorders: dict mapping a disease name to the patient's probability of occurrence
    Return:
        final score: the sum, over all of the patient's disorders, of
        cure score * disorder score; a missing or falsy score counts as 0
    """
    return sum(
        (ncures.get(disorder) or 0) * (disorders[disorder] or 0)
        for disorder in disorders
    )

In [0]:

def check_score(random_meds, disorders):
    """
    Calculates the result of treatment for a whole set of products. Per disease the
    weighted scores of all products in the set that cure it are added up, e.g.
    drug No. 1 weight score for Parkinson's * 10 + drug No. 2 weight score for Parkinson's * 10....
    Args:
        random_meds: set of meds - product rows; position 5 holds the diseases the
            product supports the treatment of, position 6 its weighted rating
        disorders: patient disorders with their probability of occurrence
    Return:
        final score as a result of treatment
    """
    score_per_disease = {}
    for med in random_meds:
        contribution = med[6] * 10
        for disease in med[5]:
            score_per_disease[disease] = (score_per_disease.get(disease) or 0) + contribution
    return compare_scores(score_per_disease, disorders)

In [0]:
  
def run_search(disorders, meds, no_products, iterations=10000):
    """
    Random search for the product set that best matches the patient's disorders.
    Each iteration draws no_products distinct products from meds and scores the
    set with check_score; the best-scoring set is kept.
    Args:
        disorders: patient disorders with their probability of occurrence
        meds: list of product rows to choose from
        no_products: number of distinct products per candidate set
        iterations: number of random sets to try (default 10,000)
    Return:
        dict {'score': best score, 'meds': products of the best set}. When no
        set scores above 0 the result is {'score': 0, 'meds': []}, so the
        'meds' key can always be read.
    Raises:
        ValueError: if no_products is larger than the number of available products.
    """
    if no_products > len(meds):
        # Drawing distinct products would never finish in this case.
        raise ValueError("no_products cannot exceed the number of available meds")

    best_meds = {'score': 0, 'meds': []}
    for _ in range(iterations):
        # random.sample draws distinct elements without retry loops.
        random_meds = random.sample(meds, no_products)
        final_score = check_score(random_meds, disorders)
        if final_score > best_meds['score']:
            best_meds = {'score': final_score, 'meds': random_meds}
    return best_meds


In [0]:
# Show every patient disorders report together with the recommended products
# that match it best, printed as a medical report.
for diag in diagnosis:
    disorders = diag[6]
    print("Name: " + str(diag[2]) + " " + str(diag[3]))
    for _label, _column in (('Date: ', 1), ('Age: ', 4), ('Gender: ', 5)):
        print(_label + str(diag[_column]))

    print("Disorders: " + str(disorders))
    best_meds = run_search(disorders, meds, 4)
    print(" ")
    print("Recomended meds products for disorders:")
    for med in best_meds['meds']:
        print(med)
        

### Visualizing

#### packing report variables as dict

In [0]:
# Pack the report of the last printed patient (diag) and its four matched
# products (best_meds) into one dict.
# The dict is built directly instead of going through eval() over ~30
# module-level variables; in particular the names `gender` and `typ` are no
# longer rebound, so the Faker generators that read them keep working.

# patient disorders report
d = {
    'name': "Name: " + str(diag[2]) + " " + str(diag[3]),
    'date': 'Date: ' + str(diag[1]),
    'age': 'Age: ' + str(diag[4]),
    'gender': 'Gender: ' + str(diag[5]),
    'disorder': str(diag[6]),
}

# matched products 1-4: keys carry no suffix for the first product and the
# suffixes 1, 2, 3 for the others (the raw rating key is rate0, rate10, rate20, rate30).
for _rank in range(4):
    _med = best_meds['meds'][_rank]
    _suffix = '' if _rank == 0 else str(_rank)
    d['pro' + _suffix] = _med[0]
    d['rate' + _suffix] = 'Rate: ' + str(_med[3])
    d['rate' + _suffix + '0'] = _med[3]
    # Only the first product stores desc/dis as strings; kept as-is for consumers of the stored report.
    d['desc' + _suffix] = str(_med[2]) if _rank == 0 else _med[2]
    d['dis' + _suffix] = str(_med[5]) if _rank == 0 else _med[5]
    d['typ' + _suffix] = _med[1]

# all products list
d['args'] = df2.values.tolist()



#### Connecting to the Azure MongoDB cluster, inserting the dict of report variables as a document in a collection, and dumping the collection as collection.json to DBFS storage

In [0]:

def get_database():
    """
    Connect to MongoDB with pymongo's MongoClient and return the 'mongotest' database.
    The connection string is read from the MONGODB_URI environment variable so
    that credentials are never stored in the notebook.
    Result:
        The 'mongotest' database handle
    Raises:
        RuntimeError: if MONGODB_URI is not set.
    """
    import os

    # SECURITY: the connection string (including user and password) used to be
    # hard-coded here. Those credentials must be considered leaked and rotated.
    con_str = os.environ.get("MONGODB_URI")
    if not con_str:
        raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string.")
    client = MongoClient(con_str)
    return client['mongotest']

get_database()


def insert_sentences(sentences):
    """
    Insert the variables dict as one document into the 'mongotest3' collection,
    then dump every document of that collection to /dbfs/collection.json.
    Args:
        sentences: dict with the report variables to store
    """
    mycol = get_database()["mongotest3"]
    mycol.insert_one(sentences)

    # bson's dumps() serialises the Mongo-specific types; the round trip through
    # json.loads/json.dump writes the result as a regular JSON file.
    with open('/dbfs/collection.json', 'w') as file:
        json.dump(json.loads(dumps(mycol.find({}))), file)

insert_sentences(d)

#### from mongodb collection, variables can be use for other cloud services 
https://inter-medical.herokuapp.com/

#### dbfs storage collection.json dump check

In [0]:
%fs
ls

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,1650530220000
dbfs:/collection.json,collection.json,41212,1652180993000
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
