set the "path" variable to the directory of the downloaded databases.

In [None]:
path = '???'

In [43]:
from Bio import SeqIO
import pandas as pd
import re
import numpy as np

### HMDDv4

In [44]:

def clean_data(text):
    if pd.isna(text):
      return text
    if isinstance(text, str):
      text = text.lower()  # convert to lowercase
      text = re.sub(r'\s+', ' ', text)  # delete additional spaces
      text = text.strip('"')
      text = text.strip()
      return text

def process_data(df):
    df = df.drop(df.index[0]).drop_duplicates().reset_index(drop=True)
    df['Disease'] = df['Disease'].apply(clean_data)
    df['miRNA'] = df['miRNA'].apply(clean_data)

    print(df.head())
    print('DF Length: ', len(df),  
      ' - miRNA Count', len(df.iloc[:, 0].drop_duplicates()), 
      ' - Disease Count', len(df.iloc[:, 1].drop_duplicates()))
    return df

In [45]:

df_hmdd3 = pd.read_csv(f'{path}/HMDD/HMDD3.csv', sep=',', usecols=[0, 1], names=['miRNA', 'Disease'], encoding='latin-1')
df_hmdd3 = process_data(df_hmdd3)

#df_hmdd3.to_csv(path+'HMDD3.csv', index=False)

         miRNA              Disease
0  hsa-mir-9-2  wounds and injuries
1  hsa-mir-9-3  wounds and injuries
2   hsa-mir-21  wounds and injuries
3  hsa-mir-483  wounds and injuries
4   hsa-mir-31  wounds and injuries
DF Length:  14549  - miRNA Count 917  - Disease Count 792


In [46]:
df_hmdd3 = pd.read_csv(f'{path}/HMDD/HMDD3.txt', sep='\t', usecols=[1, 2], names=['miRNA', 'Disease'], encoding='latin-1')
df_hmdd3 = process_data(df_hmdd3)

df_hmdd3.to_csv(path+'HMDD3.csv', index=False)

         miRNA                                 Disease
0  hsa-mir-15a  leukemia, lymphocytic, chronic, b-cell
1   hsa-mir-16  leukemia, lymphocytic, chronic, b-cell
2  hsa-mir-143                         colon neoplasms
3  hsa-mir-145                         colon neoplasms
4  hsa-mir-223  leukemia, lymphocytic, chronic, b-cell
DF Length:  18732  - miRNA Count 1206  - Disease Count 894


In [69]:
df_hmdd4 = pd.read_csv(f'{path}/HMDD/HMDDv4 (The whole dataset of miRNA-disease association data).txt', sep='\t', usecols=[2, 3], names=['miRNA', 'Disease'])
df_hmdd4 = process_data(df_hmdd4)

df_hmdd4.to_csv(path+'HMDD.csv', index=False)

          miRNA               Disease
0    hsa-mir-40       liver cirrhosis
1   hsa-mir-29b  zika virus infection
2  hsa-mir-130b         wound healing
3   hsa-mir-221     stomach neoplasms
4   hsa-mir-497  pancreatic neoplasms
DF Length:  31543  - miRNA Count 1890  - Disease Count 2349


In [48]:
merged_df = pd.merge(df_hmdd3, df_hmdd4, on='Disease', suffixes=('3', '4'))

different_rows = merged_df[merged_df['miRNA3'] != merged_df['miRNA4']]
result = different_rows[['Disease', 'miRNA3', 'miRNA4']]

print(different_rows)

result.to_csv(path +'hmdd_v_diff2.csv', index=False)



              miRNA3                                 Disease        miRNA4
0        hsa-mir-15a  leukemia, lymphocytic, chronic, b-cell  hsa-mir-16-1
1        hsa-mir-15a  leukemia, lymphocytic, chronic, b-cell   hsa-mir-20a
2        hsa-mir-15a  leukemia, lymphocytic, chronic, b-cell  hsa-mir-181c
3        hsa-mir-15a  leukemia, lymphocytic, chronic, b-cell  hsa-mir-125b
4        hsa-mir-15a  leukemia, lymphocytic, chronic, b-cell   hsa-mir-15b
...              ...                                     ...           ...
1465216   hsa-mir-7a                       parkinson disease   hsa-mir-19a
1465217   hsa-mir-7a                       parkinson disease   hsa-mir-27b
1465218   hsa-mir-7a                       parkinson disease   hsa-mir-105
1465219   hsa-mir-7a                       parkinson disease   hsa-mir-599
1465220   hsa-mir-7a                       parkinson disease   hsa-mir-26a

[1461733 rows x 3 columns]


### miR2Disease

In [49]:
df_mir2dis = pd.read_csv(f'{path}/miR2Disease/AllEntries.txt', sep='\t', usecols=[0, 1], names=['miRNA', 'Disease'])
df_mir2dis = process_data(df_mir2dis)

df_mir2dis.to_csv(path+'miR2Disease.csv', index=False)

        miRNA                             Disease
0  hsa-let-7g      hepatocellular carcinoma (hcc)
1  hsa-let-7g                         lung cancer
2  hsa-let-7g  non-small cell lung cancer (nsclc)
3  hsa-let-7g                 ovarian cancer (oc)
4  hsa-let-7g                   colorectal cancer
DF Length:  2437  - miRNA Count 468  - Disease Count 149


### dbDEMC

In [50]:
df_db = pd.read_csv(f'{path}/dbDEMC/miRExpAll.txt', sep='\t', usecols=[0, 1, 4], names=['miRNA', 'Disease',  'miRBaseID'])
df_db['miRBaseID'], df_db['Disease'] = df_db['Disease'], df_db['miRBaseID']
df_db = process_data(df_db)

df_db.to_csv(path+'dbDEMC.csv', index=False)

           miRNA   Disease        miRBaseID
0   hsa-mir-106a  lymphoma  hsa-miR-106a-5p
1     hsa-let-7g  lymphoma    hsa-let-7g-5p
2  hsa-mir-17-5p  lymphoma    hsa-miR-17-5p
3     hsa-let-7f  lymphoma    hsa-let-7f-5p
4     hsa-let-7a  lymphoma    hsa-let-7a-5p
DF Length:  56655  - miRNA Count 4495  - Disease Count 40


### miRCancer

In [52]:
df_mircan = pd.read_csv(f'{path}/miRCancer/miRCancerJune2020.txt', sep='\t', usecols=[0, 1], names=['miRNA', 'Disease'], encoding='latin-1')
df_mircan = process_data(df_mircan)

df_mircan.to_csv(path+'miRCancer.csv', index=False)

        miRNA                 Disease
0  hsa-let-7a  acute myeloid leukemia
1  hsa-let-7a           breast cancer
2  hsa-let-7a          chondrosarcoma
3  hsa-let-7a            colon cancer
4  hsa-let-7a       colorectal cancer
DF Length:  5660  - miRNA Count 1034  - Disease Count 131


In [53]:
df_merged = pd.concat([df_db, df_mircan, df_mir2dis, df_hmdd]).drop_duplicates().reset_index(drop=True)
df_merged = pd.DataFrame(df_merged, columns=['miRNA', 'Disease'])
print(df_merged.head())
print(len(df_merged))

adjacency_matrix = pd.pivot_table(df_merged, 
                                  values=None, 
                                  index='miRNA', 
                                  columns='Disease', 
                                  aggfunc=lambda x: 1, 
                                  fill_value=0)
adjacency_matrix.to_csv(path+'adjacency_matrix.csv', index=False)


           miRNA   Disease
0   hsa-mir-106a  lymphoma
1     hsa-let-7g  lymphoma
2  hsa-mir-17-5p  lymphoma
3     hsa-let-7f  lymphoma
4     hsa-let-7a  lymphoma
82455


### Case study

In [191]:
import pandas as pd

def check_mirna_across_dataframes(mir_list, df1, df2, df3, df4, df5, disease_prefix='breast'):
    result_df = pd.DataFrame(index=mir_list, columns=['DF1', 'DF2', 'DF3', 'DF4', 'DF5'])

    dataframes = [df1, df2, df3, df4, df5]
    
    # Check each miRNA in each DataFrame
    for mir in mir_list:
        print(mir)
        for i, df in enumerate(dataframes, 1):
            if not df[(df['miRNA'] == mir) & (df['Disease'].str.contains(disease_prefix))].empty:
                # Get the matching disease(s)
                matching_diseases = df[(df['miRNA'] == mir) & (df['Disease'].str.contains(disease_prefix))]['Disease'].unique()
                
                # Join multiple diseases with a separator if there's more than one match
                disease_str = ' | '.join(matching_diseases)
                
                # Assign the disease string to the result DataFrame
                result_df.loc[mir, f'DF{i}'] = disease_str
            else:
                result_df.loc[mir, f'DF{i}'] = '?'
    
    # Print the result
    new_column_names = {
        'DF1': 'HMDD4',
        'DF2': 'HMDD3',
        'DF3': 'miR2Disease',
        'DF4': 'dbDEMC',
        'DF5': 'miRCancer'
        }
    result_df = result_df.rename(columns=new_column_names)
    return result_df

filename = f'leukemia_top_50_predictions'
key = 'leukemia'


case_df = pd.read_csv(f'{path}/cases/pred/{filename}.csv')
mir_list = case_df['miRNA'].dropna().tolist()
print(mir_list)
result = check_mirna_across_dataframes(mir_list, df_hmdd4, df_hmdd3, df_mir2dis, df_db, df_mircan, disease_prefix=key)
result.to_csv(f'{path}/cases/true/{key}_other2.csv')
print(result.to_string())

# for disease in case_df.columns:
#     print(f"\nProcessing disease: {disease}")
    
#     mir_list = case_df[disease].dropna().tolist()
    
#     result = check_mirna_across_dataframes(mir_list, df_hmdd4, df_hmdd3, df_mir2dis, df_db, df_mircan, disease_prefix=disease.split()[0])
#     result.to_csv(f'{path}cases/{disease}.csv')
#     # Print the result
#     print(result.to_string())

['hsa-mir-21', 'hsa-mir-1908', 'hsa-mir-126', 'hsa-mir-18a', 'hsa-mir-147b', 'hsa-mir-23b', 'hsa-mir-155', 'hsa-mir-132', 'hsa-mir-335', 'hsa-mir-148b', 'hsa-mir-184', 'hsa-mir-221', 'hsa-mir-195', 'hsa-mir-31', 'hsa-mir-145', 'hsa-mir-193a', 'hsa-mir-483', 'hsa-mir-137', 'hsa-mir-146a', 'hsa-mir-223', 'hsa-mir-328', 'hsa-mir-143', 'hsa-mir-383', 'hsa-mir-92a-2', 'hsa-mir-497', 'hsa-mir-23a', 'hsa-mir-205', 'hsa-mir-29a', 'hsa-mir-142', 'hsa-mir-363', 'hsa-mir-135b', 'hsa-mir-628', 'hsa-mir-152', 'hsa-mir-494', 'hsa-mir-146b', 'hsa-mir-20a', 'hsa-mir-200b', 'hsa-mir-100', 'hsa-mir-196a-2', 'hsa-mir-542', 'hsa-mir-320a', 'hsa-mir-634', 'hsa-mir-15a', 'hsa-mir-485', 'hsa-mir-106b', 'hsa-mir-150', 'hsa-mir-9-1', 'hsa-mir-2861', 'hsa-mir-34b', 'hsa-mir-16-2']
hsa-mir-21
hsa-mir-1908
hsa-mir-126
hsa-mir-18a
hsa-mir-147b
hsa-mir-23b
hsa-mir-155
hsa-mir-132
hsa-mir-335
hsa-mir-148b
hsa-mir-184
hsa-mir-221
hsa-mir-195
hsa-mir-31
hsa-mir-145
hsa-mir-193a
hsa-mir-483
hsa-mir-137
hsa-mir-146a
hsa

### miRBASE

In [55]:

# Initialize lists to store data
mirna_names = []
mirna_ids = []
mirna_sequences = []

# Replace 'your_file.dat' with the path to your downloaded .dat file
with open(f'{path}/miRBase/miRNA.dat', 'r') as handle:
    for record in SeqIO.parse(handle, 'embl'):
            mirna_ids.append(record.id)
            mirna_names.append(record.name)
            mirna_sequences.append(str(record.seq))

# Create a pandas DataFrame
df = pd.DataFrame({
    'ID': mirna_ids,
    'miRNA': mirna_names,
    'Sequence': mirna_sequences
})

# Display the first few rows of the DataFrame
print(df.head())
print(len(df))
# Optionally, save the DataFrame to a CSV file
df.to_csv('miRBase.csv', index=False)


          ID       miRNA                                           Sequence
0  MI0000001   cel-let-7  UACACUGUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAU...
1  MI0000002   cel-lin-4  AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAUU...
2  MI0000003   cel-mir-1  AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAU...
3  MI0000004   cel-mir-2  UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCA...
4  MI0000005  cel-mir-34  CGGACAAUGCUCGAGAGGCAGUGUGGUUAGCUGGUUGCAUAUUUCC...
38589


In [56]:
mature = []
with open(f'{path}/miRBase/mature.fa', 'r') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if record.id.startswith('hsa'):
            mature.append(record.id)
    
df = pd.DataFrame({'miRNA': mature})
print(df.head())
print(len(df))

             miRNA
0    hsa-let-7a-5p
1    hsa-let-7a-3p
2  hsa-let-7a-2-3p
3    hsa-let-7b-5p
4    hsa-let-7b-3p
2656


### MISIM

In [57]:
df = pd.read_csv(f'/Users/macbook/Downloads/similarity/miRNA_name.txt', sep='\t', usecols=[0], names=['miRNA'])
print(df.head())
df.to_csv('MISIM_miRNA.csv', index=False)

df = pd.read_csv('/Users/macbook/Downloads/similarity/similarity.txt', sep='\t')
print(df.head())
df.to_csv('MISIM_sim.csv', index=False)

          miRNA
0    hsa-mir-93
1     hsa-let-7
2    hsa-let-7a
3  hsa-let-7a-1
4  hsa-let-7a-2
        1.0  0.00836525358685  0.0125233401372  0.0133543439123  \
0  0.008365          1.000000         0.476067         0.435902   
1  0.012523          0.476067         1.000000         0.651843   
2  0.013354          0.435902         0.651843         1.000000   
3  0.013472          0.418217         0.639863         0.989002   
4  0.013479          0.417713         0.631654         0.980446   

   0.0134721277588  0.0134785269796  0.0167795070699  0.011490927577  \
0         0.418217         0.417713         0.461889        0.486344   
1         0.639863         0.631654         0.612102        0.580480   
2         0.989002         0.980446         0.686102        0.731127   
3         1.000000         0.991349         0.674790        0.719203   
4         0.991349         1.000000         0.667432        0.721142   

   0.013638931576  0.0124013523877  ...  0.000601931287893  0.005543