## Importing the Necessary Libraries

In [1]:
import  numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotnine as p9
import scipy
import geopandas as gpd
import geodatasets
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest

In [2]:
# Notebook-wide configuration for the libraries imported above.

# Ignore every warning raised from here on.
warnings.filterwarnings("ignore")

# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

# Publication-ready figure styling. The seaborn theme is applied first so the
# matplotlib settings below override whatever it sets for the same keys.
sns.set_style("whitegrid")

plt.rc("axes", edgecolor="black", linewidth=1.25, labelsize=14, titlesize=16)
plt.rc("grid", linestyle="--", alpha=0.7)
plt.rc("legend", fontsize=8)
plt.rc("xtick", labelsize=12)
plt.rc("ytick", labelsize=12)
plt.rc(
    "font",
    family="sans-serif",
    **{"sans-serif": ["Arial", "Helvetica", "DejaVu Sans"]},
)
plt.rc("lines", linewidth=1, markersize=4)
plt.rc("figure", dpi=300, figsize=[10, 6])
plt.rc("savefig", dpi=300)

## Merging Datasets to use

In [8]:
# Read the sequence metadata table and list the columns it provides.
metadata_path = "accessions/correct_metadata.csv"
metadata_df = pd.read_csv(filepath_or_buffer=metadata_path)
metadata_df.columns

Index(['Unnamed: 0', '#', 'SE id(SA)', 'Accession', 'GI number', 'Version',
       'Se ID', 'Sequence Length', 'GB comment', 'GB create date',
       'GB update date', 'Percent non-ACGT', 'SE id(SSAM)', 'PAT id(SSAM)',
       'Name', 'Locus Name', 'Isolate Name', 'Clone Name', 'Georegion',
       'Country', 'Sampling City', 'Sampling Year', 'Sampling Year Upper',
       'Patient Age', 'Patient Health', 'Organism', 'Subtype', 'Phenotype',
       'Coreceptor', 'Sample Tissue', 'Culture Method', 'Molecule type',
       'Drug Naive', 'Problematic Sequence', 'Viral load', 'CD4 count',
       'CD8 count', 'Days from Infection', 'Days from Seroconversion',
       'Days from first Sample', 'Sequencing method', 'Amplification strategy',
       'Fiebig Stage', 'Anno', 'Days from treatment start',
       'Days from treatment end', 'Vaccine status', 'RIP subtype',
       'Patient Id', 'Patient Sex', 'Risk Factor', 'Infection Year'],
      dtype='object')

In [7]:
# Read the drug-resistance mutation table and preview its first two rows.
resistance_data_path = "../data/csv_output/drug_resistance_mutations.csv"
resistance_df = pd.read_csv(filepath_or_buffer=resistance_data_path)
resistance_df.head(n=2)

Unnamed: 0,Input Sequence,Gene Name,Drug Class,Drug,Drug Full Name,Score,Level,Mutation Text,Mutation Primary Type
0,KC422797,RT,NRTI,ABC,ABC,10.0,2,T215Y,NRTI
1,KC422797,RT,NRTI,AZT,AZT,60.0,5,T215Y,NRTI


In [9]:
# Attach the sampling year, country and subtype to every resistance record so
# that later analyses can be broken down by subtype, mutation type, etc.
# Rows are matched on the sequence accession: "Input Sequence" in the
# resistance table against "Accession" in the metadata table. The left join
# keeps every resistance row; rows without a matching accession receive NaN
# in the added metadata columns.
merged_df = resistance_df.merge(
    metadata_df.loc[:, ["Accession", "Sampling Year", "Country", "Subtype"]],
    how="left",
    left_on="Input Sequence",
    right_on="Accession",
)

In [11]:
# Discard every record whose sampling year is missing.
merged_df = merged_df.dropna(axis="index", how="any", subset=["Sampling Year"])

In [14]:
# Report how many missing (null) entries each column of the merged table has.
# The counts are computed for all columns in one pass, then printed per column.
# The message says "missing values" because the number is the null count,
# not the number of values present in the column.
missing_counts = merged_df.isna().sum()
for column, n_missing in missing_counts.items():
    print(f'{column} contains {n_missing} missing values')

Input Sequence contains 0 values
Gene Name contains 0 values
Drug Class contains 0 values
Drug contains 0 values
Drug Full Name contains 0 values
Score contains 0 values
Level contains 0 values
Mutation Text contains 0 values
Mutation Primary Type contains 0 values
Accession contains 0 values
Sampling Year contains 0 values
Country contains 0 values
Subtype contains 589 values


In [16]:
# Split the merged table into one DataFrame per drug class, keyed by the
# class name; each piece gets a fresh 0-based index.
grouped_dfs = {
    class_name: class_rows.reset_index(drop=True)
    for class_name, class_rows in merged_df.groupby(by="Drug Class", sort=True)
}

In [17]:
# Show the keys of the grouped dictionary (one key per drug class).
grouped_dfs.keys()

dict_keys(['INSTI', 'NNRTI', 'NRTI', 'PI'])

In [18]:
# Bind the DataFrame of each drug class to its own name for direct access.
insti_df, nnrti_df, nrti_df, pi_df = (
    grouped_dfs[class_name] for class_name in ("INSTI", "NNRTI", "NRTI", "PI")
)