In [1]:
import os
import sys
import pandas as pd
# Directory added to the import path so the `database` and `preprocessing`
# imports below can be resolved (presumably they live there — confirm).
# Set the MDC_DB_SRC environment variable to point at another checkout; the
# original hard-coded location is kept as the fallback so existing setups
# behave exactly as before.
DB_SRC_DIR = os.environ.get(
    "MDC_DB_SRC",
    "/Users/leonardomuntaner/Documents/GitHub/MDCv1/src/db",
)
# Re-running this cell in the same kernel must not pile up duplicate entries.
if DB_SRC_DIR not in sys.path:
    sys.path.append(DB_SRC_DIR)

from database import DatabaseConnection
from preprocessing import preprocess_drug_details_1

In [2]:
# Load the 'drug_details_1' data through the project's DatabaseConnection.
# No preprocessing happens in this cell: preprocess_drug_details_1 is imported
# above but is not called here.
# NOTE(review): raw_df is used as a pandas DataFrame by the cells below
# (.head(), .copy(), .apply()) — confirm load_drug_details always returns one.
db_connection = DatabaseConnection()
raw_df = db_connection.load_drug_details('drug_details_1')

In [3]:
# Preview the first rows of the freshly loaded frame to check its columns.
raw_df.head()

Unnamed: 0,drug_id,drug_name,generic_name_1,generic_name_2,generic_name_3,generic_name_4,generic_name_5,side_effect_1,side_effect_2,side_effect_3,side_effect_4,side_effect_5,therapeutic_type,administration_method,habit_forming,usecase,data_source
0,35020,babygest,nexgest,sugest,pogest,kreate,cygest,hot flushes,,,,,gynaecological,capsule,no,hormone replacement therapy,www.kaggle.com/datasets/shudhanshusingh/250k-m...
1,6548,asteclav,penciclav,themiclav,moxikind-cv,moxiforce-cv,novamox,vomiting,nausea,diarrhea,,,anti infectives,tablet,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...
2,32009,bilin,themiclav,amoxyclav,moxikind-cv,moxiforce-cv,fightox,vomiting,nausea,diarrhea,,,anti infectives,tablet,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...
3,48184,clopizest,anticlot,clopikind,platloc,clopicard,clopivas,bleeding,,,,,blood related,tablet,no,prevention of heart attack and stroke,www.kaggle.com/datasets/shudhanshusingh/250k-m...
4,54712,capadex,gudcef,zipod,brotacef,oxipod,monocef-o,rash,nausea,diarrhea,,,anti infectives,tablet,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...


In [None]:
# Work on a copy: the cells below add 'generic_names' and 'side_effects'
# columns to processed_df, and raw_df must stay unmodified because later
# cells still read from it.
processed_df = raw_df.copy()

In [None]:
# Collapse generic_name_1 .. generic_name_5 into a single comma-separated
# 'generic_names' column.
generic_name_cols = [f'generic_name_{i}' for i in range(1, 6)]


def _join_generic_names(row):
    """Return the row's generic names joined by ', ', skipping NaN and 'NA' entries."""
    kept = []
    for col in generic_name_cols:
        if pd.notna(row[col]) and row[col] != 'NA':
            kept.append(row[col])
    return ', '.join(kept)


processed_df['generic_names'] = processed_df.apply(_join_generic_names, axis=1)

In [None]:
# Inspect the first rows to check the newly added 'generic_names' column.
processed_df.head()

In [None]:
# Collapse side_effect_1 .. side_effect_5 into a single comma-separated
# 'side_effects' column.
side_effect_cols = [f'side_effect_{i}' for i in range(1, 6)]


def _join_side_effects(row):
    """Return the row's side effects joined by ', ', skipping NaN and 'NA' entries."""
    kept = []
    for col in side_effect_cols:
        if pd.notna(row[col]) and row[col] != 'NA':
            kept.append(row[col])
    return ', '.join(kept)


processed_df['side_effects'] = processed_df.apply(_join_side_effects, axis=1)

In [None]:
# Look at a larger sample (20 rows) to check the combined 'side_effects' column.
processed_df.head(20)

In [None]:
# Columns that may hold a generic name for a drug.
generic_columns = [f'generic_name_{i}' for i in range(1, 6)]


def has_valid_generic(row):
    """Tell whether a row carries at least one usable generic name.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by every
            name in ``generic_columns``.

    Returns:
        True if any generic-name field is neither missing (per ``pd.notna``)
        nor the literal placeholder ``'NA'``; False otherwise.
    """
    for col in generic_columns:
        if pd.notna(row[col]) and row[col] != 'NA':
            # One usable name is enough; stop scanning the remaining columns.
            return True
    return False

# Evaluate has_valid_generic once per row (axis=1); the result holds one
# True/False flag per drug, aligned with raw_df's index.
drugs_with_generics = raw_df.apply(has_valid_generic, axis=1)

In [None]:
# Display the per-row flags computed by has_valid_generic.
drugs_with_generics

In [None]:
# Share of rows flagged as having at least one generic name, as a percentage:
# summing the True/False flags counts the True entries.
# NOTE(review): len(raw_df) is 0 for an empty frame, so the division is
# undefined in that case; the value is also never displayed in this notebook.
generic_percentage = (drugs_with_generics.sum() / len(raw_df)) * 100

In [5]:
# Keep only the rows whose 'usecase' is one of the selected use cases.
# NOTE(review): use_case_filter is every distinct value of the column, so the
# selection below is expected to keep all rows — replace it with an explicit
# list of use cases to filter for real.
use_case_filter = raw_df['usecase'].unique()

# Filter first and copy last, so filtered_df is an explicit, independent copy
# rather than the direct result of an indexing operation: columns can be
# added to it later without SettingWithCopyWarning and without touching raw_df.
filtered_df = raw_df[raw_df['usecase'].isin(use_case_filter)].copy()

In [7]:
# Preview up to 30 rows of the filtered frame.
filtered_df.head(30)

Unnamed: 0,drug_id,drug_name,generic_name_1,generic_name_2,generic_name_3,generic_name_4,generic_name_5,side_effect_1,side_effect_2,side_effect_3,side_effect_4,side_effect_5,therapeutic_type,administration_method,habit_forming,usecase,data_source
0,35020,babygest,nexgest,sugest,pogest,kreate,cygest,hot flushes,,,,,gynaecological,capsule,no,hormone replacement therapy,www.kaggle.com/datasets/shudhanshusingh/250k-m...
1,6548,asteclav,penciclav,themiclav,moxikind-cv,moxiforce-cv,novamox,vomiting,nausea,diarrhea,,,anti infectives,tablet,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...
2,32009,bilin,themiclav,amoxyclav,moxikind-cv,moxiforce-cv,fightox,vomiting,nausea,diarrhea,,,anti infectives,tablet,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...
3,48184,clopizest,anticlot,clopikind,platloc,clopicard,clopivas,bleeding,,,,,blood related,tablet,no,prevention of heart attack and stroke,www.kaggle.com/datasets/shudhanshusingh/250k-m...
4,54712,capadex,gudcef,zipod,brotacef,oxipod,monocef-o,rash,nausea,diarrhea,,,anti infectives,tablet,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...
5,63467,clofirst,anticlot,stayhappi,clopikind,platloc,clopivas,bleeding,,,,,blood related,tablet,no,prevention of heart attack and stroke,www.kaggle.com/datasets/shudhanshusingh/250k-m...
6,75477,daxiclav,penciclav,moxikind-cv,moxiforce-cv,fightox,novamox,vomiting,nausea,diarrhea,,,anti infectives,tablet,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...
7,81583,dumox,megox,amoxyclav,moxilium,lactoclaav,indclav,vomiting,nausea,diarrhea,,,anti infectives,injection,no,treatment of bacterial infections,www.kaggle.com/datasets/shudhanshusingh/250k-m...
8,82256,eva,totalax,lacsyp,gutclear,lacrelax,cremahep,cramps,abdominal distension,flatulence,,,gastro intestinal,syrup,no,constipation,www.kaggle.com/datasets/shudhanshusingh/250k-m...
9,83764,emidas,bestflow,equitrix,,,,nausea,upset stomach,indigestion,weight gain,rash,neuro cns,tablet,no,vertigo,www.kaggle.com/datasets/shudhanshusingh/250k-m...


In [13]:
# Side Effects Bar Chart — data preparation.
# Reshape to long form: one row per drug, with the melted column's name in
# 'side_effect_column' and its value in 'side_effect'.
# NOTE(review): only side_effect_1 is melted, so side_effect_2..side_effect_5
# do not contribute to anything derived from this frame — confirm intended.
melt_options = {
    'id_vars': ['drug_name'],
    'value_vars': ['side_effect_1'],
    'var_name': 'side_effect_column',
    'value_name': 'side_effect',
}
side_effect_data = filtered_df.melt(**melt_options)

In [14]:
# Preview up to 15 rows of the long-form side-effect frame.
side_effect_data.head(15)

Unnamed: 0,drug_name,side_effect_column,side_effect
0,babygest,side_effect_1,hot flushes
1,asteclav,side_effect_1,vomiting
2,bilin,side_effect_1,vomiting
3,clopizest,side_effect_1,bleeding
4,capadex,side_effect_1,rash
5,clofirst,side_effect_1,bleeding
6,daxiclav,side_effect_1,vomiting
7,dumox,side_effect_1,vomiting
8,eva,side_effect_1,cramps
9,emidas,side_effect_1,nausea


In [15]:
# Ten most frequent side effects.
# Rows with the 'NA' placeholder are removed first; dropna() then discards any
# row that still has a missing value in ANY column (drug_name included), and
# the remaining 'side_effect' values are tallied in descending order.
side_effect_counts = (
    side_effect_data
    .loc[side_effect_data['side_effect'].ne('NA')]
    .dropna()
    .loc[:, 'side_effect']
    .value_counts()
    .head(10)
)

In [16]:
# Display the top-10 side-effect tally computed in the previous cell.
side_effect_counts

side_effect
nausea                                    65884
diarrhea                                  23871
headache                                  16163
vomiting                                  14674
rash                                      10978
dizziness                                  9143
hypoglycemia (low blood glucose level)     7125
sleepiness                                 6752
constipation                               5808
abdominal pain                             5445
Name: count, dtype: int64