In [6]:
import pandas as pd
import scipy.cluster.hierarchy as spc
import datetime

# Clustering Drugs based on Sales correlation

In [12]:
class NdcClass:
    def __init__(self, description_df, sales_df):
        self.description_df = description_df
        self.sales_df = sales_df
def create_ndc_sales_data():
    #read data from csv
    sales_df= pd.read_csv('merge_with_iqvia_2017_01_2020_09_monthly.csv')
    df_ndc_description=sales_df[['NDC','Major Class','Acute/Chronic','Prod Form']].drop_duplicates()
    df_ndc_description.columns=['NDC','Major_Class','Acute_Chronic','Prod_Form']
    df_ndc_description.set_index('NDC',inplace=True)
    #pick columns
    sales_df=sales_df[['NDC','Month','Year','TRx','WAC']]
    #convert date to datetime
    sales_df['Date']=sales_df.apply (lambda row: datetime.datetime(int(row.Year),int(row.Month),1), axis=1)
    #rename columns
    sales_df.columns=['ndc','month','year','trx','wac','date']
    #index dataframe
    ssales_df=sales_df.set_index(['date','ndc'])
    #pick the columns we need
    sales_df=ssales_df[['trx','wac']]
    #compute sales
    sales_df['sales']=sales_df.apply (lambda row: row.trx*row.wac, axis=1)
    #pivot the dataframe to get classes on columns and dates on rows
    sales_df=pd.pivot_table(sales_df, values=['sales'], index=['date'],columns=['ndc'])
    #fill missing data
    sales_df.fillna(method='ffill',inplace=True)
    sales_df.fillna(method='bfill',inplace=True)
    sales_df.fillna(0,inplace=True)
    
    ndcclass=NdcClass(df_ndc_description,sales_df)
    return ndcclass
sales_df_cls=create_ndc_sales_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [34]:
def build_ndc_sales_clusters(sales_df_cls):
    #get sales dataframe
    sales_df=sales_df_cls.sales_df
    #compute correlation
    sales_df_c=sales_df.corr()
    #fill missing data
    sales_df_c.fillna(0,inplace=True)
    #compute distance between correlations
    pdist_sales = spc.distance.pdist(sales_df_c)
    #link data base com their correlation
    linkage_sales = spc.linkage(pdist_sales, method='complete')
    #build clusters indices
    idx_sales = spc.fcluster(linkage_sales, 0.5 * pdist_sales.max(), 'distance')
    #build cluster
    clusters_sales = pd.DataFrame(idx_sales, index =sales_df_c.index, columns =['cluster']) 
    #reset index
    clusters_sales=clusters_sales.reset_index()[['ndc','cluster']]
    #set index
    clusters_sales.set_index('ndc', inplace=True)
    #Add description to cluster
    clusters_sales=pd.merge(clusters_sales,sales_df_cls.description_df, left_index=True, right_index=True)
    return clusters_sales
cluster_ndc_sales=build_ndc_sales_clusters(sales_df_cls)
cluster_ndc_sales


# Clustering Drugs' Classes based on Sales correlation

In [35]:
def get_class_data():
    #read class level trx
    df_trx=pd.read_csv('aggregated_trx_class.csv')
    #read class data price
    df_wac=pd.read_csv('aggregated_weighted_WAC_class.csv')
    #create empty sales dataframe
    df_sales = pd.DataFrame()
    #loop throught dates and compute sales
    for x in df_trx:
        if(x!='Date'):
            df_sales[x]=df_trx[x]*df_wac[x]
    #add Date column
    df_sales['Date']=df_wac['Date']
    #convert dates to datetime type
    df_sales['Date']=df_sales.apply (lambda row: datetime.datetime(int(row.Date.split("_")[0]), int(row.Date.split("_")[1]), 28),axis=1)
    #set date as the index
    df_sales.set_index('Date',inplace=True)
    return df_sales


def get_build_class_cluster(df_sales):
    #compute sales correlation
    df_sales_corr=df_sales.corr()
    #fill missing data
    df_sales_corr=df_sales_corr.fillna(0)
    #compute distance between corelation
    pdist_sales_corr = spc.distance.pdist(df_sales_corr)
    #link classes based on these correlations
    linkage_sales_corr = spc.linkage(pdist_sales_corr, method='complete')
    #create indice bases on the distance and correlation
    idx_sales = spc.fcluster(linkage_sales_corr, 0.5 * pdist_sales_corr.max(), 'distance')
    #build clusters
    clusters_sales_class = pd.DataFrame(idx_sales, index =df_sales_corr.index, columns =['cluster'])
    return clusters_sales_class


In [37]:
class_sales=get_class_data()
class_clusters=get_build_class_cluster(class_sales)

In [38]:
class_clusters

Unnamed: 0,cluster
A.C.T.H.,1
A05A1 CHOLERETICS+CHOLEKINETIC,1
ADHD,1
ALL OTHER RESPIRATORY,2
ALL OTHER THERAPEUTICS,3
...,...
THYROID ANTI-THYROID AND IODINE PREPS,1
URINARY INCONTINENCE,2
"VACCINES (PURE, COMB, OTHER)",3
VIRAL HEPATITIS,1
