# Aggregating EDA for the whole universe

In [1]:
import pandas as pd
import re
# Show every row when a DataFrame is displayed (turn off pandas' row truncation)
pd.set_option('display.max_rows', None)

In [4]:
# Load the monthly percent-change data and parse its 'YYYY_MM' dates
data = pd.read_csv('../2 Processed_Data/drugs_pct_changes_monthly.csv')
data.Date = pd.to_datetime(data.Date, format='%Y_%m')

# Keep only NDCs present in both the first (Aug 2014) and the last (Jul 2020) month.
# Note: only these two end points are checked, not every month in between.
last_period = data[(data.Year == 2020) & (data.Month == 7)].NDC.unique()
first_period = data[(data.Year == 2014) & (data.Month == 8)].NDC.unique()
selected_NDC = list(set(last_period).intersection(first_period))
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the raw frame (avoids SettingWithCopyWarning)
data = data[data.NDC.isin(selected_NDC)].copy()

# NDCs without a specified LOE date get a placeholder date in the future (May 2024)
data.loc[data['Estimated LOE Date'] == 'Unspecified', 'Estimated LOE Date'] = 'May-24'

# Drop NDCs whose LOE date falls before the last period (Jul 2020)
data['Estimated LOE Date'] = pd.to_datetime(data['Estimated LOE Date'], format='%b-%y')
# .copy() again: later cells add columns to `data`
data = data[data['Estimated LOE Date'] >= pd.to_datetime('2020-07-01')].copy()

# Load the year-over-year percent-change data and parse its 'YYYY_MM' dates
yy_data = pd.read_csv('../1 Data Preprocessing/Year Over Year/df_yearoveryear_class.csv')
yy_data.Date = pd.to_datetime(yy_data.Date, format='%Y_%m')

In [7]:
# Count of distinct NDCs in each month
nb_drugs = (
    data.groupby('Date')['NDC']
    .nunique()
    .rename('# drugs')
    .rename_axis('date')
    .reset_index()
)

# Year-over-year percent change in Sales weighted WAC (disabled: the line below
# references `drug_class`, which is not defined in the visible cells)
# yy_change = yy_data[yy_data['Major Class'] == drug_class][['Date', 'Class_wght_pct_change_y/y']].reset_index(drop=True).rename(columns={'Date': 'date', 'Class_wght_pct_change_y/y': 'Y/Y Percent change'})

# Sum of the 'Changed' column in each month, i.e. number of price changes
nb_price_change = (
    data.groupby('Date')['Changed']
    .sum()
    .rename('# price changes')
    .rename_axis('date')
    .reset_index()
)

# Helper function
def weigthed_average(data, quantity, weights):
    """Return the weighted average of column `quantity`, weighted by column `weights`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame (or groupby sub-frame) holding both columns.
    quantity : str
        Name of the column to average.
    weights : str
        Name of the column holding the weights.

    Returns
    -------
    float
        sum(quantity * weights) / sum(weights), or NaN when the weights sum to zero.
    """
    total_weight = data[weights].sum()
    # pandas/NumPy division by zero does not raise ZeroDivisionError (it yields
    # NaN or +/-inf with a RuntimeWarning), so the zero-weight case is tested
    # explicitly and always reported as a scalar NaN.
    if total_weight == 0:
        return float('nan')
    return (data[quantity] * data[weights]).sum() / total_weight
    
# Sales weighted WAC per month, with Sales = WAC * TRx.
# .assign returns a new frame, so no column is written into a possibly
# filtered slice (avoids SettingWithCopyWarning) and re-running is harmless.
data = data.assign(Sales=data['WAC'] * data['TRx'])
sales_wac = (
    data.groupby('Date')
    .apply(weigthed_average, 'WAC', 'Sales')
    .to_frame(name='Sales weighted WAC')
    .reset_index()
    .rename(columns={'Date': 'date'})
)

# TRx weighted WAC per month
trx_wac = (
    data.groupby('Date')
    .apply(weigthed_average, 'WAC', 'TRx')
    .to_frame(name='TRx weighted WAC')
    .reset_index()
    .rename(columns={'Date': 'date'})
)

# Merge all monthly aggregates on their shared 'date' key
# deliverable = nb_drugs.merge(yy_change, how='outer', on='date')
deliverable = (
    nb_drugs
    .merge(nb_price_change, how='outer', on='date')
    .merge(sales_wac, how='outer', on='date')
    .merge(trx_wac, how='outer', on='date')
)

# Split the date into Year / Month columns and drop the original column
deliverable['Year'] = deliverable.date.dt.year
deliverable['Month'] = deliverable.date.dt.month
deliverable = deliverable.drop('date', axis=1)

# Share of drugs that had a price change in the month
deliverable['Percent # price changes'] = deliverable['# price changes'] / deliverable['# drugs']

# Remove year 2014; .copy() so the Date column added below is written to an
# independent frame rather than to a filtered slice
deliverable = deliverable[deliverable.Year != 2014].copy()

# Rebuild a first-of-month Date column from the numeric Year / Month values
# (no string round trip, so nothing depends on date-format inference)
deliverable['Date'] = pd.to_datetime(
    pd.DataFrame({'year': deliverable.Year, 'month': deliverable.Month, 'day': 1})
)

# Final column order
# deliverable = deliverable[['Class', 'Date', 'Year', 'Month', 'Y/Y Percent change', '# drugs','# price changes', 'Percent # price changes',  'TRx weighted WAC', 'Sales weighted WAC']]
deliverable = deliverable[['Date', 'Year', 'Month', '# drugs','# price changes', 'Percent # price changes',  'TRx weighted WAC', 'Sales weighted WAC']]



In [8]:
# Display the aggregated monthly table
deliverable

Unnamed: 0,Date,Year,Month,# drugs,# price changes,Percent # price changes,TRx weighted WAC,Sales weighted WAC
5,2015-01-01,2015,1,908,474,0.522026,72.572725,4123.458879
6,2015-02-01,2015,2,908,65,0.071586,76.180482,4193.029493
7,2015-03-01,2015,3,908,61,0.067181,78.280391,4264.5546
8,2015-04-01,2015,4,908,54,0.059471,80.390102,4221.435068
9,2015-05-01,2015,5,908,62,0.068282,77.655018,4376.901533
10,2015-06-01,2015,6,908,145,0.159692,86.211126,4636.732754
11,2015-07-01,2015,7,908,137,0.150881,83.1217,4460.515755
12,2015-08-01,2015,8,908,61,0.067181,83.984784,4337.764502
13,2015-09-01,2015,9,908,49,0.053965,90.75513,4239.285345
14,2015-10-01,2015,10,908,79,0.087004,92.733855,3897.53704


In [23]:
# `drug_class` is not defined in any visible cell of this notebook, so the former
# f-string file name raised NameError on a fresh kernel. This notebook aggregates
# the whole universe, so the table is written to a fixed file name instead.
deliverable.to_csv('deliverable_universe.csv', index=False)