# __Clean Patent Data__
This file will clean the patent data in preparation for merging with price data.

The `patent_cleaner` function combines the values of three columns (`trade_name`, `strength`, and `route`) to produce a single 'name' column, stored as `ndc_description_agg`.  

In a later notebook, I'll compare that 'name' column to a similar column in the prices dataset and hopefully be able to accurately combine all this information for further analysis.

In [2]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
# Load the raw patent data; index_col=False keeps every CSV column as a data column
patent_data = pd.read_csv(
    'data/patent_data.csv',
    index_col=False,
)

In [33]:
def patent_cleaner(df):
    """Clean the raw patent DataFrame in place and print a summary of it.

    Steps, in order:
      1. Lower-case every column header.
      2. Split the combined 'df;route' column into 'dosage_form' (the text
         before the first ';') and 'route' (the text after the last ';').
      3. Build 'ndc_description_agg' as "<trade_name> <strength> <route>",
         intended for fuzzy matching against the prices 'ndc description'.
      4. In the four date columns, replace the placeholder
         'Approved Prior to Jan 1, 1982' with 'Dec 31, 1981' and convert the
         column to datetime.

    Parameters
    ----------
    df : pandas.DataFrame
        Patent data containing (in any capitalisation) the columns
        'DF;Route', 'trade_name', 'strength', 'approval_date',
        'exclusivity_date', 'patent_expire_date_text' and 'submission_date'.
        All column labels must be strings.

    Returns
    -------
    None
        ``df`` is modified in place. The return value is whatever
        ``df.info()`` returns, i.e. ``None`` after printing the summary.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    placeholder = 'Approved Prior to Jan 1, 1982'
    # From here on, this date means "approved prior to Jan 1, 1982".
    stand_in = 'Dec 31, 1981'
    date_columns = [
        'approval_date',
        'exclusivity_date',
        'patent_expire_date_text',
        'submission_date',
    ]

    # Normalise the headers first so the rest of the function only has to deal
    # with lower-case names (and does not raise KeyError on 'DF;Route' when the
    # frame's headers have already been lower-cased).
    df.columns = [col.lower() for col in df.columns]

    # Split dosage form and route into separate columns. The combined
    # 'df;route' column is deliberately left in the frame: the previous
    # `df.drop(...).head()` call discarded its result, so the column has always
    # been part of the exported data.
    form_and_route = df['df;route'].str.split(';')
    df['dosage_form'] = form_and_route.str[0]
    df['route'] = form_and_route.str[-1]

    # Aggregate column to match against the prices 'ndc description' column.
    # A missing value in any of the three parts makes the aggregate missing too.
    df['ndc_description_agg'] = df['trade_name'] + " " + df['strength'] + " " + df['route']

    # Assign the replaced Series back instead of using
    # `df[col].replace(..., inplace=True)`: the in-place form mutates a
    # temporary object and leaves `df` untouched under pandas Copy-on-Write,
    # which would let the placeholder text reach `pd.to_datetime`.
    for col in date_columns:
        df[col] = pd.to_datetime(df[col].replace(placeholder, stand_in))

    return df.info()

In [34]:
# Clean patent_data in place; the call prints the DataFrame.info() summary
patent_cleaner(df=patent_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59716 entries, 0 to 59715
Data columns (total 29 columns):
unnamed: 0                 59716 non-null int64
ingredient                 59716 non-null object
df;route                   59716 non-null object
trade_name                 59716 non-null object
applicant                  59716 non-null object
strength                   59648 non-null object
appl_type_x                59716 non-null object
appl_no                    59716 non-null int64
product_no                 59716 non-null int64
te_code                    20659 non-null object
approval_date              59716 non-null datetime64[ns]
rld                        59716 non-null object
rs                         59716 non-null object
type                       59716 non-null object
applicant_full_name        59716 non-null object
appl_type_y                24230 non-null object
patent_no                  24230 non-null object
patent_expire_date_text    24230 non-null datetime64[

In [None]:
# Visualize frequency of nulls in the cleaned patent data.
# `patent_data` is the frame cleaned in place by patent_cleaner above; the
# previously referenced `Cleaned_Patent_Data` is never defined in this notebook,
# so the cell raised NameError on a fresh kernel.
sns.heatmap(patent_data.isnull(), cbar=False);

In [43]:
# Write the cleaned frame to disk (row index included, as per the to_csv default)
patent_data.to_csv(
    'data/Cleaned_Patent_Data.csv',
    index=True,
)