# Exploring data
# Version 2

Version 2 pulls data from a standard source and can be used for any bgCAS, although it is 
primarily designed to examine "proprietary" data.

## Step 1: Get data into Colab
Run the cells below to import Open-FF data

In [None]:
# Download the support code used to assemble the data set.
# `!` hands the line to the shell; `&>/dev/null` discards git's output so the cell stays quiet.
!git clone https://github.com/gwallison/colab-support.git &>/dev/null;

# Run the downloaded script in this notebook's namespace so that the names it defines
# become available to the cells below (presumably including get_dataset -- confirm in the script).
%run colab-support/get_dataframe.py

In [None]:
# get_dataset (defined by the support code, not in this notebook) pulls together a set of
# CSV files from a Google storage site and merges them.
#  Result: df is a dataframe with all records (though not ALL fields).
df = get_dataset()

# To see which fields are in df, uncomment the following line:
# df.columns

In [None]:
# Keep only the records for one bgCAS value; .copy() makes df_one independent of df
df_one = df.loc[df['bgCAS'] == 'proprietary'].copy()

## Aggregating to a single value for each disclosure
When a disclosure can contain more than one record of a "chemical" and you want the total, you can aggregate by 'UploadKey', the unique identifier for each disclosure.  

We use 'groupby' to accomplish this.

In [None]:
# One row per disclosure: sum the mass and job-percentage columns over its records
gb1 = (
    df_one
    .groupby('UploadKey', as_index=False)[['calcMass', 'PercentHFJob']]
    .sum()
)
gb1.head()

In [None]:
# Other fields need their own groupby, which is then merged onto the sums.
# These fields are disclosure-wide, so the first value found in each disclosure is used.
gb2 = (
    df_one
    .groupby('UploadKey', as_index=False)[
        ['APINumber', 'date', 'bgOperatorName', 'TotalBaseWaterVolume']
    ]
    .first()
)

# validate='1:1' makes the merge fail if UploadKey is duplicated on either side
agg_df = gb1.merge(gb2, on='UploadKey', how='left', validate='1:1')
agg_df.head()

# agg_df can now serve as the source data for further work


---
# Proprietary area maps

In [None]:
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText

def _proprietary_class_shares(df):
    """Return a year x class table of the percentage of disclosures in each class.

    Each disclosure (UploadKey) is assigned to a class according to the share of
    its valid records whose bgCAS is 'proprietary'.  A record counts as valid
    when its bgCAS is 'proprietary', 'conflictingID', or starts with a digit.

    df must already carry a 'year' column in addition to 'UploadKey', 'bgCAS'
    and a datetime 'date' column.

    Raises ValueError when no disclosure has any valid record.
    """
    is_prop = df.bgCAS == 'proprietary'
    is_conflict = df.bgCAS == 'conflictingID'
    has_cas_number = df.bgCAS.str[0].isin(list('0123456789'))
    is_valid = is_prop | is_conflict | has_cas_number

    # number of proprietary records in each disclosure
    numprop = (df[is_prop].groupby('UploadKey', as_index=False)['bgCAS'].count()
               .rename({'bgCAS': 'numprop'}, axis=1))
    # total number of valid records in each disclosure
    # (this ignores ambiguous and system approach records)
    numvalid = (df[is_valid].groupby('UploadKey', as_index=False)['bgCAS'].count()
                .rename({'bgCAS': 'numvalid'}, axis=1))
    dates = df.groupby('UploadKey', as_index=False)['date'].first()

    mg = dates.merge(numprop, on='UploadKey', how='left')
    mg = mg.merge(numvalid, on='UploadKey', how='left')
    # disclosures missing from a count table have zero such records; fill only
    # the count columns so the datetime column is never touched
    count_cols = ['numprop', 'numvalid']
    mg[count_cols] = mg[count_cols].fillna(0)

    # percent proprietary for each disclosure
    mg['percProp'] = (mg.numprop / mg.numvalid) * 100

    # cut it up into bins; intervals are closed on the left, and the tiny first
    # bin isolates disclosures with exactly zero proprietary records
    mg['propCut'] = pd.cut(mg.percProp, right=False, bins=[0, 0.0001, 10, 25, 50, 101],
                           labels=['no proprietary claims', 'up to 10% proprietary claims',
                                   'between 10 and 25% proprietary claims',
                                   'between 25 and 50% proprietary claims',
                                   'greater than 50% proprietary claims'])
    mg['year'] = mg.date.dt.year

    # disclosures without any valid record cannot be classified
    valid = mg[mg.numvalid > 0]
    if valid.empty:
        raise ValueError('no disclosures with valid CAS records to plot')

    # summarize to year; observed=False keeps every class in every year (with a
    # zero count) so the pivot always has all five columns in a fixed order
    t = valid.groupby(['year', 'propCut'], as_index=False, observed=False)['numvalid'].count()
    # do the by-year calc: share of the year's classified disclosures
    t['PercentProp'] = t.numvalid / t.groupby('year')['numvalid'].transform('sum') * 100

    # pivot to make it easy to plot
    return t.pivot(index='year', columns='propCut', values='PercentProp')


def _disclosure_count_note(df):
    """Return the side-note text listing the number of distinct disclosures per year."""
    per_year = df.groupby('year')['UploadKey'].nunique()
    rows = [f'   {year}: {count:7,} \n' for year, count in per_year.items()]
    return 'Number of disclosures by year:\n\n' + ''.join(rows)


def proprietary_plot(df,plot_title='TEST',minyr=2011,maxyr=2021):
    """Draw a stacked area chart of disclosure-level proprietary-claim classes by year.

    Parameters
    ----------
    df : DataFrame with at least the columns 'date' (datetime), 'bgCAS' and
        'UploadKey'.  It is not modified.
    plot_title : text used as the figure's super-title.
    minyr, maxyr : first and last year (both inclusive) to include; they are
        also used as the x-axis limits.

    The chart shows, for each year, the percentage of disclosures falling in
    each proprietary-percentage class.  A text note listing the number of
    disclosures per year is placed to the right of the axes.

    Raises ValueError when no records fall inside the year range, or when none
    of the disclosures in range has a valid record.
    """
    # filter first, then copy, so only the selected rows are duplicated
    in_range = df.date.dt.year.between(minyr, maxyr)
    df = df[in_range].copy()
    if df.empty:
        raise ValueError(f'no records dated between {minyr} and {maxyr}')
    df['year'] = df.date.dt.year

    piv = _proprietary_class_shares(df)

    ax = piv.plot.area(figsize=(12,7),ylim=(0,100),xlim=(minyr,maxyr),colormap='Reds')
    ax.set_title('Percentage of valid records that are Trade Secret claims at the disclosure level', fontsize=16)
    # reverse the legend so its order matches the top-to-bottom stacking
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], title='Disclosure Proprietary\nPercentage class\n',
              loc='upper left',bbox_to_anchor=(1, 1))
    ax.set_ylabel('Percentage of disclosures', fontsize=16)
    ax.set_xlabel('Year', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.suptitle(f'{plot_title}',fontsize=24)

    # the count note covers every disclosure in the year range, including
    # those that have no valid records
    at2 = AnchoredText(_disclosure_count_note(df),
                       loc='lower left', prop=dict(size=10), frameon=False,
                       bbox_to_anchor=(1., 0.),
                       bbox_transform=ax.transAxes
                       )
    at2.patch.set_boxstyle("square,pad=0.")
    ax.add_artist(at2)
    
# Chart every record in df for the years 2011 through 2022
proprietary_plot(df, plot_title='All data', minyr=2011, maxyr=2022)
