In [50]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import powerlaw
plt.style.use('bmh')
%matplotlib inline

`powerlaw` is a Python package for fitting power-law distributions to data.

In [51]:
#!pip install powerlaw

In [52]:
print(plt.style.available)

['seaborn-dark', 'seaborn-darkgrid', 'seaborn-ticks', 'fivethirtyeight', 'seaborn-whitegrid', 'classic', '_classic_test', 'fast', 'seaborn-talk', 'seaborn-dark-palette', 'seaborn-bright', 'seaborn-pastel', 'grayscale', 'seaborn-notebook', 'ggplot', 'seaborn-colorblind', 'seaborn-muted', 'seaborn', 'Solarize_Light2', 'seaborn-paper', 'bmh', 'tableau-colorblind10', 'seaborn-white', 'dark_background', 'seaborn-poster', 'seaborn-deep']


In [53]:
!pwd

/Users/mgambhir/OneDrive - SEEK/code/notebooks/RoleTitleSampling


In [54]:
!ls

README.md                 RoleTitlesAnalysis1.ipynb RoleTitlesAnalysis2.ipynb


Below we load the two data files investigated here into pandas DataFrames.

The first, 'ANZ_RT_norm.xlsx', was sent to me by Kate a few weeks ago. She had analysed a data file covering a whole year; this file is the resulting ranked frequency distribution of the normalised role titles.

The second is 'roletitle12-12-2019.csv' which was downloaded directly from the normalised structured data s3 bucket on 12/12/2019. 

In [55]:
# Directory that holds the two input files. In the saved run this cell raised
# FileNotFoundError because the files were not in the working directory, so
# point DATA_DIR at wherever they were downloaded.
DATA_DIR = Path(".")
rankedPath = DATA_DIR / "ANZ_RT_norm.xlsx"
rawPath = DATA_DIR / "roletitle12-12-2019.csv"

# Fail early, naming every missing file, instead of stopping at the first read.
missingFiles = [str(p) for p in (rankedPath, rawPath) if not p.exists()]
if missingFiles:
    raise FileNotFoundError(
        "Input data not found: " + ", ".join(missingFiles)
        + ". Copy the files into DATA_DIR or edit DATA_DIR in this cell."
    )

# roles: ranked frequency table of normalised role titles (Excel workbook)
# roles1: raw extract downloaded on 12/12/2019 (CSV)
roles = pd.read_excel(rankedPath)
roles1 = pd.read_csv(rawPath)

FileNotFoundError: [Errno 2] No such file or directory: 'ANZ_RT_norm.xlsx'

In [None]:
print(roles.columns)

Take a quick look at the top of the frequency distribution dataframe

In [None]:
roles.head()

In [None]:
# Natural log of the per-title count, added as a new column for the log plots below.
roles['log_count']=np.log(roles['Count of jobId'])

In [None]:
# Natural log of the 'top' column, which is used as the rank in the plots below.
# NOTE(review): presumably 'top' is a 1-based rank (log(0) would be -inf) - confirm.
roles['log_rank']=np.log(roles['top'])

In [None]:
roles.head()

## First, look at the data
Below: Log-log plot of frequency vs. rank (i.e. log frequency vs. log rank)

In [None]:
roles.plot('log_rank','log_count',kind='line')

Below: semi-log plot of frequency vs. rank (i.e. log frequency vs. rank)

In [None]:
roles.plot('top','log_count',kind='line')

In [None]:
!pwd

In [None]:
!ls

In [None]:
roles1.columns

Find the number of unique role titles in the normalised RT column

In [None]:
# Array of the distinct values found in the normalised role-title column.
uniqueRoles1=roles1['normalisedLongestMatch'].unique()

In [None]:
# Number of distinct normalised role titles. len() works on the array directly,
# so there is no need to copy it into a Python list first.
len(uniqueRoles1)

Build a pandas Series (despite the name `df2`) holding each unique role title and its occurrence count in the dataset

In [None]:
# value_counts() returns a Series indexed by title (not a DataFrame, despite the df2 name).
df2=roles1['normalisedLongestMatch'].value_counts()

In [None]:
df2.head()

In [None]:
roles1.shape

## Second, fit power law function to the data
Below, we look at the log-log plot of frequency vs. rank for cumulative slices of the data (5%, 10%, ... 100%, i.e. steps of 1/`sampleNum`), showing how the plot evolves with the amount of data. We are investigating whether the fitted power-law distribution converges in the limit of an extremely large data set and, if so, what the limiting value of the exponent might be.

In [None]:
import decimal

# Number of cumulative slices the dataset is cut into.
sampleNum = 20

# Exact cumulative fractions 1/sampleNum, 2/sampleNum, ..., 1. Decimal keeps
# them free of binary floating-point rounding.
z = [
    decimal.Decimal(step) / decimal.Decimal(sampleNum)
    for step in range(1, sampleNum + 1)
]

In [None]:
# Total number of rows in roles1; used to turn each fraction in z into a row count.
numRows=roles1.shape[0]

In [None]:
# Fit a power law to cumulative slices of the data (the first z[0], z[1], ... of
# the rows) and record how the fitted exponent evolves with the amount of data.
alphaList = []  # fit.alpha (fitted exponent) for each slice
sigmaList = []  # fit.sigma (powerlaw's standard error on alpha) for each slice

stepPct = 100 / len(z)  # size of one cumulative step, in percent

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 10))
ax1.set(
    title=f"Rank-frequency (log-log), cumulative {stepPct:g}% slices",
    xlabel="log(rank)",
    ylabel="log(count)",
)
ax2.set_title("PDF of title counts: data (cyan), power-law fit (blue dashed)")

for frac in z:
    rowNum = int(frac * numRows)
    print(rowNum)

    # Occurrences of each title within the first rowNum rows; value_counts()
    # returns them sorted by descending count, i.e. already in rank order.
    rankedRoles = roles1['normalisedLongestMatch'].iloc[:rowNum].value_counts()

    # Ranks are 1-based: with a 0-based rank, log(0) = -inf and the most
    # frequent title dropped out of the log-log curve.
    dfRankedRoles = pd.DataFrame({
        'rank': np.arange(1, len(rankedRoles) + 1),
        'count': rankedRoles.values,
    })
    x = dfRankedRoles['rank']
    y = dfRankedRoles['count']
    yLog = np.log(y)
    ax1.plot(np.log(x), yLog)

    # Draw on ax2 explicitly rather than relying on it being the current axes.
    fit = powerlaw.Fit(y)
    fit.plot_pdf(color='c', linewidth=2, ax=ax2)
    fit.power_law.plot_pdf(color='b', linestyle='--', ax=ax2)

    alphaList.append(fit.alpha)
    sigmaList.append(fit.sigma)

Below: plot of the value of alpha (the exponent of the power law expression) for the fitted power law using increasing amounts of data

In [None]:
# Fitted exponent per cumulative slice; the x-axis is the slice index (0 = smallest slice).
plt.plot(alphaList)

In [None]:
# PDF of the title counts using linear bins. `y` is left over from the final
# iteration of the loop above, i.e. the counts for the full dataset.
powerlaw.plot_pdf(y, linear_bins=True)

In [None]:
# Overlay four curves on one set of axes, using `fit` from the final loop
# iteration above (the full dataset):
#   blue solid  - empirical PDF        blue dashed - fitted power-law PDF
#   red solid   - empirical CCDF       red dashed  - fitted power-law CCDF
# plot_pdf returns the axes it drew on, which the remaining calls reuse via ax=.
fig2 = fit.plot_pdf(color='b', linewidth=2)
fit.power_law.plot_pdf(color='b',linestyle='--',ax=fig2)
fit.plot_ccdf(color='r', linewidth=2, ax=fig2)
fit.power_law.plot_ccdf(color='r', linestyle='--', ax=fig2)

In [None]:
# Set up repeated random sampling: each sample is `limit` distinct row
# positions drawn from the full dataset (the fitting loop in the next cell
# draws a fresh sample on every iteration).
sampleSize = 20000
Start = 0
Stop = numRows  # range() excludes its upper bound, so this makes every row eligible
limit = min(sampleSize, numRows)  # random.sample raises ValueError if asked for more than the population

random.seed(0)  # fixed seed so the sampled rows, and hence the fits, are reproducible

# Distinct row positions chosen without replacement
randomSample = random.sample(range(Start, Stop), limit)

In [None]:
# Repeat the power-law fit on numSamples independent random samples of `limit`
# rows each, to see how much the fitted exponent varies between samples.
alphaList = []  # fit.alpha for each sample
sigmaList = []  # fit.sigma for each sample
numSamples = 20

for sampleIdx in range(numSamples):  # range(1, numSamples) ran only numSamples - 1 fits
    randomSample = random.sample(range(Start, Stop), limit)

    # randomSample holds row positions, so select positionally with .iloc.
    rankedRoles = roles1['coreTitle'].iloc[randomSample].value_counts()
    dfRankedRoles = pd.DataFrame({
        'rank': np.arange(1, len(rankedRoles) + 1),  # 1-based rank
        'count': rankedRoles.values,
    })
    x = dfRankedRoles['rank']
    y = dfRankedRoles['count']
    yLog = np.log(y)
    plt.plot(x, yLog)  # semi-log view: log(count) against rank

    fit = powerlaw.Fit(y)
    alphaList.append(fit.alpha)
    sigmaList.append(fit.sigma)

plt.xlabel('rank')
plt.ylabel('log(count)');

In [None]:
# Fitted exponent per random sample; the x-axis is the sample index.
plt.plot(alphaList)

In [None]:
# Show the lowest counts from the last random sample (y is sorted by descending
# count). The original `y.tail` lacked the call parentheses and therefore only
# displayed the bound method.
y.tail()

In [None]:
# Keep only the entries of y equal to 1, i.e. titles seen exactly once in the last sample drawn.
ySingle=y[y==1]

In [None]:
ySingle.index

## Fourth, try the capture-recapture method for estimating the size of the Role Title population given our samples

In [None]:
rankedRoles.index

In [None]:
# Toy example for Index.intersection: the two indexes share the values 3 and 4.
idx1, idx2 = pd.Index([1, 2, 3, 4]), pd.Index([3, 4, 5])

In [None]:
# Elements common to both toy indexes.
over=idx1.intersection(idx2)

In [None]:
len(over)

In [None]:
# Cut the dataset into numSamples equal, consecutive windows for the
# capture-recapture estimate: z[k] is the cumulative fraction (k + 1) / numSamples
# that marks the end of window k.
# NOTE: this rebinds numSamples and z, which earlier cells used with other values.
numSamples = 2000
z = [
    decimal.Decimal(k) / decimal.Decimal(numSamples)
    for k in range(1, numSamples + 1)
]

In [None]:
# 'Capture' half of capture-recapture: take the rows up to the cumulative
# fraction z[1] (the second entry of z) as the capture sample.
frac = z[1]
rowNumUpper = int(frac * numRows)
print(rowNumUpper)

# Swap in 'normalisedLongestMatch' here to analyse the normalised titles instead.
rolesSubset1 = roles1['coreTitle'].iloc[:rowNumUpper]
rankedRoles1 = rolesSubset1.value_counts()

# The captured set: titles that occur exactly once in the capture sample.
captureTitles = rankedRoles1[rankedRoles1 == 1].index
lengthCapture = captureTitles.size

In [None]:
# 'Recapture' half of capture-recapture: treat every later window as a
# recapture sample and estimate the title population size from how many of its
# once-only titles were also once-only titles in the capture sample.
overlap = []        # one population-size estimate per usable recapture window
skippedWindows = 0  # windows sharing no title with the capture set (estimate undefined)

# Start from the end of the capture sample so that re-running this cell gives
# the same result (rowNumUpper is overwritten inside the loop).
rowNumUpper = len(rolesSubset1)

for frac in z[2:]:
    # Slices are half-open, so the next window starts exactly where the previous
    # one ended; starting at rowNumUpper + 1 skipped one row per window.
    rowNumLower = rowNumUpper
    rowNumUpper = int(frac * numRows)
    rolesSubset = roles1['coreTitle'].iloc[rowNumLower:rowNumUpper]
    rankedRoles = rolesSubset.value_counts()

    # Titles that occur exactly once in this window
    recaptureTitles = rankedRoles.index[rankedRoles.values == 1]
    capRecap = captureTitles.intersection(recaptureTitles)

    # With no overlap the estimate divides by zero (this also covers an empty
    # recapture set); skip the window and count it so the omission is visible.
    if len(capRecap) == 0:
        skippedWindows += 1
        continue
    fracCapRecap = len(capRecap) / len(recaptureTitles)

    # Lincoln-Petersen form: N ~ (captured titles) / (captured fraction of the recapture set).
    # The correction factor is fixed at 1, as in the original. The experimental
    # version was based on value_counts() of the capture and recapture rows
    # combined; rebuild that with pd.concat([rolesSubset1, rolesSubset]) if it is
    # revived, because Series.append was removed in pandas 2.0.
    correctionFactor = 1
    overlap.append((correctionFactor ** 2) * lengthCapture / fracCapRecap)

print(f"{len(overlap)} estimates, {skippedWindows} windows skipped (no overlap)")
    

In [None]:
# Distribution of the per-window population-size estimates collected in `overlap`.
plt.hist(overlap,bins=50)

In [None]:
# allRolesDf=pd.read_json('roletitle.json')

In [None]:
#i=(list(range(0,row)),list(range(15,20)))

In [None]:
#roles1['normalisedLongestMatch'][i]

In [None]:
# NOTE(review): XeLaTeX is a TeX engine that normally comes from a TeX distribution
# (e.g. TeX Live / MacTeX) rather than from PyPI - confirm this pip package is really
# what is needed (presumably for exporting the notebook to PDF) before relying on it.
!pip install xelatex