# Sector labelling

Here we use an industrial classifier trained on business website data to predict the sector of GtR projects.

We also label the data with a couple of variables (prediction percentile and prediction tightness) that should help evaluate the quality of the classification.

## 0. Preamble

In [None]:
% run notebook_preamble.ipy

In [None]:
# Put functions and things here

def get_latest_file(date_list,date_format='%d-%m-%Y'):
    '''
    Pick the entry of date_list that carries the most recent date.

    Each entry is expected to start with three underscore-separated date
    fields (for instance 2_5_2019_gtr_labelled.csv). Those fields are joined
    with dashes and parsed with date_format.

    Args:
        date_list: a list of strings with the format date_filename
        date_format: the strptime format applied to the dash-joined date
            fields, defaults to %d-%m-%Y

    Returns:
        The element with the latest date. When several elements share that
        date, the one that appears last in date_list is returned.

    '''

    def parse_date(name):
        #Only the first three underscore-separated fields make up the date
        date_fields = name.split('_')[:3]
        return datetime.datetime.strptime('-'.join(date_fields),date_format)

    #Pair every name with its parsed date, then order the pairs chronologically
    dated_names = [(parse_date(name),name) for name in date_list]
    dated_names.sort(key=lambda pair: pair[0])

    #The last pair holds the most recent date
    return dated_names[-1][1]
                                   

def flatten_list(my_list):
    '''
    Concatenates the items of every iterable in my_list into one list.

    Only one level of nesting is removed.
    '''

    flat = []

    for inner in my_list:
        flat.extend(inner)

    return flat
    


## 1. Load data

In [None]:
# prediction_files = [x for x in os.listdir('../data/processed/') if 'labelled' in x]


# latest_file = get_latest_file(prediction_files)

# latest_file

### Labelled data

Note that we are working on a subset of the GtR data that excludes projects with low-quality ("garbage") abstracts (see `02_jmg_discipline_labelling`).

In [None]:
#Disabled alternative: load whichever labelled file get_latest_file selects (zip-compressed)
#gtr_labelled = pd.read_csv('../data/processed/'+latest_file,compression='zip')

#The labelled GtR data is currently read from a fixed, date-stamped file
gtr_labelled = pd.read_csv('../data/processed/2_5_2019_gtr_labelled.csv')

In [None]:
#Display the column names of the labelled data
gtr_labelled.columns

### Predictions

In [None]:
#Industry predictions for the GtR abstracts. Presumably one row per project and one score column per industry - confirm against the file
sector_predictions = pd.read_csv('../data/processed/gtr_abstracts_industries.csv')

In [None]:
#Display (rows, columns) of the predictions
sector_predictions.shape


In [None]:
#Display (rows, columns) of the labelled data
gtr_labelled.shape

In [None]:
#Both tables must have the same number of rows. This only compares the counts, not the row order or any identifier
assert gtr_labelled.shape[0] == sector_predictions.shape[0]

In [None]:
#We need to make sure we are matching the right labels with the right industry predictions
#NOTE(review): the two tables are joined side by side with no key. This assumes both files list the projects in the same order
#and share the same index (presumably the default one from read_csv) - confirm
gtr_w_industries = pd.concat([gtr_labelled,sector_predictions],axis=1)

In [None]:
#Industry score columns: everything from the third column up to (but excluding) the last one
#NOTE(review): presumably the first two columns and the last one are not industry scores - confirm against the CSV header
industry_names = sector_predictions.columns[2:-1]

In [None]:
#Label each project with the industry column holding its highest prediction value
gtr_w_industries['top_industry'] = gtr_w_industries[industry_names].idxmax(axis=1)

#Number of projects per top industry
gtr_w_industries['top_industry'].value_counts().plot.bar(color='blue',figsize=(10,4))

In [None]:
#Save the labelled data with its industry predictions and top industry.
#today_str is not defined in this notebook (presumably it comes from the preamble).
#NOTE: the output is zip-compressed although the file name ends in .csv
gtr_w_industries.to_csv(f'../data/processed/{today_str}_gtr_with_industry_labels.csv',compression='zip')

## 2. Measures of quality

#### Looking for tight predictions

We are particularly interested in predictions that are 'tight' (i.e. the distribution is highly skewed) and confident (i.e. they have high values).

We do this in a couple of ways:

1. Calculate variance in prediction for each observation

In [None]:
#Variance of the industry predictions for each project.
#Computed in one vectorised call instead of a Python-level apply over every row; ddof=0 keeps the population variance that np.var returns
gtr_w_industries['industry_pred_var'] = gtr_w_industries[industry_names].var(axis=1,ddof=0)

#Top industries ordered from the highest to the lowest mean prediction variance
industry_sorted_variance = gtr_w_industries.groupby('top_industry')['industry_pred_var'].mean().sort_values(ascending=False).index

In [None]:
#One list of prediction variances per top industry, in the order of industry_sorted_variance.
#A single groupby replaces scanning every row of the table once per industry
variance_by_industry = gtr_w_industries.groupby('top_industry')['industry_pred_var']

boxplot_data = [variance_by_industry.get_group(ind).tolist() for ind in industry_sorted_variance]

fig,ax = plt.subplots(figsize=(10,5))

ax.boxplot(boxplot_data)

#Label each box with its industry
ax.set_xticklabels(industry_sorted_variance,rotation=90)

In [None]:
#75th percentile of the prediction variance within each top industry
pred_variance_quartile = gtr_w_industries.groupby('top_industry')['industry_pred_var'].apply(lambda x: np.percentile(x,75))

In [None]:
#A prediction is flagged as tight when its variance is above the 75th percentile of the variances in its own top industry
gtr_w_industries['tight_prediction'] = [x>pred_variance_quartile[sector] for x,sector in zip(gtr_w_industries['industry_pred_var'],gtr_w_industries['top_industry'])]

#### Estimate kurtosis for each prediction

In [None]:
from scipy.stats import kurtosis

In [None]:
#Kurtosis of the industry predictions for each project (scipy defaults).
#Computed over the whole prediction matrix at once (axis=1 is one value per row) instead of a Python-level apply over every row
gtr_w_industries['kurtosis'] = kurtosis(gtr_w_industries[industry_names].values,axis=1)

In [None]:
#Mean kurtosis by top industry, highest first
gtr_w_industries.groupby('top_industry')['kurtosis'].mean().sort_values(ascending=False).plot.bar(figsize=(10,5),color='blue')

In [None]:
#Median (50th percentile) of the kurtosis over all projects
median_kurtosis = np.percentile(gtr_w_industries['kurtosis'],50)

#Flag the projects whose kurtosis is strictly above that median
gtr_w_industries['above_median_kurtosis'] = gtr_w_industries['kurtosis']>median_kurtosis


#### Identify more highly confident predictions in total and by sector

In [None]:
#75th percentile over every industry prediction of every project.
#np.percentile flattens the prediction matrix itself, so no Python-level list of all values is built
pc_75_preds_all = np.percentile(gtr_w_industries[industry_names].values,75)

#Are any of the predictions for a project above the 75 pc for all predictions?
#Compared for the whole table at once instead of a Python-level apply over every row

gtr_w_industries['has_top_pred']= (gtr_w_industries[industry_names]>pc_75_preds_all).any(axis=1)

In [None]:
#75th percentile of the predictions for each industry, calculated over all projects
pc_75_by_sector = {sector: np.percentile(gtr_w_industries[sector],75) for sector in industry_names}

#Bar chart of those thresholds, highest first
pd.DataFrame(pc_75_by_sector,index=[0]).T.sort_values(0,ascending=False).plot.bar(figsize=(10,5),color='blue',legend=False)

In [None]:
#Add one boolean column per industry (<industry>_top_q)
for sector in industry_names:
    #True when the project's prediction for this industry is above that industry's 75th percentile
    gtr_w_industries[sector+'_top_q'] = gtr_w_industries[sector]>pc_75_by_sector[sector]
    

### Remove some sectors

In [None]:
#After some manual checking, we remove the below. They tend to misclassify projects for a variety of reasons potentially linked to noise in the source data
#The names have to match the industry column names exactly, otherwise the sector is silently kept
#(the last entry used to carry a stray closing bracket, so it never matched).
#NOTE(review): 'services_real_state' is left as written in case the column itself is spelt that way - confirm against industry_names

sectors_remove = ['services_consumer_retail','services_education_post_primary','services_travelling','services_real_state','services_administrative',
                 'services_electronics_machinery','primary_fishing','services_utilities']

#Industry columns that remain after the exclusions
industry_selected = [x for x in industry_names if x not in sectors_remove]

#Top industry recalculated over the remaining industries only
gtr_w_industries['top_industry_2'] = gtr_w_industries[industry_selected].idxmax(axis=1)


## 3. Save data

In [None]:
#Save the table with all the variables created above.
#today_str is not defined in this notebook (presumably it comes from the preamble).
#NOTE: the output is zip-compressed although the file name ends in .csv
gtr_w_industries.to_csv(f'../data/processed/{today_str}_gtr_w_industries.csv',compression='zip')

## Appendix: salient word extraction

Here we want to extract salient words from groups of data in order to visually interpret results.

We will create a function that groups the data into aggregated corpora by categories of interest, generates counts and normalises.

It returns a dict with words that can also be visualised. 

In [None]:
import re
import string as st

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
class SalientWords():
    '''
    Class that extracts salient words from clusters of data.

    The texts of all rows that share a category are joined into one document
    per category and cleaned. The documents are then turned into term counts
    and tf-idf weights, from which a per-category summary can be extracted.

    Arguments:
        A dataframe and two strings (the variable to groupby and the variable to use as text)

    Methods (all of them return self so they can be chained, in this order):
        .word_freqs(): word frequencies for all words (takes **kwargs for additional parameters in the count vectorisation)
        .salient(): tfidf and counts by category for the words above a frequency threshold (takes **kwargs for the tfidf transformation)
        .get_summary(): keeps, for each category, the words above percentile thresholds of tfidf and count

    '''

    def __init__(self,df,categories,text):
        '''
        Initialises with key variables

        Args:
            df: dataframe with one row per document
            categories: name of the column to group by
            text: name of the column with the text

        '''

        #This creates the joined corpus (one document per category).
        #Missing texts are dropped and the rest cast to str so that the join cannot fail on NaN
        self.grouped_corpus = df.groupby(categories)[text].apply(lambda x: ' '.join(x.dropna().astype(str)))

        #Remove digits and special characters. Raw strings avoid the invalid '\W' escape warning
        digits = re.compile(r'[0-9]')
        non_word = re.compile(r'\W+')

        self.processed_text = [non_word.sub(' ',digits.sub(' ',doc.lower())) for doc in self.grouped_corpus]

        #This is a dict we will use to store the results later (filled by .salient())
        self.groups = {i:[] for i in self.grouped_corpus.index}

    def _feature_names(self):
        '''
        Vocabulary of the fitted count vectoriser, in column order.

        Uses get_feature_names_out() when the vectoriser has it and falls
        back to get_feature_names() otherwise, so both old and recent
        scikit-learn releases work.

        '''

        vect = self.count_vect

        if hasattr(vect,'get_feature_names_out'):
            return(list(vect.get_feature_names_out()))

        return(vect.get_feature_names())

    def word_freqs(self,**kwargs):
        '''
        Terms frequencies over categories

        Fits a CountVectorizer (created with **kwargs) on the processed text.
        The fitted vectoriser is stored in self.count_vect and the term counts
        (one row per category) in self.token_freqs.

        '''
        #load corpus
        X = self.processed_text

        count_vect = CountVectorizer(**kwargs)

        #Store outputs
        self.count_vect = count_vect
        self.token_freqs = count_vect.fit_transform(X)

        return(self)

    def salient(self,min_threshold=1000,extra_stops=('research','project','new','projects'),**kwargs):
        '''
        Salient terms in the data.

        Needs .word_freqs() to have been run. For each category it stores in
        self.groups a dataframe indexed by word with columns 'tfidf' and 'count'.

        Args:
            min_threshold: only words whose total count over all categories is above this value are kept
            extra_stops: words to leave out even if they pass the threshold
            **kwargs: passed to the TfidfTransformer

        '''

        vocabulary = self._feature_names()

        #Counts with one row per category and one column per word
        word_freqs = pd.DataFrame(self.token_freqs.toarray(),columns=vocabulary)

        word_freqs_total = word_freqs.sum(axis=0)

        #I am interested in any words above the threshold
        my_words = [x for x in word_freqs_total.index[word_freqs_total>min_threshold] if x not in extra_stops]

        #Initialise the tfidf
        tf = TfidfTransformer(**kwargs)

        #The tfidf is fitted on the full vocabulary, the selection of words happens afterwards
        X = tf.fit_transform(self.token_freqs)

        X_selected = pd.DataFrame(X.toarray(),columns=vocabulary)[my_words]

        #Store the results. Row n of both tables belongs to the nth category because
        #the corpus and self.groups were built from the same grouped index
        for n,x in enumerate(self.groups.keys()):

            #Creates the dataframe combining tfs and wfs
            result = pd.concat([X_selected.iloc[n],word_freqs.iloc[n][my_words]],axis=1)

            result.columns = ['tfidf','count']

            self.groups[x] = result

        return(self)

    def get_summary(self,tf_threshold=90,wf_threshold=75):
        '''
        Extracts a summary of the data based on tf and wf thresholds

        Needs .salient() to have been run. The result is stored in
        self.summary, a dict with one dataframe per category sorted by
        descending tfidf.

        Args:
            tf_threshold: percentile of the category's tfidf values that a word has to exceed
            wf_threshold: percentile of the category's counts that a word has to exceed

        '''

        self.summary={i:[] for i in self.groups.keys()}

        for x in self.groups.keys():

            result = self.groups[x]

            #Keep the words above the tfidf percentile
            tf_thres = np.percentile(result['tfidf'],tf_threshold)

            summary = result.loc[result['tfidf']>tf_thres]

            #The count percentile is taken over all the words of the category, not only the ones kept above
            wf_thres = np.percentile(result['count'],wf_threshold)

            summary_2 = summary.loc[summary['count']>wf_thres]

            self.summary[x] = summary_2.sort_values('tfidf',ascending=False)

        return(self)

In [None]:
#Group the project abstracts by their top industry (the one calculated after removing noisy sectors)
sal = SalientWords(gtr_w_industries,categories='top_industry_2',text='abstract')

In [None]:
#Count unigrams and bigrams (English stop words removed, 20000 features at most), keep the terms that appear
#more than 500 times in total, and summarise each industry using the 50th percentile as the count threshold
sal.word_freqs(**{'stop_words':'english','max_features':20000,'ngram_range':(1,2)}).salient(min_threshold=500).get_summary(wf_threshold=50)

In [None]:
def make_wordcloud(term_freqs_df,var,name,ax):
    '''
    This function takes a df generated by the SalientWords class and draws it as a wordcloud

    The words come from the index of term_freqs_df and their weights from
    the column var. The cloud is drawn on ax with the axis hidden and name
    as the title. Nothing is returned.

    '''

    #Weight of each word
    frequencies = dict(zip(term_freqs_df.index,term_freqs_df[var]))

    cloud = wordcloud.WordCloud(background_color="black")
    cloud.generate_from_frequencies(frequencies)

    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(name)

In [None]:
#One wordcloud per category over a two-column grid, filled column by column.
#The grid is sized to the number of categories: a fixed 20-row grid fails with more than
#40 categories and leaves empty framed panels when there are fewer
summary_names = list(sal.summary.keys())

n_cols = 2

#Rows needed to fit every category (rounded up), with at least one row
n_rows = max(1,-(-len(summary_names)//n_cols))

#squeeze=False keeps ax two-dimensional even when there is a single row
fig,ax = plt.subplots(ncols=n_cols,nrows=n_rows,figsize=(10,2*n_rows),squeeze=False)

for n,name in enumerate(summary_names):

    make_wordcloud(sal.summary[name],'tfidf',name,ax=ax[n%n_rows][n//n_rows])

#Hide the panels that were left without a wordcloud
for spare in ax.T.flatten()[len(summary_names):]:

    spare.axis('off')

plt.tight_layout()