In [1]:
import pandas as pd
import numpy as np
import ast
import nltk
import re
from bs4 import BeautifulSoup

In [2]:
class Acc_Sheet():
    '''
__init__ : INPUT: Data_Path, Technology Dictionary path, English Dictionary path
                  4 Column Names-- ID, text, Correct Tags(MANUAL), BERT Tags
                  
clean_text: Can modify this according to requirement
            INPUT: Text
            RETURNS: Cleaned Text
            
ENG_SHEET: INPUT: None
           RETURNS:English Accuracy DataFrame (From BERT)
           
TECH_SHEET: INPUT: None
            RETURNS:Technology Accuracy DataFrame (From String Match)
            
ENG_TECH_SHEET: INPUT: None 
                RETURNS: English & Technology Combines Accuracy DataFrame 
                
There are other in-built functions in this class which is required for the above functions to return,
It is advisable to not to modify them
    '''
    def __init__(self,data_path,tech_path,eng_path,ID,summaries,manually_tagged,bert_tags):
        self.data=pd.read_csv(data_path)
        self.tech_data=pd.read_csv(tech_path)
        self.eng_data=pd.read_csv(eng_path)
        self.data_path=data_path
        self.ID=ID
        self.summaries=summaries
        self.manually_tagged=manually_tagged
        self.bert_tags=bert_tags
        self.data[manually_tagged]=self.data[manually_tagged].apply(self.clean_text)
        self.data[manually_tagged]=self.data[manually_tagged].apply(eval)

        self.data[bert_tags] = self.data[bert_tags].apply(eval)
        self.data[bert_tags] = self.data[bert_tags].apply(self.remove_space)

        col_name_tech=self.tech_data.columns[-1]
        self.tech_data[col_name_tech]=self.tech_data[col_name_tech].astype(str)
        self.tech_data[col_name_tech]=self.tech_data[col_name_tech].apply(self.clean_text)
        self.tech_list = list(set(self.tech_data[col_name_tech]))

        col_name_eng=self.eng_data.columns[-1]
        self.eng_data[col_name_eng]=self.eng_data[col_name_eng].astype(str)
        self.eng_data[col_name_eng]=self.eng_data[col_name_eng].apply(self.clean_text)
        self.eng_list= list(set(self.eng_data[col_name_eng]))
        
    def remove_space(self,string_list):
        k = [" ".join(i.split()) for i in string_list]
        return k

    def clean_text(self,text):
        text = text.lower() # lowercase text
        REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
        BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    #     text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    #     text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
        return text

    def to_list(self,inp):
        return "['"+inp+"']"

    # Pass the matched technology column that we got
    def convert_data_format(self,df,column_name):
        for i in range(0, len(df)):
            if df[column_name][i][0] == '[':
                res = ast.literal_eval(df[column_name][i])
                df[column_name][i] = res
            else:
                x = []
                x.insert(0, df[column_name][i])
                df[column_name][i] = x
        return df

    def get_TP(self,df,manual_tagged_string,op_string):
        tp=[]
        for ind in df.index:
            TP = FP = 0
            for i in range(0, len(df[op_string][ind])):
                if df[op_string][ind][i] in df[manual_tagged_string][ind]:
                    TP = TP + 1
                else:
                    pass
            tp.append(TP)
        return tp

    def get_FP(self,df,manual_tagged_string, op_string):
        fp=[]
        fp_entry=[]
        for ind in df.index:
            main_list = np.setdiff1d(df[op_string][ind],df[manual_tagged_string][ind])
            fp.append(len(main_list))
            fp_entry.append(main_list)
        return fp, fp_entry

    def get_FN(self,df,manual_tagged_string, op_string):
        fn=[]
        for ind in df.index:
            main_list = np.setdiff1d(df[manual_tagged_string][ind], df[op_string][ind])
            if df[manual_tagged_string][ind]=='[]':
                fn.append(0)
            else:
                fn.append(len(main_list))
        return fn

    def get_tech_tagged_data(self,manually_tagged, tech_list):
        keep=[]
        for word in manually_tagged:
            if word in tech_list:
                keep.append(word)
        return keep

    def combine(self,lis1, lis2):
        return list(set(lis1 + lis2))

    def get_precision(self,tp,fp):    
        if tp+fp ==0:
            precision = 0
        else:
            precision = tp/(tp+fp)
        return precision

    def get_recall(self,tp,fn):
        if tp+fn ==0:
            recall = 0
        else:
            recall = tp/(tp+fn)
        return recall

    def get_f_score(self,precision, recall):
        if precision+recall == 0:
            f_score = 0
        else:
            f_score = 2*(precision*recall)/(precision+recall)
        return f_score
    
    def reg_match(self,txt, keyw):
            sep='|'
            temp = sep.join(['\\b' + i+'\\b' for i in keyw])
            matches = re.findall(temp,txt)
            if len(''.join(list(set(matches))))==0:
                return []
            else:
                return list(set(matches))

    def Diff(self,li1, li2): #FP entries
        return list(set(li2)-set(li1))

    def Same(self,lis1,lis2): #TP entries
        return list(set(lis1).intersection(lis2))

    def Common(self,lis1,lis2): #merging same entries
        return list(set(lis1)-(set(lis2)))
    
    def ENG_SHEET(self):
    # (all entries which are in eng dictionary)
        eng_bert_tags=[]
        removed=0
        kept=0
        for i in range(0,len(self.data)):
            eng_bert_1sen=[]
            tags= self.data[self.bert_tags][i]
            for j in tags:
                if j in self.eng_list:
                    kept+=1
                    eng_bert_1sen.append(j)
                else:
                    removed+=1
            eng_bert_tags.append(eng_bert_1sen)
        self.data[bert_tags+"_ENG"]=eng_bert_tags
        ##############################################################################################
        self.data.to_csv("BERT_ENG_ONLY_TEMP_DATA.csv",index=False)
        '''SAME FILE WRITE AND READ'''
        summary = pd.read_csv("BERT_ENG_ONLY_TEMP_DATA.csv")
        ##############################################################################################

        summary = summary.sort_values(by=[self.ID], ascending=False)
        summary.reset_index(inplace = True, drop = True)
        col_name_op = "BERT_Tags_ENG"

        df_cm = summary
        df_cm = self.convert_data_format(df_cm, col_name_op)
        df_cm = self.convert_data_format(df_cm, manually_tagged)

        count_context_technologies = []

        for i in range(0, len(df_cm)):
            count_context_technologies.extend(df_cm[manually_tagged][i])
        d = list(set(count_context_technologies))

        tp = []
        tn = []
        fp = []
        fn = []
        TP = TN = FP = FN = 0
        df_cm[manually_tagged] = df_cm[manually_tagged].fillna('[]')

        df_cm['Eng_Label']=df_cm[manually_tagged].copy()

        tech_tag_list = []

        df_cm['Eng_Label'] = df_cm.apply(
            lambda x: self.get_tech_tagged_data(x["Eng_Label"], self.eng_list),
            axis = 1)
        list_tech_dataset = []

        # fill with 
        for i in range(0, len(df_cm)):
            list_tech_dataset.extend(df_cm["Eng_Label"][i])

        set_tech = set(list_tech_dataset)

        for i in range(0, len(df_cm)):
            l = df_cm[col_name_op][i]
            df_cm[col_name_op][i] = [0 if element == '0' else element for element in l]

        for i in range(0, len(df_cm)):
            words = df_cm[col_name_op][i]
            stopwords = [0]
            for word in list(words):
                if word in stopwords:
                    words.remove(word)

        # list of technologies which are not in extracted list but are in tagged list i.e. FN entries
        df_cm['FP_entries_eng'] = df_cm.apply(lambda x: self.Diff(x["Eng_Label"], x[col_name_op]), axis = 1)

        # list of technologies which are in extracted list and are in tagged list i.e. TP entries
        df_cm['TP_entries_eng'] = df_cm.apply(lambda x: self.Same(x["Eng_Label"], x[col_name_op]), axis = 1)

        # list of technologies which are NOT extracted list and are in tagged list i.e. FN entries
        df_cm['FN_entries_eng'] = df_cm.apply(lambda x: self.Common(x["Eng_Label"], x[col_name_op]), axis = 1)

        fp_entry =[]
        tp = self.get_TP(df_cm, "Eng_Label", col_name_op)
        fp, fp_entry_names = self.get_FP(df_cm, "Eng_Label", col_name_op)
        fn = self.get_FN(df_cm, "Eng_Label", col_name_op)
        # print(len(tp), len(fp), len(fn))
        dict_fp = {}

        for i in range(0, len(fp_entry_names)):
            fp_entry_names[i] = fp_entry_names[i].tolist()
            dict_fp[i] = fp_entry_names[i]

        # creating a dataframe
        df_matrix = pd.DataFrame(list(zip(tp, fp, fn)), 
                       columns =['TP', 'FP', "FN"])
        df_matrix['Precision'] = df_matrix.apply(lambda x: self.get_precision(x["TP"], x["FP"]), axis = 1)

        df_matrix["Recall"] = df_matrix.apply(lambda x: self.get_recall(x["TP"], x["FN"]), axis = 1)

        df_matrix["f_score"] = df_matrix.apply(lambda x: self.get_f_score(x["Precision"], x["Recall"]), axis = 1)

        final_data = df_cm.join(df_matrix)
        return final_data

    def TECH_SHEET(self):
        summary = pd.read_csv(self.data_path)
        summary[self.manually_tagged]=summary[self.manually_tagged].apply(self.clean_text)
        summary = summary.sort_values(by=[self.ID], ascending=False)
        summary.reset_index(inplace = True, drop = True)
        summary[self.summaries]=summary[self.summaries].astype(str)
        all_strings = list(summary[self.summaries].str.lower())
        col_name_op = "Tech_matched"
        i=1
        tech_keys=[]
        for item in all_strings:
        #     print("***","TEST No,",i,"***")
            i=i+1
            tech_row=[]
            for k in self.tech_list:
                if k in item and len(k)>2:
                    tech_row.append(k)
            tech_keys.append(tech_row)

        summary['keywords']=tech_keys

        summary[col_name_op] = summary.apply(
            lambda x: self.reg_match(x[self.summaries], x["keywords"]),
            axis = 1)
        df_cm = summary
        df_cm = self.convert_data_format(df_cm, manually_tagged)
        df_cm[manually_tagged] = df_cm[manually_tagged].fillna('[]')
        count_context_technologies = []
        for i in range(0, len(df_cm)):
            count_context_technologies.extend(df_cm[manually_tagged][i])
        d = list(set(count_context_technologies))

        tp = []
        tn = []
        fp = []
        fn = []

        TP = TN = FP = FN = 0

        df_cm['Tech_Label'] = df_cm.apply(
            lambda x: self.get_tech_tagged_data(x[manually_tagged],self.tech_list), axis = 1)
        string_match_tech_col = df_cm['Tech_Label']
        list_tech_dataset = []

        for i in range(0, len(df_cm)):
            list_tech_dataset.extend(df_cm["Tech_Label"][i])

        set_tech = set(list_tech_dataset)

        for i in range(0, len(df_cm)):
            l = df_cm[col_name_op][i]
            df_cm[col_name_op][i] = [0 if element == '0' else element for element in l]

        for i in range(0, len(df_cm)):
            words = df_cm[col_name_op][i]
            stopwords = [0]
            for word in list(words):
                if word in stopwords:
                    words.remove(word)

        # list of technologies which are not in extracted list but are in tagged list i.e. FN entries
        df_cm['FP_entries_TECH'] = df_cm.apply(lambda x: self.Diff(x["Tech_Label"], x[col_name_op]), axis = 1)

        # list of technologies which are in extracted list and are in tagged list i.e. TP entries
        df_cm['TP_entries_TECH'] = df_cm.apply(lambda x: self.Same(x["Tech_Label"], x[col_name_op]), axis = 1)

        # list of technologies which are NOT extracted list and are in tagged list i.e. FN entries
        df_cm['FN_entries_TECH'] = df_cm.apply(lambda x: self.Common(x["Tech_Label"], x[col_name_op]), axis = 1)
        # function call
        fp_entry =[]
        tp = self.get_TP(df_cm, "Tech_Label", col_name_op)
        fp, fp_entry_names = self.get_FP(df_cm, "Tech_Label", col_name_op)
        fn = self.get_FN(df_cm, "Tech_Label", col_name_op)

        # creating a dataframe
        df_matrix = pd.DataFrame(list(zip(tp, fp, fn)), 
                       columns =['TP', 'FP', "FN"])

        df_matrix['Precision'] = df_matrix.apply(lambda x: self.get_precision(x["TP"], x["FP"]), axis = 1)

        df_matrix["Recall"] = df_matrix.apply(lambda x: self.get_recall(x["TP"], x["FN"]), axis = 1)

        df_matrix["f_score"] = df_matrix.apply(lambda x: self.get_f_score(x["Precision"], x["Recall"]), axis = 1)

        df_cm=df_cm.drop(columns="keywords")
        final_data = df_cm.join(df_matrix)
        return final_data
    
    def ENG_TECH_SHEET(self):
        data=pd.read_csv("BERT_ENG_ONLY_TEMP_DATA.csv")#####READ ENG SHEET GENERATED from ENG_SHEET########
        bert_tags="BERT_Tags_ENG"
        col_name_op="BERT_ENG+string_matched"

        data[bert_tags] = data[bert_tags].apply(str)
        data[bert_tags] = data[bert_tags].apply(eval)
        data[bert_tags] = data[bert_tags].apply(self.remove_space)
        summary=data
        summary[self.summaries]=summary[self.summaries].astype(str)
        all_strings = list(summary[self.summaries].str.lower())

        tech_keys=[]
        for item in all_strings:
            tech_row=[]
            for k in self.tech_list:
                if k in item and len(k)>2:
                    tech_row.append(k)
            tech_keys.append(tech_row)
        summary['keywords']=tech_keys
        summary['string_matched'] = summary.apply(lambda x: self.reg_match(x[self.summaries], x["keywords"]), axis = 1)
        summary['BERT_ENG+string_matched'] = summary.apply(
            lambda x: self.combine(x[bert_tags], x["string_matched"]), 
            axis = 1)
        data=summary
        ########CHANGE NAME ##################
        data.to_csv("BERT_ENG_ONLY_TEMP_DATA+TECH.csv")
        ###########READ SAME FILE####################
        summary = pd.read_csv("BERT_ENG_ONLY_TEMP_DATA+TECH.csv")
        col_name_op ="BERT_ENG+string_matched"#column name
        summary = summary.sort_values(by=[self.ID], ascending=False)
        summary.reset_index(inplace = True, drop = True)

        df_cm = summary
        df_cm = self.convert_data_format(df_cm, col_name_op)
        df_cm = self.convert_data_format(df_cm,manually_tagged)
        count_context_technologies = []
        for i in range(0, len(df_cm)):
            count_context_technologies.extend(df_cm[manually_tagged][i])
        d = list(set(count_context_technologies))
        tp = []
        tn = []
        fp = []
        fn = []

        TP = TN = FP = FN = 0
        df_cm[manually_tagged] = df_cm[manually_tagged].fillna('[]')
        for i in range(0, len(df_cm)):
            l = df_cm[col_name_op][i]
            df_cm[col_name_op][i] = [0 if element == '0' else element for element in l]

        for i in range(0, len(df_cm)):
            words = df_cm[col_name_op][i]
            stopwords = [0]
            for word in list(words):
                if word in stopwords:
                    words.remove(word)

        # list of technologies which are not in extracted list but are in tagged list i.e. FN entries
        df_cm['FP_entries'] = df_cm.apply(lambda x: self.Diff(x[manually_tagged], x[col_name_op]), axis = 1)

        # list of technologies which are in extracted list and are in tagged list i.e. TP entries
        df_cm['TP_entries'] = df_cm.apply(lambda x: self.Same(x[manually_tagged], x[col_name_op]), axis = 1)

        # list of technologies which are NOT extracted list and are in tagged list i.e. FN entries
        df_cm['FN_entries'] = df_cm.apply(lambda x: self.Common(x[manually_tagged], x[col_name_op]), axis = 1)
        
        fp_entry =[]
        tp = self.get_TP(df_cm, manually_tagged, col_name_op)
        fp, fp_entry_names = self.get_FP(df_cm, manually_tagged, col_name_op)
        fn = self.get_FN(df_cm, manually_tagged, col_name_op)

        dict_fp = {}

        for i in range(0, len(fp_entry_names)):
            fp_entry_names[i] = fp_entry_names[i].tolist()
            dict_fp[i] = fp_entry_names[i]

        # creating a dataframe
        df_matrix = pd.DataFrame(list(zip(tp, fp, fn)), 
                       columns =['TP', 'FP', "FN"])

        df_matrix['Precision'] = df_matrix.apply(lambda x: self.get_precision(x["TP"], x["FP"]), axis = 1)

        df_matrix["Recall"] = df_matrix.apply(lambda x: self.get_recall(x["TP"], x["FN"]), axis = 1)

        df_matrix["f_score"] = df_matrix.apply(lambda x: self.get_f_score(x["Precision"], x["Recall"]), axis = 1)

        final_data = df_cm.join(df_matrix)
        return final_data

## Give Paths or Required Names

In [3]:
print(Acc_Sheet.__doc__)


__init__ : INPUT: Data_Path, Technology Dictionary path, English Dictionary path
                  4 Column Names-- ID, text, Correct Tags(MANUAL), BERT Tags
                  
clean_text: Can modify this according to requirement
            INPUT: Text
            RETURNS: Cleaned Text
            
ENG_SHEET: INPUT: None
           RETURNS:English Accuracy DataFrame (From BERT)
           
TECH_SHEET: INPUT: None
            RETURNS:Technology Accuracy DataFrame (From String Match)
            
ENG_TECH_SHEET: INPUT: None 
                RETURNS: English & Technology Combines Accuracy DataFrame 
                
    


In [4]:
data_path = "BERT_production_tags_JD_mergedkeys.csv" #ENTER DATA PATH
tech_path = "Tech_pydictionary_final_JD.csv"
eng_path = "Eng_pydictionary_final_JD.csv"
ID = 'job_id'                                # ID column name
text_col = 'job_desc'                       # column name on which predictions are made
manually_tagged = 'prod_tag'                 #manual tag column name
bert_tags = 'BERT_Tags'                      #column name

obj_name = Acc_Sheet(data_path,tech_path,eng_path,ID,text_col,manually_tagged,bert_tags)

temp = obj_name.ENG_SHEET()    ##########GIVE ENG ACC NAME############
print("ENG_SHEET_NAME", temp['TP'].sum(), temp['FP'].sum(), temp['FN'].sum())
ENG_SHEET_NAME = "ACC-BERT_TITLE_ENG_ONLY_DATE.csv"
# temp.to_csv(ENG_SHEET_NAME,index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][i] = res
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cm[col_name_op][i] = [0 if element == '0' else element for element in l]


ENG_SHEET_NAME 5493 20 1080


In [5]:
temp = obj_name.TECH_SHEET()
TECH_SHEET_NAME = "ACC-TECH_SM_TITLE_DATE.csv"  ##########GIVE TECH ACC NAME############

print(TECH_SHEET_NAME, temp['TP'].sum(), temp['FP'].sum(), temp['FN'].sum())
# temp.to_csv(TECH_SHEET_NAME,index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][i] = res
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cm[col_name_op][i] = [0 if element == '0' else element for element in l]


ACC-TECH_SM_TITLE_DATE.csv 804 30 1295


In [6]:
temp = obj_name.ENG_TECH_SHEET()
ENG_TECH_SHEET_NAME = "ACC-ENG_TECH_SM_TITLE_DATE.csv"  ##########GIVE ENG+TECH ACC NAME############

print(ENG_TECH_SHEET_NAME, temp['TP'].sum(), temp['FP'].sum(), temp['FN'].sum())
# temp.to_csv(ENG_TECH_SHEET_NAME,index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][i] = res
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cm[col_name_op][i] = [0 if element == '0' else element for element in l]


ACC-ENG_TECH_SM_TITLE_DATE.csv 6297 50 2375
