In [None]:
import pickle
import json
import os
import re
import redshift_connector
import pandas as pd
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
import numpy as np

from collections import Counter
from math import ceil

In [None]:
import tensorflow as tf
import unidecode
from langdetect import detect
from transformers import create_optimizer, TFAutoModelForSequenceClassification, DistilBertTokenizer
from transformers import DataCollatorWithPadding, TFDistilBertForSequenceClassification, PreTrainedTokenizerFast
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Redshift credentials for querying the OpenAlex database.
# The file holds the host on its first line and the password on its second.
# Only line terminators are stripped: slicing off the last character would
# truncate the password when the file has no trailing newline and would leave
# a stray "\r" behind for files saved with Windows line endings.
with open("redshift_creds.txt", "r") as f:
    host = f.readline().rstrip("\r\n")
    password = f.readline().rstrip("\r\n")

In [None]:
# Open the Redshift session with the credentials read above and keep a
# cursor around for the queries issued further down.
connection_settings = {
    "host": host,
    "database": 'dev',
    "user": 'app_user',
    "password": password,
}
conn = redshift_connector.connect(**connection_settings)
cursor = conn.cursor()

### Loading the Test Data

##### Make sure the gold_1000.parquet and gold_500.parquet files are in the current directory.

In [None]:
# Gold 1000 dataset: strings that have an affiliation ID in MAG/OpenAlex
gold_1000 = pd.read_parquet("gold_1000.parquet")

# Each label is stored as one string; turn it into a list of integer IDs.
# NOTE(review): the separator is the literal ",a" -- confirm it matches how
# multiple IDs are encoded in the parquet file.
gold_1000['true_affiliation_id'] = gold_1000['true_affiliation_id'].map(
    lambda id_string: list(map(int, id_string.split(",a")))
)

In [None]:
# Gold 500 dataset: strings with empty affiliation IDs in MAG/OpenAlex
gold_500_empty = pd.read_parquet("gold_500.parquet")

# Each label is stored as one string; turn it into a list of integer IDs.
# NOTE(review): the separator is the literal ",a" -- confirm it matches how
# multiple IDs are encoded in the parquet file.
gold_500_empty['true_affiliation_id'] = gold_500_empty['true_affiliation_id'].map(
    lambda id_string: list(map(int, id_string.split(",a")))
)

In [None]:
def check_for_correct_pred(pred, target_list):
    """
    Returns 1 when the prediction is one of the accepted targets, otherwise 0.
    """
    return int(pred in target_list)

#### Getting affiliation ID mapping

In [None]:
# Pull every institution with its display name and location fields.
query = """select affiliation_id, display_name, city, region, country
           from mid.institution"""

# A ROLLBACK is issued first, then the actual query, on the same cursor.
for statement in ("ROLLBACK;", query):
    cursor.execute(statement)

df = cursor.fetch_dataframe()
df.shape

In [None]:
# Lookup table: affiliation_id -> {column name: value} for the remaining columns
institutions_by_id = df.set_index('affiliation_id')
full_affiliation_dict = institutions_by_id.to_dict(orient='index')

#### Getting Other Needed Data

In [None]:
# File that contains dictionary of countries and some alternate names
with open("countries.json", "r") as countries_file:
    countries_dict = json.load(countries_file)

# One entry per country (its collection of names), then every name in a single flat list
countries_list = list(countries_dict.values())
countries_list_flat = [name for names in countries_list for name in names]

In [None]:
# Persist the flat country-name list; it is used in deployment later on
with open("countries_list_flat.pkl", "wb") as pickle_file:
    pickle.dump(countries_list_flat, pickle_file)

In [None]:
# Strings that are only a department name; the prediction functions compare
# the whole affiliation string against this list.
list_of_departments = [
    'Psychology',
    'Nephrology',
    'Other departments',
    'Other Departments',
    'Nursing & Midwifery',
    'Literature and Creative Writing',
    'Neuroscience',
    'Engineering',
    'Computer Science',
    'Chemistry',
    'Biology',
    'Medicine',
]

In [None]:
# Persist the department list as a pickle file
with open("departments_list.pkl", "wb") as pickle_file:
    pickle.dump(list_of_departments, pickle_file)

In [None]:
# Public file of countries and associated cities
# -------> http://download.geonames.org/export/dump/
#
# pandas' default NA parsing turns the literal string "NA" into NaN, which is
# also the ISO code for Namibia. Both files are therefore read with only the
# empty string treated as missing, so country codes stay real strings and the
# later merge on 'country' matches on actual codes rather than on NaN keys.
all_countries = pd.read_csv("allCountries.txt", delimiter="\t", header=None,
                            keep_default_na=False, na_values=[""])

all_countries.columns = ["geonameid","name", "asciiname", "alternatenames", "latitude" , 
                         "longitude", "feature_class","feature_code","country","cc2","admin1","admin2",
                         "admin3","admin4","population","elevation","dem","timezone","modification"]

# ISO codes for each country
country_codes = pd.read_csv("country_codes.txt", delimiter='\t',
                            keep_default_na=False, na_values=[""]) \
[['ISO','Country']]

# Renamed so the ISO column shares the name 'country' with all_countries
country_codes.columns = ['country','country_name']

In [None]:
# Build different combinations of cities and countries so we can check whether a
# string contains no useful information about the institution other than the
# city and country.
#
# For example, if an affiliation string was "Barcelona, Spain" we would want to
# flag it so that the model does not try to predict an institution.

# Only using populated places (feature class 'P') with a population over 100,000
large_city_mask = (all_countries['feature_class'] == 'P') & (all_countries['population'] > 100000)

country_solo = (
    all_countries.loc[large_city_mask]
    .sort_values("population", ascending=False)
    [["name", "asciiname", "country", "population"]]
    .merge(country_codes, how='left', on='country')
    [['name', 'country_name']]
)

# Creating additional strings to check for: "<city>, <country>" and "<country>, <country>"
country_suffix = ", " + country_solo['country_name']
country_solo['city_country'] = country_solo['name'] + country_suffix
country_solo['country_country'] = country_solo['country_name'] + country_suffix

In [None]:
# Collect the distinct values of all four columns into one de-duplicated list
city_country_strings = set()
for column_name in ['name', 'country_name', 'city_country', 'country_country']:
    city_country_strings.update(country_solo[column_name].drop_duplicates().to_list())

city_country_solo_list = list(city_country_strings)

In [None]:
# Persist the city/country list; it is used in deployment
with open("city_country_list.pkl", "wb") as pickle_file:
    pickle.dump(city_country_solo_list, pickle_file)

### Loading the Models

#### Language Model

In [None]:
# Maximum token length passed to the language model tokenizer
MAX_LEN_lang = 512
language_model_dir = "language_model/"

# Model weights come from the local directory; the tokenizer is the stock
# distilbert-base-uncased one.
language_model = TFAutoModelForSequenceClassification.from_pretrained(language_model_dir)
language_model_tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
    return_tensors='tf',
)
data_collator = DataCollatorWithPadding(
    tokenizer=language_model_tokenizer,
    return_tensors='tf',
)

# Mapping stored next to the model, plus its inverse (value -> key)
# NOTE: pickle.load executes arbitrary code; only load files you trust.
with open(f"{language_model_dir}vocab.pkl", "rb") as vocab_file:
    language_model_tokenizer_affiliation_vocab = pickle.load(vocab_file)

inverse_language_affiliation_vocab = dict(zip(
    language_model_tokenizer_affiliation_vocab.values(),
    language_model_tokenizer_affiliation_vocab.keys(),
))
language_model.compile()

#### Basic Model

In [None]:
# Fixed input length used when padding/truncating for the basic model
MAX_LEN_basic = 128
basic_model_dir = "basic_model/"

# Keras model and its tokenizer both live in the model directory
basic_model = tf.keras.models.load_model(f'./{basic_model_dir}')
basic_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=f"./{basic_model_dir}basic_tokenizer",
)

# Mapping stored next to the model, plus its inverse (value -> key)
# NOTE: pickle.load executes arbitrary code; only load files you trust.
with open(f"./{basic_model_dir}affiliation_vocab.pkl", "rb") as vocab_file:
    basic_affiliation_vocab = pickle.load(vocab_file)

inverse_basic_affiliation_vocab = dict(zip(
    basic_affiliation_vocab.values(),
    basic_affiliation_vocab.keys(),
))
basic_model.compile()

### Ensemble Models

Both models are used together here because neither performed as well as expected on its own. This code was later refined into the final deployment code found in the 004 notebook and in the deployment folder.

In [None]:
def get_country_in_string(text, countries=None):
    """
    Looks for countries in the affiliation string to be used in filtering later on.

    Each name listed for a country is searched as a whole word, both in the
    string as given and in the string with all periods removed.

    Args:
        text: affiliation string to search.
        countries: optional mapping of country key -> iterable of names to
            search for. Defaults to the notebook-level ``countries_dict``.

    Returns:
        List of the country keys that had at least one matching name, without
        duplicates and in no particular order. A country whose list of names
        is empty is never matched.
    """
    if countries is None:
        countries = countries_dict

    text_without_periods = text.replace(".", "")
    countries_in_string = set()
    for country, names in countries.items():
        for name in names:
            # The name is used as a regex fragment, exactly as before.
            pattern = fr"\b{name}\b"
            if re.search(pattern, text) or re.search(pattern, text_without_periods):
                countries_in_string.add(country)
                break
    return list(countries_in_string)

def preprocess_function_pandas_language(examples):
    """
    Tokenizes the input for the language model, truncating to MAX_LEN_lang
    and padding.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": True,
        "max_length": MAX_LEN_lang,
    }
    return language_model_tokenizer(examples, **tokenizer_options)

def country_and_language_aware_prediction_language(orig_aff_string, tok_data, top_k=5):
    """
    Prediction for the language model that looks at country in string as well as
    the string language.

    Args:
        orig_aff_string: raw affiliation string, used for language detection,
            the country search and the rule checks.
        tok_data: tokenizer output for the string; passed to ``data_collator``.
        top_k: number of highest-scoring labels taken from the model output
            (default 5).

    Returns:
        ``[final_pred, final_score, string_lang]``. ``final_pred`` is -1 when
        one of the rules decides the string should not be predicted; in that
        case the score is 0.0 for the language rule and 0.999 for the others.
    """
    # checking for the string language; fall back to English if detection fails.
    # Only ordinary exceptions are caught so a kernel interrupt still stops the run.
    try:
        string_lang = detect(orig_aff_string)
    except Exception:
        string_lang = 'en'

    # simple string search for country
    countries_in_string = get_country_in_string(orig_aff_string)
    comma_split = [x for x in orig_aff_string.split(",") if x]
    starts_like_department = orig_aff_string.startswith(("Dep", "School"))

    # logic for changing prediction to -1 (for strings that can't be predicted)
    if string_lang in ['fa','ko','zh-cn','zh-tw','ja','uk','ru','vi']:
        return [-1, 0.0, string_lang]

    # "fewer than two non-empty comma parts" also covers the no-comma case
    if starts_like_department and len(comma_split) < 2 and not countries_in_string:
        return [-1, 0.999, string_lang]

    if (orig_aff_string in list_of_departments
            or orig_aff_string in city_country_solo_list
            or re.search(r"\b(LIANG|YANG|LIU|et al|XIE|JIA|ZHANG|QU)\b", orig_aff_string)):
        return [-1, 0.999, string_lang]

    # getting the predictions and probabilities
    data = data_collator(tok_data)
    logits = language_model.predict([data['input_ids'],
                                     data['attention_mask']]).logits
    scores, labels = tf.math.top_k(tf.nn.softmax(logits)[0].numpy(), top_k)
    scores = scores.numpy().tolist()
    labels = labels.numpy().tolist()

    # drop the model label that stands for affiliation -1 and map the rest
    # back to affiliation IDs, keeping the score order
    no_affiliation_label = language_model_tokenizer_affiliation_vocab[-1]
    candidates = [(inverse_language_affiliation_vocab[label], score)
                  for label, score in zip(labels, scores)
                  if label != no_affiliation_label]

    if not candidates:
        # only reachable with a small top_k: nothing but the -1 label was returned
        return [-1, scores[0], string_lang]

    final_pred, final_score = candidates[0]

    # Country-aware re-ranking: only when the top candidate is a real
    # institution with a known country and the string names a country.
    top_has_country = final_pred >= 0 and full_affiliation_dict[final_pred]['country']
    if top_has_country and countries_in_string:
        for pred, score in candidates:
            if pred < 0:
                break
            country = full_affiliation_dict[pred]['country']
            # candidates without a country are skipped rather than ending the
            # search, to give later candidates a chance to match the country
            if country and country in countries_in_string:
                final_pred = pred
                final_score = score
                break

    return [final_pred, final_score, string_lang]

def explore_sent_prediction_language(raw_sentence):
    """
    Runs the language model on one raw sentence and returns
    [prediction, score, detected language].
    """
    # The model sees the ASCII-transliterated text; the rule checks inside the
    # prediction function still receive the raw sentence.
    ascii_sentence = unidecode.unidecode(raw_sentence)
    encoded = preprocess_function_pandas_language([ascii_sentence])
    prediction, confidence, language = country_and_language_aware_prediction_language(
        raw_sentence, encoded, 5)

    return [prediction, confidence, language]

def max_len_and_pad(tok_sent):
    """
    Cuts the token list to MAX_LEN_basic entries and right-pads it with zeros
    up to that same length.
    """
    clipped = tok_sent[:MAX_LEN_basic]
    zero_padding = [0] * (MAX_LEN_basic - len(clipped))
    return clipped + zero_padding

def preprocess_function_pandas_basic(examples):
    """
    Transliterates the string to ASCII, encodes it with the basic tokenizer
    and returns the token IDs fitted to the basic model input length.
    """
    token_ids = basic_tokenizer.encode(unidecode.unidecode(examples))
    return max_len_and_pad(token_ids)

def country_and_language_aware_prediction_basic(orig_aff_string, tok_data, top_k=5):
    """
    Prediction for the basic model that looks at country in string as well as
    the string language.

    Args:
        orig_aff_string: raw affiliation string, used for language detection,
            the country search and the rule checks.
        tok_data: padded token IDs for the string; sent to the model as a
            batch of one.
        top_k: accepted for signature parity with the language-model variant;
            this function always inspects the two highest-scoring labels.

    Returns:
        ``[final_pred, final_score, string_lang]``. ``final_pred`` is -1 when
        one of the rules decides the string should not be predicted; in that
        case the score is 0.0 for the language rule and 0.999 for the others.
    """
    # checking for the string language; fall back to English if detection fails.
    # Only ordinary exceptions are caught so a kernel interrupt still stops the run.
    try:
        string_lang = detect(orig_aff_string)
    except Exception:
        string_lang = 'en'

    # simple string search for country
    countries_in_string = get_country_in_string(orig_aff_string)
    comma_split = [x for x in orig_aff_string.split(",") if x]
    starts_like_department = orig_aff_string.startswith(("Dep", "School"))

    # logic for changing prediction to -1 (for strings that can't be predicted)
    if string_lang in ['fa','ko','zh-cn','zh-tw','ja','uk','ru','vi']:
        final_pred = -1
        final_score = 0.0
    elif starts_like_department and len(comma_split) < 2 and not countries_in_string:
        # "fewer than two non-empty comma parts" also covers the no-comma case
        final_pred = -1
        final_score = 0.999
    elif (orig_aff_string in list_of_departments
          or orig_aff_string in city_country_solo_list
          or re.search(r"\b(LIANG|YANG|LIU|et al|XIE|JIA|ZHANG|QU)\b", orig_aff_string)):
        final_pred = -1
        final_score = 0.999
    else:
        # The model is only run when no rule applies; its output would be
        # overwritten by the rules above otherwise.
        scores, labels = tf.math.top_k(basic_model.predict([tok_data]), 2)
        scores = scores.numpy()[0].tolist()
        labels = labels.numpy()[0].tolist()

        # if the top label is the one for affiliation -1, use the second-highest
        chosen = 1 if labels[0] == basic_affiliation_vocab[-1] else 0
        final_pred = inverse_basic_affiliation_vocab[labels[chosen]]
        final_score = scores[chosen]

    return [final_pred,final_score,string_lang]

def explore_sent_prediction_basic(raw_sentence):
    """
    Runs the basic model on one raw sentence and returns
    [prediction, score, detected language].
    """
    token_ids = preprocess_function_pandas_basic(raw_sentence)
    prediction, confidence, language = country_and_language_aware_prediction_basic(
        raw_sentence, token_ids, 5)

    return [prediction, confidence, language]

### Gold 1000

In [None]:
# Work on a copy so the loaded gold 1000 frame stays untouched
data_1000 = gold_1000.copy()

# Language model: raw [pred, score, lang] output, then prediction and score columns
language_outputs_1000 = data_1000['raw_affiliation'].map(explore_sent_prediction_language)
data_1000['pred_score_language'] = language_outputs_1000
data_1000['pred_language'] = language_outputs_1000.map(lambda output: output[0])
data_1000['score_language'] = language_outputs_1000.map(lambda output: output[1])

# Basic model: same three columns
basic_outputs_1000 = data_1000['raw_affiliation'].map(explore_sent_prediction_basic)
data_1000['pred_score_basic'] = basic_outputs_1000
data_1000['pred_basic'] = basic_outputs_1000.map(lambda output: output[0])
data_1000['score_basic'] = basic_outputs_1000.map(lambda output: output[1])

# 1 when the basic model prediction is among the true affiliation IDs
data_1000['pred_correct_basic'] = [
    check_for_correct_pred(pred, targets)
    for pred, targets in zip(data_1000['pred_basic'], data_1000['true_affiliation_id'])
]

# 1 when the language model prediction is among the true affiliation IDs
data_1000['pred_correct_language'] = [
    check_for_correct_pred(pred, targets)
    for pred, targets in zip(data_1000['pred_language'], data_1000['true_affiliation_id'])
]

# 1 when both models predict the same affiliation
data_1000['pred_same'] = (data_1000['pred_basic'] == data_1000['pred_language']).astype('int')

# 1 when the true affiliation should be empty (first true ID is -1)
data_1000['equals_negative_one'] = data_1000['true_affiliation_id'] \
.map(lambda true_ids: true_ids[0] == -1).astype('int')

# 1 when the true affiliation should not be empty
data_1000['not_equals_negative_one'] = data_1000['true_affiliation_id'] \
.map(lambda true_ids: true_ids[0] != -1).astype('int')

#### Grid search for the optimal combination of thresholds

In [None]:
# Grid search over (basic threshold, language threshold) pairs on Gold 1000.
# For every pair the rows are assigned, in order, to:
#   match           - both models predict the same affiliation
#   basic_thresh    - basic model score above its threshold
#   language_thresh - language model score above its threshold
#   last_preds      - everything left over (treated as "no prediction made")
# Rows are excluded from later groups by paper_id.
id_cols_1000 = ['paper_id','true_affiliation_id','raw_affiliation']
basic_cols_1000 = id_cols_1000 + ['pred_basic','pred_correct_basic']
language_cols_1000 = id_cols_1000 + ['pred_language','pred_correct_language']
# Common column names every group is renamed to before concatenation
final_cols_1000 = id_cols_1000 + ['pred','pred_correct']

for thresh_basic in [0.0, 0.05, 0.10,0.15,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8]:
    for thresh_language in [0.0, 0.05, 0.1, 0.3,0.6,0.7,0.8,0.9,0.95,0.96,0.98,0.99]:
        # Getting all predictions that were matched by both models
        match_data_1000 = data_1000[data_1000['pred_same']==1][basic_cols_1000].copy()
        match_data_1000.columns = final_cols_1000
        match_data_1000['data_type'] = 'match'
        match_data_id_list = match_data_1000['paper_id'].to_list()

        # Getting all predictions that meet the basic model threshold
        basic_thresh_1000 = data_1000[(data_1000['score_basic']>thresh_basic) & 
                                      (~data_1000['paper_id'].isin(match_data_id_list))]\
        [basic_cols_1000].copy()
        basic_thresh_1000.columns = final_cols_1000
        basic_thresh_1000['data_type'] = 'basic_thresh'
        match_data_id_list += basic_thresh_1000['paper_id'].to_list()

        # Getting all predictions that meet the language model threshold
        language_thresh_1000 = data_1000[(data_1000['score_language']>thresh_language) & 
                                         (~data_1000['paper_id'].isin(match_data_id_list))]\
        [language_cols_1000].copy()
        language_thresh_1000.columns = final_cols_1000
        language_thresh_1000['data_type'] = 'language_thresh'
        match_data_id_list += language_thresh_1000['paper_id'].to_list()

        # Setting the prediction for all leftover strings to the basic model prediction
        # These predictions will most likely not be used
        last_preds_1000 = data_1000[(~data_1000['paper_id'].isin(match_data_id_list))]\
        [basic_cols_1000].copy()
        # Five columns are selected, so exactly five names are assigned
        last_preds_1000.columns = final_cols_1000
        last_preds_1000['data_type'] = 'last_preds'
        match_data_id_list += last_preds_1000['paper_id'].to_list()

        # One concat of all four groups, in assignment order
        final_pred_data = pd.concat([match_data_1000, basic_thresh_1000,
                                     language_thresh_1000, last_preds_1000], axis=0)

        # Making sure columns are available for new dataframe
        final_pred_data['equals_negative_one'] = final_pred_data['true_affiliation_id'] \
        .apply(lambda x: x[0]==-1).astype('int')

        final_pred_data['not_equals_negative_one'] = final_pred_data['true_affiliation_id'] \
        .apply(lambda x: x[0]!=-1).astype('int')

        # Splitting into rows that received a prediction and the "last_preds" leftovers
        test_over_1000 = final_pred_data[final_pred_data['data_type']!='last_preds'] \
        [['pred_correct','equals_negative_one','not_equals_negative_one']]

        test_under_1000 = final_pred_data[final_pred_data['data_type']=='last_preds'] \
        [['pred_correct','equals_negative_one','not_equals_negative_one']]

        # Getting true positives, false positives, and false negatives
        TP = test_over_1000[(test_over_1000['pred_correct']==1) & 
                             (test_over_1000['not_equals_negative_one']==1)].shape[0]
        FP =  test_over_1000[(test_over_1000['pred_correct']==0)].shape[0]
        FN = test_under_1000[(test_under_1000['not_equals_negative_one']==1)].shape[0]

        # Calculating precision and recall (NaN when the denominator is zero)
        precision = TP/(TP+FP) if (TP+FP) else float('nan')
        recall = TP/(TP+FN) if (TP+FN) else float('nan')
        print(f"--- {round(thresh_basic, 2)} {round(thresh_language, 2)} --- Precision: {round(precision, 3)}     Recall: {round(recall, 3)}")

### Gold 500

In [None]:
# Loading the gold 500 dataset.
# The frame loaded from gold_500.parquet earlier in the notebook is named
# gold_500_empty; there is no variable called gold_500.
data_500 = gold_500_empty.copy()

# Getting predictions for the language model
data_500['pred_score_language'] = data_500['raw_affiliation'].apply(explore_sent_prediction_language)
data_500['pred_language'] = data_500['pred_score_language'].apply(lambda x: x[0])
data_500['score_language'] = data_500['pred_score_language'].apply(lambda x: x[1])
data_500['lang'] = data_500['pred_score_language'].apply(lambda x: x[2])

# Getting predictions for the basic model
data_500['pred_score_basic'] = data_500['raw_affiliation'].apply(explore_sent_prediction_basic)
data_500['pred_basic'] = data_500['pred_score_basic'].apply(lambda x: x[0])
data_500['score_basic'] = data_500['pred_score_basic'].apply(lambda x: x[1])

# Checking if language model prediction is correct
data_500['pred_correct_language'] = data_500.apply(lambda x: check_for_correct_pred(x.pred_language,
                                                                            x.true_affiliation_id), axis=1)

# Checking if basic model prediction is correct
data_500['pred_correct_basic'] = data_500.apply(lambda x: check_for_correct_pred(x.pred_basic,
                                                                   x.true_affiliation_id), axis=1)

# Checking if the language model and basic model predict the same affiliation
data_500['pred_same'] = data_500.apply(lambda x: x.pred_basic==x.pred_language, axis=1).astype('int')

# Checking if true affiliation should be empty (first true ID is -1)
data_500['equals_negative_one'] = data_500['true_affiliation_id'] \
.apply(lambda x: x[0]==-1).astype('int')

# Checking if true affiliation should not be empty
data_500['not_equals_negative_one'] = data_500['true_affiliation_id'] \
.apply(lambda x: x[0]!=-1).astype('int')

In [None]:
# Grid search over (basic threshold, language threshold) pairs on Gold 500.
# For every pair the rows are assigned, in order, to:
#   match           - both models predict the same affiliation
#   basic_thresh    - basic model score above its threshold
#   language_thresh - language model score above its threshold
#   last_preds      - everything left over (treated as "no prediction made")
# Rows are excluded from later groups by paper_id.
id_cols_500 = ['paper_id','original_affiliation','true_affiliation_id','raw_affiliation']
basic_cols_500 = id_cols_500 + ['pred_basic','pred_correct_basic']
language_cols_500 = id_cols_500 + ['pred_language','pred_correct_language']
# Common column names every group is renamed to before concatenation
final_cols_500 = id_cols_500 + ['pred','pred_correct']

for thresh_basic in [0.0, 0.05, 0.10,0.15,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8]:
    for thresh_language in [0.0, 0.05, 0.1, 0.3,0.6,0.7,0.8,0.9,0.95,0.96,0.98,0.99]:
        # Getting all predictions that were matched by both models
        match_data_500 = data_500[data_500['pred_same']==1][basic_cols_500].copy()
        match_data_500.columns = final_cols_500
        match_data_500['data_type'] = 'match'
        match_data_id_list_500 = match_data_500['paper_id'].to_list()

        # Getting all predictions that meet the basic model threshold
        basic_thresh_500 = data_500[(data_500['score_basic']>thresh_basic) & 
                                    (~data_500['paper_id'].isin(match_data_id_list_500))]\
        [basic_cols_500].copy()
        basic_thresh_500.columns = final_cols_500
        basic_thresh_500['data_type'] = 'basic_thresh'
        match_data_id_list_500 += basic_thresh_500['paper_id'].to_list()

        # Getting all predictions that meet the language model threshold
        language_thresh_500 = data_500[(data_500['score_language']>thresh_language) & 
                                       (~data_500['paper_id'].isin(match_data_id_list_500))]\
        [language_cols_500].copy()
        language_thresh_500.columns = final_cols_500
        language_thresh_500['data_type'] = 'language_thresh'
        match_data_id_list_500 += language_thresh_500['paper_id'].to_list()

        # Setting the prediction for all leftover strings to the basic model prediction
        # These predictions will most likely not be used
        last_preds_500 = data_500[(~data_500['paper_id'].isin(match_data_id_list_500))]\
        [basic_cols_500].copy()
        last_preds_500.columns = final_cols_500
        last_preds_500['data_type'] = 'last_preds'
        match_data_id_list_500 += last_preds_500['paper_id'].to_list()

        # One concat of all four groups, in assignment order
        final_pred_data_500 = pd.concat([match_data_500, basic_thresh_500,
                                         language_thresh_500, last_preds_500], axis=0)

        # Making sure columns are available for new dataframe
        final_pred_data_500['equals_negative_one'] = final_pred_data_500['true_affiliation_id'] \
        .apply(lambda x: x[0]==-1).astype('int')

        final_pred_data_500['not_equals_negative_one'] = final_pred_data_500['true_affiliation_id'] \
        .apply(lambda x: x[0]!=-1).astype('int')

        # Splitting into rows that received a prediction and the "last_preds" leftovers
        test_over = final_pred_data_500[final_pred_data_500['data_type']!='last_preds'] \
        [['pred_correct','equals_negative_one','not_equals_negative_one']]

        test_under = final_pred_data_500[final_pred_data_500['data_type']=='last_preds'] \
        [['pred_correct','equals_negative_one','not_equals_negative_one']]

        # Getting true positives, false positives, and false negatives
        TP = test_over[(test_over['pred_correct']==1) & 
                             (test_over['not_equals_negative_one']==1)].shape[0]
        FP =  test_over[(test_over['pred_correct']==0)].shape[0]
        FN = test_under[(test_under['not_equals_negative_one']==1)].shape[0]

        # Calculating precision and recall (NaN when the denominator is zero;
        # a NaN precision fails the comparison below and is not printed)
        precision = TP/(TP+FP) if (TP+FP) else float('nan')
        recall = TP/(TP+FN) if (TP+FN) else float('nan')
        
        if precision > 0.7:
            print(f"--- {round(thresh_basic, 2)} {round(thresh_language, 2)} --- Precision: {round(precision, 3)}     Recall: {round(recall, 3)}")