In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import swifter
from rapidfuzz import process, fuzz
from datetime import datetime, timedelta
from dateutil import relativedelta as rd
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def replace_short(item):
    """Return ``item`` unchanged when it has more than two characters.

    Anything of length two or less is replaced by the empty string.
    """
    return item if len(item) > 2 else ''
        
def extract_orgs(row):
    """Split a ``;``-separated string and keep the text before the first comma of each entry.

    Entries without a comma are kept whole; an empty entry (for example after a
    trailing ``;``) yields an empty string in the result.
    """
    org_names = []
    for entry in row.split(';'):
        leading_field, _, _ = entry.partition(',')
        org_names.append(leading_field)
    return org_names

In [3]:
# Load the ticker / company-name reference table and derive a matching key
# ('combined') from the lower-cased company name with spaces removed.
master_ticker_pairs = pd.read_csv('master_ticker_name.csv')
lowered_names = master_ticker_pairs['COMNAM'].str.lower()
master_ticker_pairs = master_ticker_pairs.assign(
    COMNAM=lowered_names,
    combined=lowered_names.str.replace(' ', ''),
).fillna('')

# Keys of two characters or fewer are blanked out by replace_short.
master_ticker_pairs['combined'] = master_ticker_pairs['combined'].map(replace_short)

# Position-aligned lookup lists: both come from the same frame, so
# namelist[i] and tickerlist[i] describe the same row.
namelist = list(master_ticker_pairs['combined'])
tickerlist = list(master_ticker_pairs['TICKER'])

In [4]:
# A fuzzy match only counts when its QRatio score is strictly above this value.
threshold = 75


def _match_to_ticker(query, choices):
    """Fuzzy-match ``query`` against ``choices`` and return the corresponding ticker.

    ``choices`` is expected to be ``tickerlist`` or ``namelist``. Both lists are
    built from the same rows of ``master_ticker_pairs``, so a position in
    ``choices`` identifies the ticker at the same position in ``tickerlist``.

    Returns:
        The ticker for the best match, or ``None`` when ``choices`` is empty or
        the best score is not strictly greater than ``threshold``.
    """
    # score_cutoff lets rapidfuzz drop hopeless candidates early. extractOne
    # returns None when nothing reaches the cutoff or when choices is empty,
    # so the result must not be unpacked blindly.
    best = process.extractOne(query, choices, scorer=fuzz.QRatio, score_cutoff=threshold)
    if best is None:
        return None
    matched_value, score, _ = best
    if score <= threshold:
        return None
    # Use the first occurrence of the matched value, which is the same row the
    # previous DataFrame lookup (`...values[0]`) selected, without scanning the
    # whole frame for every name.
    return tickerlist[choices.index(matched_value)]


def map_to_comnam(row):
    """Map a list of organisation names to the tickers they fuzzily match.

    Each name is compared first against ``tickerlist`` and then against
    ``namelist`` using ``fuzz.QRatio``; a match with a score above
    ``threshold`` contributes its ticker once, in order of first discovery.

    Args:
        row: iterable of organisation-name strings (one article's ``orgs``).

    Returns:
        A list of unique tickers, or ``None`` when no name matched.
    """
    mapped = []
    # dict.fromkeys drops repeated names while keeping their order; a repeated
    # name can only reproduce tickers that are already in ``mapped``.
    for name in dict.fromkeys(row):
        for choices in (tickerlist, namelist):
            found_ticker = _match_to_ticker(name, choices)
            if found_ticker is not None and found_ticker not in mapped:
                mapped.append(found_ticker)
    return mapped or None

In [5]:
# Load the GKG sample and keep only the rows that list at least one organisation.
gkg_df = pd.read_csv('sample.csv')
gkg_df_ = gkg_df.dropna(subset=['V2Organizations']).reset_index(drop=True)

print(f'Length before removing nulls: {len(gkg_df)}, after removing nulls: {len(gkg_df_)}')

# 'orgs' holds the organisation names parsed from V2Organizations;
# 'tags' holds the tickers those names map to (None when nothing matched).
gkg_df_['orgs'] = gkg_df_['V2Organizations'].apply(extract_orgs)
gkg_df_['tags'] = gkg_df_.loc[:, 'orgs'].swifter.progress_bar(False).apply(map_to_comnam)

# Keep only the articles for which at least one ticker was found.
filtered_gkg = gkg_df_[gkg_df_['tags'].notna()]
print(f'Number of articles with tags: {len(filtered_gkg)}')

Length before removing nulls: 10, after removing nulls: 5
Number of articles with tags: 1


In [6]:
# Display the rows of gkg_df_ whose 'tags' value is not null.
filtered_gkg

Unnamed: 0,DATE,SourceCollectionIdentifier,SourceCommonName,DocumentIdentifier,V2Themes,V2Locations,V2Persons,V2Organizations,V2Tone,SharingImage,RelatedImages,Quotations,AllNames,Amounts,date,orgs,tags
0,20201100000000.0,1,iheart.com,https://y969.iheart.com/content/2020-11-05-ihe...,"MEDIA_MSM,143;MEDIA_SOCIAL,787;GEN_HOLIDAY,72;...",,"Carrie Underwood,404;Josh Groban,418;Josh Grob...","Facebook,798","11.1764705882353,11.1764705882353,0,11.1764705...",https://i.iheart.com/v3/re/new_assets/5fa47d29...,,,"HeartRadio Holiday Special,373;Carrie Underwoo...","7000000,local time via iHeartRadio,642;",7/11/2020 7:30,[Facebook],[FB]
