### LinkedIn Text Processing

In [3]:
import pandas as pd
import warnings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [4]:
df = pd.read_csv('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/Webscrape from Linkedin.csv')
ref = pd.read_csv('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/List of 90 Coy and SSIC.csv', dtype={'ssic_code': str})
ssic = pd.read_excel('/Users/Michael/Documents/GitHub/ssicsync/ssic2020-detailed-definitions.xlsx', skiprows=[0,1,2,3], usecols=[0,1,3,5], dtype={'SSIC 2020': str})

In [5]:
df.head()

Unnamed: 0,Company,Linkedin Page,Org Category,Org Website,Text Content
0,ABR HOLDINGS LIMITED,https://www.linkedin.com/company/abr-holdings-limited/about,Food & Beverages,http://www.abr.com.sg/,"ABR Holdings Limited (“ABR”) began as the owner and operator of the first full-service Swensen’s ice cream restaurant in Singapore back in 1979. The Swensen’s brand, with over 25 restaurants in Singapore, remains one of the market leaders in the western casual dining category and one of the preferred choices in good value family dining.\r\n\r\nBeyond Swensen’s and Earle Swensen's, we also manage and develop a portfolio of well-known food and beverage companies and brands. These include Season Confectionary & Bakery, Season’s Café, Tip Top Curry Puffs, Yogen Früz, and Chilli Padi Nonya Catering."
1,ABUNDANCE INTERNATIONAL LIMITED,https://www.linkedin.com/company/life-in-abundance-international/about,Non-profit Organizations,https://lifeinabundance.org,"Life In Abundance International (LIA) is an African-founded, African-led, and faith-based organization with over 25 years of experience bringing hope to more than a million people in need.\r\n\r\nLIA exists to empower local churches in Africa and the Caribbean to serve the poor and vulnerable, creating sustainable transformation in their communities."
2,ABUNDANTE LIMITED,https://www.linkedin.com/company/abundanteseguros/about,Insurance,http://www.abundanteseguros.com.br,Abundante Corretora de Seguros - Todos os Ramos
3,ACCRELIST LTD,https://www.linkedin.com/company/accrelist-ai-tech/about,IT Services and IT Consulting,http://www.weh.sg/,"Our Business scope:\r\n1. -Aesthestics Platform \r\n2. -Digital Marketing (FB, IG,Wechat)\r\n3. -Door Access System\r\n4. -Unmanned Store"
4,ACESIAN PARTNERS LIMITED,https://www.linkedin.com/company/acesiansystems/about,Information Technology & Services,http://www.acesiansystems.com,"Created in 2016 as part of the longer-established Acesian Partners group, Acesian Systems is a collective of personnel dedicated to serving the technical needs of the AV and IT Industries."


In [6]:
# Step 1: Remove leading and trailing whitespace
df['Text Content'] = df['Text Content'].str.strip()

# Step 2: Replace newline characters with spaces
df['Text Content'] = df['Text Content'].str.replace('\r\n', ' ', regex=False)
df['Text Content'] = df['Text Content'].str.replace('\n', ' ', regex=False)

# Step 3: Fill missing values with a placeholder
# df['Text Content'] = df['Text Content'].fillna('')

# Optional Step 4: Normalize text by converting to lowercase (if needed)
df['Text Content'] = df['Text Content'].str.lower()

df.head()

Unnamed: 0,Company,Linkedin Page,Org Category,Org Website,Text Content
0,ABR HOLDINGS LIMITED,https://www.linkedin.com/company/abr-holdings-limited/about,Food & Beverages,http://www.abr.com.sg/,"abr holdings limited (“abr”) began as the owner and operator of the first full-service swensen’s ice cream restaurant in singapore back in 1979. the swensen’s brand, with over 25 restaurants in singapore, remains one of the market leaders in the western casual dining category and one of the preferred choices in good value family dining. beyond swensen’s and earle swensen's, we also manage and develop a portfolio of well-known food and beverage companies and brands. these include season confectionary & bakery, season’s café, tip top curry puffs, yogen früz, and chilli padi nonya catering."
1,ABUNDANCE INTERNATIONAL LIMITED,https://www.linkedin.com/company/life-in-abundance-international/about,Non-profit Organizations,https://lifeinabundance.org,"life in abundance international (lia) is an african-founded, african-led, and faith-based organization with over 25 years of experience bringing hope to more than a million people in need. lia exists to empower local churches in africa and the caribbean to serve the poor and vulnerable, creating sustainable transformation in their communities."
2,ABUNDANTE LIMITED,https://www.linkedin.com/company/abundanteseguros/about,Insurance,http://www.abundanteseguros.com.br,abundante corretora de seguros - todos os ramos
3,ACCRELIST LTD,https://www.linkedin.com/company/accrelist-ai-tech/about,IT Services and IT Consulting,http://www.weh.sg/,"our business scope: 1. -aesthestics platform 2. -digital marketing (fb, ig,wechat) 3. -door access system 4. -unmanned store"
4,ACESIAN PARTNERS LIMITED,https://www.linkedin.com/company/acesiansystems/about,Information Technology & Services,http://www.acesiansystems.com,"created in 2016 as part of the longer-established acesian partners group, acesian systems is a collective of personnel dedicated to serving the technical needs of the av and it industries."


In [7]:
ref.rename(columns = {'entity_name': 'Company'}, inplace = True)
ref = ref.sort_values(by = ['Company', 'ssic_code']).drop_duplicates(subset = 'Company', keep = 'first')
ref = ref[['Company', 'ssic_code']]

test = pd.merge(df, ref, how = 'left', on = 'Company')

In [8]:
# NOTE There's discrepency in PDFs vs 'List of 90 Coy and SSIC' company names (12 companies missing) ..
# .. [Most were resolved by putting a '.' to List of 90 Coy and SSIC's df]
ref['Company'] = ref['Company'].apply(lambda x: x[:-1] if x[-1] == '.' else x)

test2 = pd.merge(test[test.ssic_code.isnull()], ref, how = 'left', on = 'Company')
test2.drop(columns='ssic_code_x', inplace = True)
test2.rename(columns={'ssic_code_y':'ssic_code'}, inplace = True)

In [9]:
# NOTE Web scrapping codes is taking the first profile in the search page, which may be the wrong company. Hence, scrapped info may be wrong.
## ACCRELIST LTD linkedin info seems wrong, but pdf and List of 90 Coy and SSIC's df seems correct (misaligned names!) ..
## TODO Probably should webscrape using the names from List of 90 Coy and SSIC's df instead of pdf names (since it's the source of truth for SSIC codes)?
df = pd.concat([test[~ test.ssic_code.isnull()], test2], axis = 0)

In [None]:
# NOTE Out of 90 coys, ..
## .. 18 has no LinkedIn content, ..
## .. 1 has PDF but not in List of 90 Coy and SSIC's df (ACMA LTD), ..
## .. 1 has PDF and in List of 90 Coy and SSIC's df (ACMA LTD) but partially different name (ACCRELIST LTD) [hence can't match to get SSIC codes!]
df = df[['Company', 'ssic_code', 'Text Content']][(df.ssic_code.notnull()) & (df['Text Content'].notnull())].reset_index(drop = True)

# just predicting up to Division level in the SSIC code
# df['ssic_code'] = df['ssic_code'].apply(lambda x: str(x)[0:2])

# 70-30 Train-Test split (randomised selection):
train = df.sample(round(df.shape[0]*0.7)).reset_index(drop = True)
test = df[~df.Company.isin(train.Company)].reset_index(drop = True)

In [None]:
df.shape

(70, 3)

In [None]:
train.shape

(49, 3)

In [None]:
test.shape

(21, 3)

In [None]:
ssic.sample(5)

Unnamed: 0,SSIC 2020,SSIC 2020 Title,Detailed Definitions,Examples of Activities Classified Under this Code
2414,20233,Manufacture of cosmetics and toiletries,Hair dressing preparation manufacturing,
6073,68101,Real estate developers,Real estate development service,
4927,47741,Retail sale of antiques and works of art,Art trade,
2721,24209,Manufacture of basic precious and non-ferrous metals n.e.c.,"Sheet, zinc, manufacturing",
285,20294,"Manufacture of inks, dyestuffs, pigments and carbon black","This Sub-class includes the manufacture of inks (e.g. writing and drawing ink), dyestuffs, pigments and/or carbon black.",<Blank>


In [None]:
train.head(1)

Unnamed: 0,Company,ssic_code,Text Content
0,PENGUIN INTERNATIONAL LIMITED,30110,"thursday, 5 january 2023 9:59 am penguin international limited is a publicly listed singaporean homegrown company. we specialise in the design, construction, ownership and operation of aluminium high-speed craft. since 1995, we have delivered more than 200 aluminium workboats, patrol craft and passenger ferries to ship owners around the world. these include our proprietary flex multi-role crewboats, armoured security boats and windfarm vessels, built under our self-funded stock vessel construction programme. our designs are jointly developed by our integrated shipbuilding and ship management teams in singapore, backed by more than three decades of operational experience. we typically own and operate what we design and build. we also undertake owner-specific build-to-order, repair and conversion projects for a variety of high-speed craft. whether you are a ship owner or a charterer, you will enjoy peace of mind with the penguin brand, which stands for integrity, professionalism and mutual respect. go ahead. flex your fleet today!"


In [None]:
test.head(1)

Unnamed: 0,Company,ssic_code,Text Content
0,ACESIAN PARTNERS LIMITED,41009,"created in 2016 as part of the longer-established acesian partners group, acesian systems is a collective of personnel dedicated to serving the technical needs of the av and it industries."


In [20]:
ssic = pd.read_excel('/Users/Michael/Documents/GitHub/ssicsync/ssic2020-detailed-definitions.xlsx', skiprows=[0,1,2,3], usecols=[0,1,3,5], dtype={'SSIC 2020': str})

In [21]:
ssic.shape

(7123, 4)

In [22]:
for column in ssic.columns:
    ssic.loc[((ssic[column] == '<Blank>') | (ssic[column].isnull())), column] = ''
ssic = ssic[~ssic['SSIC 2020'].str.match(r'^[A-Z]$')]

# just predicting up to Division level in the SSIC code
# ssic['SSIC 2020'] = ssic['SSIC 2020'].apply(lambda x: str(x)[0:2])

# Define a cleaning function
def clean_text(text):
    if isinstance(text, str):
        # Replace newline characters with space
        text = text.replace('\n', ' ')
        # Replace special characters (bullet points) with a consistent delimiter
        text = text.replace('•', '- ')
        # Strip leading and trailing whitespace
        text = text.strip()
        # Replace multiple spaces with a single space
        text = ' '.join(text.split())
    return text

# Apply the cleaning function to the 'Info' column
ssic['Detailed Definitions'] = ssic['Detailed Definitions'].apply(clean_text)
ssic['Examples of Activities Classified Under this Code'] = ssic['Examples of Activities Classified Under this Code'].apply(clean_text)

ssic['textonly'] = ssic['SSIC 2020 Title'] + '.' + ssic['Detailed Definitions'] + '.' + ssic['Examples of Activities Classified Under this Code']
ssic = ssic[['SSIC 2020', 'textonly']]
ssic.columns = ['ssic_code', 'Text Content']
ssic = ssic[ssic['Text Content'].notnull()]

In [24]:
df.shape

(70, 3)

In [None]:
# NOTE include line below if want to add original SSIC text reference into training data:
train = pd.concat([train, ssic], axis = 0).reset_index(drop = True)

In [None]:
# Model training

textonly = train['Text Content']
text_tokens = [word_tokenize(t.lower()) for t in textonly]
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(text_tokens)]

model = Doc2Vec(vector_size=64, min_count=2, epochs=40)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=40)

In [None]:
# Parsing model outputs into a df (with dups)

recsys_df = pd.DataFrame(columns = ["Company", "actualSSIC", "recSSIC", "score"])

for index, series in test.iterrows():

    test_doc = word_tokenize(series['Text Content'].lower())
    test_vec = model.infer_vector(test_doc)
    # results = model.docvecs.most_similar(positive=[test_vec], topn=50)
    results = model.docvecs.most_similar(positive=[test_vec], topn=5)

    for i, (index, similarity_score) in enumerate(results):
        coy = series["Company"]
        actualssic = series['ssic_code']
        recssic = train["ssic_code"][index]
        score = similarity_score
        row = [coy, actualssic, recssic, score]
        recsys_df.loc[len(recsys_df)] = row

# recsys_df.drop_duplicates(subset = ['Company', 'recSSIC'], keep = 'first', inplace = True)
# recsys_df['rank'] = recsys_df.groupby('Company').cumcount()+1
recsys_df['rank'] = recsys_df.sort_values(by = ['Company', 'score'], ascending=[True, False]).groupby('Company').cumcount()+1
# recsys_df = recsys_df[recsys_df['rank'] <= 3]

In [None]:
# # Parsing model outputs into a df (without dups)

# recsys_df = pd.DataFrame(columns = ["Company", "actualSSIC", "recSSIC", "score"])

# for index, series in test.iterrows():

#     test_doc = word_tokenize(series['Text Content'].lower())
#     test_vec = model.infer_vector(test_doc)
#     results = model.docvecs.most_similar(positive=[test_vec], topn=50)
#     # results = model.docvecs.most_similar(positive=[test_vec], topn=5)

#     for i, (index, similarity_score) in enumerate(results):
#         coy = series["Company"]
#         actualssic = series['ssic_code']
#         recssic = train["ssic_code"][index]
#         score = similarity_score
#         row = [coy, actualssic, recssic, score]
#         recsys_df.loc[len(recsys_df)] = row

# recsys_df.drop_duplicates(subset = ['Company', 'recSSIC'], keep = 'first', inplace = True)
# recsys_df['rank'] = recsys_df.groupby('Company').cumcount()+1
# # recsys_df['rank'] = recsys_df.sort_values(by = ['Company', 'score'], ascending=[True, False]).groupby('Company').cumcount()+1
# recsys_df = recsys_df[recsys_df['rank'] <= 3]

In [None]:
recsys_df

Unnamed: 0,Company,actualSSIC,recSSIC,score,rank
0,ABR HOLDINGS LIMITED,47219,56122,0.582097,1
1,ABR HOLDINGS LIMITED,47219,56122,0.552064,2
2,ABR HOLDINGS LIMITED,47219,56122,0.546321,3
3,ABR HOLDINGS LIMITED,47219,47103,0.506491,4
4,ABR HOLDINGS LIMITED,47219,56122,0.494198,5
5,ACESIAN PARTNERS LIMITED,41009,27322,0.626589,1
6,ACESIAN PARTNERS LIMITED,41009,47533,0.576911,2
7,ACESIAN PARTNERS LIMITED,41009,46424,0.562774,3
8,ACESIAN PARTNERS LIMITED,41009,46900,0.551953,4
9,ACESIAN PARTNERS LIMITED,41009,28191,0.549688,5


In [None]:
# For accuracy calculation (up to Division level)

recsysAccuracy_df = recsys_df.copy()

recsysAccuracy_df['actualSSIC'] = recsysAccuracy_df['actualSSIC'].apply(lambda x: str(x)[0:2])
recsysAccuracy_df['recSSIC'] = recsysAccuracy_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysAccuracy_df.loc[recsysAccuracy_df.actualSSIC == recsysAccuracy_df.recSSIC, 'match'] = 1
recsysAccuracy_df.loc[recsysAccuracy_df.match != 1, 'match'] = 0

recsysAccuracy_df = recsysAccuracy_df.sort_values(by = ['Company', 'match'], ascending=False).groupby('Company').head(1).reset_index(drop = True)
matchAccuracy = len(recsysAccuracy_df[recsysAccuracy_df.match == 1].match)/recsysAccuracy_df.shape[0]

print(f'Accuracy of Recommendation Model: {round(matchAccuracy*100,1)}%')

Accuracy of Recommendation Model: 38.1%


In [None]:
"""
<Company's correct SSIC code (up to Division level) found in top 3 recommendations>

Prediction Accuracy (with original SSIC text reference): 76.2% [better choice, proceed to analyse similarity score and overall stats below]
Prediction Accuracy (without original SSIC text reference): 61.9%
"""

"\n<Company's correct SSIC code (up to Division level) found in top 3 recommendations>\n\nPrediction Accuracy (with original SSIC text reference): 76.2% [better choice, proceed to analyse similarity score and overall stats below]\nPrediction Accuracy (without original SSIC text reference): 61.9%\n"

In [None]:
print(f"Overall Similarity Score: {round(recsys_df.score.mean(),2)} out of 1.00")

Overall Similarity Score: 0.55 out of 1.00


In [None]:
recsysStats_df = recsys_df.copy()

recsysStats_df['actualSSIC'] = recsysStats_df['actualSSIC'].apply(lambda x: str(x)[0:2])
recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].astype('int64')
recsysStats_df = recsysStats_df.groupby('Company')['recSSIC'].agg(['mean', 'std']).reset_index()
recsysStats_df['mean'] = round(recsysStats_df['mean'],1)
recsysStats_df['std'] = round(recsysStats_df['std'],1)
print(f"Overall Stats (99 Divisions in Total):\n{round(recsysStats_df.mean(),1)}")
recsysStats_df

Overall Stats (99 Divisions in Total):
mean    49.0
std     16.3
dtype: float64


Unnamed: 0,Company,mean,std
0,ABR HOLDINGS LIMITED,54.2,4.0
1,ACESIAN PARTNERS LIMITED,38.8,10.3
2,ADVANCED SYSTEMS AUTOMATION LIMITED,42.4,15.4
3,BRC ASIA LIMITED,40.8,27.0
4,CASA HOLDINGS LIMITED,66.2,27.4
5,CHEMICAL INDUSTRIES (FAR EAST) LIMITED,39.6,34.7
6,CREATIVE TECHNOLOGY LTD,39.4,17.9
7,CSE GLOBAL LIMITED,62.8,25.6
8,ENGRO CORPORATION LIMITED,34.2,15.9
9,HONG LEONG FINANCE LIMITED,57.0,10.0


In [None]:
# For accuracy calculation (all levels)

recsysAccuracy_df = recsys_df.copy()

# recsysAccuracy_df['actualSSIC'] = recsysAccuracy_df['actualSSIC'].apply(lambda x: str(x)[0:2])
# recsysAccuracy_df['recSSIC'] = recsysAccuracy_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysAccuracy_df.loc[recsysAccuracy_df.actualSSIC == recsysAccuracy_df.recSSIC, 'match'] = 1
recsysAccuracy_df.loc[recsysAccuracy_df.match != 1, 'match'] = 0

recsysAccuracy_df = recsysAccuracy_df.sort_values(by = ['Company', 'match'], ascending=False).groupby('Company').head(1).reset_index(drop = True)
matchAccuracy = len(recsysAccuracy_df[recsysAccuracy_df.match == 1].match)/recsysAccuracy_df.shape[0]

print(f'Accuracy of Recommendation Model: {round(matchAccuracy*100,1)}%')

print(f"Overall Similarity Score: {round(recsys_df.score.mean(),2)} out of 1.00")
recsysStats_df = recsys_df.copy()

# recsysStats_df['actualSSIC'] = recsysStats_df['actualSSIC'].apply(lambda x: str(x)[0:2])
# recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].astype('int64')
recsysStats_df = recsysStats_df.groupby('Company')['recSSIC'].agg(['mean', 'std']).reset_index()
recsysStats_df['mean'] = round(recsysStats_df['mean'],1)
recsysStats_df['std'] = round(recsysStats_df['std'],1)
print(f"Overall Stats (99,090 Sub-classes in total):\n{round(recsysStats_df.mean(),1)}")
recsysStats_df

Accuracy of Recommendation Model: 14.3%
Overall Similarity Score: 0.55 out of 1.00
Overall Stats (99,090 Sub-classes in total):
mean    48489.3
std     16994.8
dtype: float64


Unnamed: 0,Company,mean,std
0,ABR HOLDINGS LIMITED,54318.2,4033.4
1,ACESIAN PARTNERS LIMITED,39274.0,10525.8
2,ADVANCED SYSTEMS AUTOMATION LIMITED,42851.2,15386.8
3,BRC ASIA LIMITED,41030.8,27072.7
4,CASA HOLDINGS LIMITED,49961.2,37637.5
5,CHEMICAL INDUSTRIES (FAR EAST) LIMITED,39911.2,34594.9
6,CREATIVE TECHNOLOGY LTD,39611.0,17889.9
7,CSE GLOBAL LIMITED,63185.4,25476.9
8,ENGRO CORPORATION LIMITED,34673.8,15853.8
9,HONG LEONG FINANCE LIMITED,57379.6,9747.2


### Recommender Model Function

In [1030]:
ref.head(1)

Unnamed: 0,UEN,entity_name,ssic_code,ssic_code2,oth_desc,oth_desc2,gsearch
0,197803023H,ABR HOLDINGS LIMITED,47219,64202.0,RESTAURANTS AND MANUFACTURING OF ICE CREAM,,https://www.google.com/search?q=ABR HOLDINGS LIMITED+uen


In [1044]:
df.sample(1)

Unnamed: 0,PDF Name,Page Number,Notes Page Content
16,CITY DEVELOPMENTS LIMITED 2022.pdf,64,"The 1 DOMICILE AND ACTIVITIES City Developments Limited (the Company) is incorporated in the Republic of Singapore and has its registered office at 9 Raffles Place, #12-01 Republic Plaza, Singapore 048619. principal activities of the Company are those of a property developer and owner, and investment holding.The principal activities of the subsidiaries are those of property developers and owners, hotel owners and operators, a club operator and owner, investment in properties and in shares, property management, project management and provision of consultancy, procurement and laundry services. The consolidated financial statements for the year ended 31 December 2022 relate to the Company and its subsidiaries (together referred to as the Group and individually as Group entities ) and the Group s interests in associates and joint ventures. The directors consider the immediate and ultimate holding company to be Hong Leong Investment Holdings Pte. Ltd., a company incorporated in the Republic of Singapore. 2 B ASIS OF PREPARATION 2.1 S tatement of compliance The financial statements have been prepared in accordance with Singapore Financial Reporting Standards (International) (SFRS(I)s) and International Financial Reporting Standards (IFRSs). SFRS(I)s, issued by the Accounting Standards Council (ASC), comprises standards and interpretations that are equivalents to IFRSs as issued by the International Accounting Standards Board (IASB). All references to SFRS(I)s and IFRSs are subsequently referred to as SFRS(I) in the financial statements. 2.2 Basis of measurement The financial statements have been prepared on the historical cost basis except as otherwise described in the notes below. 2.3 F unctional and presentation currency The financial statements are presented in Singapore dollars, which is the Company s functional currency. All financial information has been rounded to the nearest thousand, unless otherwise stated. 2.4 U se of estimates and judgements The preparation of the financial statements in conformity with SFRS(I) requires management to make judgements, estimates and assumptions that affect the application of accounting policies and the reported amounts of assets, liabilities, income and expenses. Actual results may differ from these estimates. Estimates and underlying assumptions are reviewed on an ongoing basis. Revisions to accounting estimates are recognised prospectively.NOTES TO THE FINANCIAL STATEMENTS YEAR ENDED 31 DECEMBER 2022 Significant non-cash transactions During the year ended 31 December 2022, there were the following significant non-cash transactions: Dividends amoun ting to $1,200,000 were paid by a subsidiary to its non-controlling interests in the form of additional shares in that subsidiary. In May 2022, the Company distributed 144,191,823 stapled securities in CDL Hospitality Trusts ( CDLHT and such stapled securities, the CDLHT units ) that it held to its ordinary shareholders at 0.159 CDLHT Unit per ordinary share based on $1.27 per CDLHT Unit, amounting to $183,124,000 (Note 36). During the year ended 31 December 2021, there were the following significant non-cash transactions: Dividends amoun ting to $1,287,000 were paid by a subsidiary to its non-controlling interests in the form of additional shares in that subsidiary. In c onnection with the Group s disposal of its interest in HCP Chongqing Property Development Co., Ltd (HCP), an offshore investment vehicle that held an indirect 80.01% equity interest in Chongqing Sincere Yuanchuang Industrial Co., Ltd and its subsidiaries (Sincere Property Group), the Group entered into various agreements with HCP Group whereby it was agreed that (i) the amount owing to HCP Group of $263.7 million would be set off against the amounts owing by HCP Group; and (ii) the collateral held by the Group in respect of the amounts owing by HCP Group, which relates to shares in a property-owning entity which had been pledged by HCP Group to the Group, would be transferred to the Group, as settlement of $54.1 million (RMB260.0 million) of the amounts owing by HCP Group.CONSOLIDATED STATEMENT OF CASH FLOWS YEAR ENDED 31 DECEMBER 2022 CITY DEVELOPMENTS LIMITED ANNUAL REPORT 2022 FINANCIALS 124 125"


In [1049]:
ref.rename(columns = {'entity_name': 'Company'}, inplace = True)
ref = ref.sort_values(by = ['Company', 'ssic_code']).drop_duplicates(subset = 'Company', keep = 'first')
ref = ref[['Company', 'ssic_code']]

df.rename(columns = {'PDF Name': 'Company'}, inplace = True)
df.rename(columns = {'Notes Page Content': 'Text Content'}, inplace = True)
test = pd.merge(df, ref, how = 'left', on = 'Company')
# NOTE There's discrepency in PDFs vs 'List of 90 Coy and SSIC' company names (12 companies missing) ..
# .. [Most were resolved by putting a '.' to List of 90 Coy and SSIC's df]
ref['Company'] = ref['Company'].apply(lambda x: x[:-1] if x[-1] == '.' else x)

test2 = pd.merge(test[test.ssic_code.isnull()], ref, how = 'left', on = 'Company')
test2.drop(columns='ssic_code_x', inplace = True)
test2.rename(columns={'ssic_code_y':'ssic_code'}, inplace = True)
# NOTE Web scrapping codes is taking the first profile in the search page, which may be the wrong company. Hence, scrapped info may be wrong.
## ACCRELIST LTD linkedin info seems wrong, but pdf and List of 90 Coy and SSIC's df seems correct (misaligned names!) ..
## TODO Probably should webscrape using the names from List of 90 Coy and SSIC's df instead of pdf names (since it's the source of truth for SSIC codes)?
df = pd.concat([test[~ test.ssic_code.isnull()], test2], axis = 0)
# NOTE Out of 90 coys, ..
## .. 18 has no LinkedIn content, ..
## .. 1 has PDF but not in List of 90 Coy and SSIC's df (ACMA LTD), ..
## .. 1 has PDF and in List of 90 Coy and SSIC's df (ACMA LTD) but partially different name (ACCRELIST LTD) [hence can't match to get SSIC codes!]
df = df[['Company', 'ssic_code', 'Text Content']][(df.ssic_code.notnull()) & (df['Text Content'].notnull())].reset_index(drop = True)

# just predicting up to Division level in the SSIC code
# df['ssic_code'] = df['ssic_code'].apply(lambda x: str(x)[0:2])

df.shape

(0, 3)

In [31]:
hello = df.copy()
hey = ssic.copy()

In [126]:
# NOTE DO NOT use this
q,w,e,r = recommendationModel(df = hello, ssic = hey, withSSIC = True, epochs = 40, vector_size = 64, top = 5, trainPercentage_0to1 = 0.7)
modelOutputs_final = q.copy()
modelStats_final = w.copy()
modelValidation_final = e.copy()
model_final = r

# TODO
# we talk about linkedin first for now
# what are the companies that are matched wrongly? any patterns? could be business user declaration and way of writing mismatches the business activity. maybe there's prob with source of truth. maybe sample issue skewed against certain ssic fields

Model Results:
Accuracy of Recommendation Model: 57.1%
Overall Similarity Score: 0.56 out of 1.00
Overall Stats (99 Divisions in Total):
mean    50.8
std     18.1
dtype: float64


In [132]:
modelResults_path = '/Users/Michael/Documents/GitHub/ssicsync/recommendationModels/modelResults.xlsx'
model_path = '/Users/Michael/Documents/GitHub/ssicsync/recommendationModels/model'

# Create an Excel writer object and write each DataFrame to a different sheet
with pd.ExcelWriter(modelResults_path) as writer:
    modelOutputs_final.to_excel(writer, sheet_name='Model Outputs', index=False)
    modelStats_final.to_excel(writer, sheet_name='Model Stats', index=False)
    modelValidation_final.to_excel(writer, sheet_name='Model Validation', index=False)

model_final.save(model_path)

In [133]:
qqqqqqqqqqqqqqqq = Doc2Vec.load(model_path)

In [141]:
train = test

In [None]:
import pandas as pd
import warnings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# df = pd.read_csv('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/Webscrape from Linkedin.csv')
df = pd.read_excel('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/AR Report Data/extracted_notes_pages_updated7.xlsx')
df2 = pd.read_excel('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/AR Report Data/extracted_notes_pages_updated8_pdfplumber.xlsx')
ssic = pd.read_excel('/Users/Michael/Documents/GitHub/ssicsync/ssic2020-detailed-definitions.xlsx', skiprows=[0,1,2,3], usecols=[0,1,3,5], dtype={'SSIC 2020': str})
ref = pd.read_csv('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/List of 90 Coy and SSIC.csv', dtype={'ssic_code': str})

In [None]:
for column in ssic.columns:
    ssic.loc[((ssic[column] == '<Blank>') | (ssic[column].isnull())), column] = ''
ssic = ssic[~ssic['SSIC 2020'].str.match(r'^[A-Z]$')]

# just predicting up to Division level in the SSIC code
# ssic['SSIC 2020'] = ssic['SSIC 2020'].apply(lambda x: str(x)[0:2])

# Define a cleaning function
def clean_text(text):
    if isinstance(text, str):
        # Replace newline characters with space
        text = text.replace('\n', ' ')
        # Replace special characters (bullet points) with a consistent delimiter
        text = text.replace('•', '- ')
        # Strip leading and trailing whitespace
        text = text.strip()
        # Replace multiple spaces with a single space
        text = ' '.join(text.split())
    return text

# Apply the cleaning function to the 'Info' column
ssic['Detailed Definitions'] = ssic['Detailed Definitions'].apply(clean_text)
ssic['Examples of Activities Classified Under this Code'] = ssic['Examples of Activities Classified Under this Code'].apply(clean_text)

ssic['textonly'] = ssic['SSIC 2020 Title'] + '.' + ssic['Detailed Definitions'] + '.' + ssic['Examples of Activities Classified Under this Code']
ssic = ssic[['SSIC 2020', 'textonly']]
ssic.columns = ['ssic_code', 'Text Content']
ssic = ssic[ssic['Text Content'].notnull()]

In [None]:
# from sklearn.model_selection import cross_val_score
# import numpy as np

# def recommendationModel(df, ssic, withSSIC=True, epochs=40, vector_size=64, top=5):
#     df['ssic_code'] = df['ssic_code'].astype(str).str.zfill(5)

#     # Train the model
#     def train_model(train_data):
#         textonly = train_data['Text Content']
#         text_tokens = [word_tokenize(t.lower()) for t in textonly]
#         tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(text_tokens)]
#         model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs)
#         model.build_vocab(tagged_data)
#         model.train(tagged_data, total_examples=model.corpus_count, epochs=epochs)
#         return model

#     # Evaluation function
#     def evaluate_model(model, test_data, train_data):
#         recsys_df = pd.DataFrame(columns=["Company", "actualSSIC", "recSSIC", "score", "actualTextContent", "recommendedTextContent"])
#         for index, series in test_data.iterrows():
#             test_doc = word_tokenize(series['Text Content'].lower())
#             test_vec = model.infer_vector(test_doc)
#             results = model.dv.most_similar(positive=[test_vec], topn=top)
#             for i, (idx, similarity_score) in enumerate(results):
#                 row = {
#                     "Company": series["Company"],
#                     "actualSSIC": series['ssic_code'],
#                     "recSSIC": train_data["ssic_code"][idx],
#                     "score": similarity_score,
#                     "actualTextContent": series["Text Content"],
#                     "recommendedTextContent": train_data["Text Content"][idx]
#                 }
#                 recsys_df = recsys_df.append(row, ignore_index=True)
#         recsys_df['rank'] = recsys_df.sort_values(by=['Company', 'score'], ascending=[True, False]).groupby('Company').cumcount() + 1
#         recsys_df['match'] = (recsys_df['actualSSIC'] == recsys_df['recSSIC']).astype(int)
#         accuracy = recsys_df['match'].mean()
#         return accuracy

#     accuracies = []
#     for train_index, test_index in KFold(n_splits=5).split(df):
#         train_data, test_data = df.iloc[train_index], df.iloc[test_index]
#         if withSSIC:
#             train_data = pd.concat([train_data, ssic], axis=0).reset_index(drop=True)
#         model = train_model(train_data)
#         accuracy = evaluate_model(model, test_data, train_data)
#         accuracies.append(accuracy)

#     mean_accuracy = np.mean(accuracies)
#     print(f'Average Cross-Validated Accuracy: {mean_accuracy * 100:.1f}%')

#     return model

# # Usage
# model = recommendationModel(df=hello, ssic=hey, withSSIC=True, epochs=40, vector_size=64, top=5)

In [None]:
def recommendationModel(df = None, ssic = None, withSSIC = True, epochs = 40, vector_size = 64, top = 5, trainPercentage_0to1 = 0.7):

    df['ssic_code'] = df['ssic_code'].astype(str).str.zfill(5)

    # 70-30 Train-Test split (randomised selection):
    train = df.sample(round(df.shape[0]*trainPercentage_0to1)).reset_index(drop = True)
    test = df[~df.Company.isin(train.Company)].reset_index(drop = True)

    if withSSIC == True:
        # NOTE include line below if want to add original SSIC text reference into training data:
        train = pd.concat([train, ssic], axis = 0).reset_index(drop = True)

    # Model training
    textonly = train['Text Content']
    text_tokens = [word_tokenize(t.lower()) for t in textonly]
    tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(text_tokens)]
    model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=epochs)

    # Parsing model outputs into a df (with dups)
    recsys_df = pd.DataFrame(columns = ["Company", "actualSSIC", "recSSIC", "score", "actualTextContent", "recommendedTextContent"])
    for index, series in test.iterrows():
        test_doc = word_tokenize(series['Text Content'].lower())
        test_vec = model.infer_vector(test_doc)
        results = model.docvecs.most_similar(positive=[test_vec], topn=top)
        for i, (index, similarity_score) in enumerate(results):
            coy = series["Company"]
            actualssic = series['ssic_code']
            recssic = train["ssic_code"][index]
            actualTextContent = series["Text Content"]
            recommendedTextContent = train["Text Content"][index]
            score = similarity_score
            row = [coy, actualssic, recssic, score, actualTextContent, recommendedTextContent]
            recsys_df.loc[len(recsys_df)] = row
    recsys_df['rank'] = recsys_df.sort_values(by = ['Company', 'score'], ascending=[True, False]).groupby('Company').cumcount()+1
    recsysForValidation_df = recsys_df.copy()
    training_df = train.copy()
    testing_df = test.copy()

    recsys_df.drop(columns = ["actualTextContent", "recommendedTextContent"], inplace = True)
    recsys_df['actualSSIC'] = recsys_df['actualSSIC'].apply(lambda x: x[0:2])
    recsys_df['recSSIC'] = recsys_df['recSSIC'].apply(lambda x: x[0:2])

    recsysForValidation_df['actualSSICDescription'] = ''
    recsysForValidation_df['recSSICDescription'] = ''
    for index, series in recsysForValidation_df.iterrows():
        ssicGrouped = ssic.groupby('ssic_code')['Text Content'].apply(lambda x: ' '.join(x)).reset_index()

        try:    
            actualSSICDescription = ssicGrouped[ssicGrouped.ssic_code == series.actualSSIC].reset_index(drop=True)['Text Content'].values[0]
            recsysForValidation_df.loc[index, 'actualSSICDescription'] = actualSSICDescription
        except Exception as e:
            print('actualSSICDescription:')
            print(ssicGrouped[ssicGrouped.ssic_code == series.actualSSIC].reset_index(drop=True)['Text Content'])
            print(f"Error: {e}")
            print(f"SSIC in Master List: {series.actualSSIC}")

        try:
            recSSICDescription = ssicGrouped[ssicGrouped.ssic_code == series.recSSIC].reset_index(drop=True)['Text Content'].values[0]
            recsysForValidation_df.loc[index, 'recSSICDescription'] = recSSICDescription
        except Exception as e:
            print('recSSICDescription:')
            print(ssicGrouped[ssicGrouped.ssic_code == series.actualSSIC].reset_index(drop=True)['Text Content'])
            print(f"Error: {e}")
            print(f"SSIC in Master List: {series.actualSSIC}")

    recsysForValidation_df = recsysForValidation_df[['Company', 'actualSSIC', 'recSSIC', 'score', 'rank', 'actualTextContent', 'recommendedTextContent', 'actualSSICDescription', 'recSSICDescription']]

    #########################################
    
    recsysAccuracy_df = recsys_df.copy()

    print('Model Results:')
    recsysAccuracy_df.loc[recsysAccuracy_df.actualSSIC == recsysAccuracy_df.recSSIC, 'match'] = 1
    recsysAccuracy_df.loc[recsysAccuracy_df.match != 1, 'match'] = 0
    recsysAccuracy_df = recsysAccuracy_df.sort_values(by = ['Company', 'match'], ascending=False).groupby('Company').head(1).reset_index(drop = True)
    matchAccuracy = len(recsysAccuracy_df[recsysAccuracy_df.match == 1].match)/recsysAccuracy_df.shape[0]
    print(f'Accuracy of Recommendation Model: {round(matchAccuracy*100,1)}%')

    print(f"Overall Similarity Score: {round(recsys_df.score.mean(),2)} out of 1.00")

    recsysStats_df = recsys_df.copy()
    recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].astype('int64')
    recsysStats_df = recsysStats_df.groupby('Company')['recSSIC'].agg(['mean', 'std']).reset_index()
    print(f"Overall Stats (99 Divisions in Total):\n{round(recsysStats_df.mean(),1)}")

    print_statements = {
    "Evaluation Metrics": ["Overall Accuracy", "Similarity Score (Average)", "Mean (99 Divisions in Total)", "Standard Deviation (99 Divisions in Total)"],
    "Values": [f"{matchAccuracy}", f"{recsys_df.score.mean()}", f"{recsysStats_df['mean'].mean()}", f"{recsysStats_df['std'].mean()}"]
    }
    modelResults_df = pd.DataFrame(print_statements)

    return recsys_df, recsysStats_df, recsysForValidation_df, training_df, testing_df, model, modelResults_df

In [181]:
# NOTE USE THIS

modelResults_path = '/Users/Michael/Documents/GitHub/ssicsync/recommendationModels/fullModelResults.xlsx'
model_path = '/Users/Michael/Documents/GitHub/ssicsync/recommendationModels/model'

accuracy = 0
while accuracy < 0.6:
    recsys_df, recsysStats_df, recsysForValidation_df, training_df, testing_df, model, modelResults_df = recommendationModel(df = hello, ssic = hey,
                                                                                                    withSSIC = True, epochs = 40, vector_size = 64,
                                                                                                    top = 5, trainPercentage_0to1 = 0.7)
    accuracy = float(modelResults_df.head(1)['Values'].values[0])

# Create files
with pd.ExcelWriter(modelResults_path) as writer:
    modelResults_df.to_excel(writer, sheet_name='Model Results', index=False)
    recsys_df.to_excel(writer, sheet_name='Model Outputs', index=False)
    recsysStats_df.to_excel(writer, sheet_name='Model Stats', index=False)
    recsysForValidation_df.to_excel(writer, sheet_name='Model Validation', index=False)
    training_df.to_excel(writer, sheet_name='Training Data', index=False)
    testing_df.to_excel(writer, sheet_name='Testing Data', index=False)
model.save(model_path)

# TODO
# we talk about linkedin first for now
# what are the companies that are matched wrongly? any patterns? could be business user declaration and way of writing mismatches the business activity. maybe there's prob with source of truth. maybe sample issue skewed against certain ssic fields

Model Results:
Accuracy of Recommendation Model: 28.6%
Overall Similarity Score: 0.57 out of 1.00
Overall Stats (99 Divisions in Total):
mean    50.0
std     17.7
dtype: float64
Model Results:
Accuracy of Recommendation Model: 19.0%
Overall Similarity Score: 0.56 out of 1.00
Overall Stats (99 Divisions in Total):
mean    47.5
std     17.0
dtype: float64
Model Results:
Accuracy of Recommendation Model: 38.1%
Overall Similarity Score: 0.57 out of 1.00
Overall Stats (99 Divisions in Total):
mean    46.0
std     18.8
dtype: float64
Model Results:
Accuracy of Recommendation Model: 33.3%
Overall Similarity Score: 0.55 out of 1.00
Overall Stats (99 Divisions in Total):
mean    53.9
std     18.6
dtype: float64
Model Results:
Accuracy of Recommendation Model: 23.8%
Overall Similarity Score: 0.55 out of 1.00
Overall Stats (99 Divisions in Total):
mean    51.8
std     20.6
dtype: float64
Model Results:
Accuracy of Recommendation Model: 28.6%
Overall Similarity Score: 0.54 out of 1.00
Overall Stat

In [None]:
i.head()

In [149]:
xxxxxxxxxxxxxxxxxxxx = pd.DataFrame(columns = ["Company", "actualSSIC", "recSSIC", "score", "actualTextContent", "recommendedTextContent"])
for index, series in test.iterrows():
    test_doc = word_tokenize(series['Text Content'].lower())
    test_vec = qqqqqqqqqqqqqqqq.infer_vector(test_doc)
    results = qqqqqqqqqqqqqqqq.docvecs.most_similar(positive=[test_vec], topn=5)
    for i, (index, similarity_score) in enumerate(results):
        coy = series["Company"]
        actualssic = series['ssic_code']
        recssic = o["ssic_code"][index]
        actualTextContent = series["Text Content"]
        recommendedTextContent = o["Text Content"][index]
        score = similarity_score
        row = [coy, actualssic, recssic, score, actualTextContent, recommendedTextContent]
        xxxxxxxxxxxxxxxxxxxx.loc[len(xxxxxxxxxxxxxxxxxxxx)] = row
xxxxxxxxxxxxxxxxxxxx['rank'] = xxxxxxxxxxxxxxxxxxxx.sort_values(by = ['Company', 'score'], ascending=[True, False]).groupby('Company').cumcount()+1
xxxxxxxxxxxxxxxxxxxx.head()

AttributeError: 'float' object has no attribute 'lower'