### LinkedIn Text Processing

In [297]:
import pandas as pd
import warnings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [298]:
df = pd.read_csv('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/Webscrape from Linkedin.csv')
ref = pd.read_csv('/Users/Michael/Documents/GitHub/ssicsync/Webscrap/List of 90 Coy and SSIC.csv')
ssic = pd.read_excel('/Users/Michael/Documents/GitHub/ssicsync/ssic2020-detailed-definitions.xlsx', skiprows=[0,1,2,3], usecols=[0,1,3,5])

  warn("""Cannot parse header or footer so it will be ignored""")


In [299]:
df.head()

Unnamed: 0,Company,Linkedin Page,Org Category,Org Website,Text Content
0,ABR HOLDINGS LIMITED,https://www.linkedin.com/company/abr-holdings-limited/about,Food & Beverages,http://www.abr.com.sg/,"ABR Holdings Limited (“ABR”) began as the owner and operator of the first full-service Swensen’s ice cream restaurant in Singapore back in 1979. The Swensen’s brand, with over 25 restaurants in Singapore, remains one of the market leaders in the western casual dining category and one of the preferred choices in good value family dining.\r\n\r\nBeyond Swensen’s and Earle Swensen's, we also manage and develop a portfolio of well-known food and beverage companies and brands. These include Season Confectionary & Bakery, Season’s Café, Tip Top Curry Puffs, Yogen Früz, and Chilli Padi Nonya Catering."
1,ABUNDANCE INTERNATIONAL LIMITED,https://www.linkedin.com/company/life-in-abundance-international/about,Non-profit Organizations,https://lifeinabundance.org,"Life In Abundance International (LIA) is an African-founded, African-led, and faith-based organization with over 25 years of experience bringing hope to more than a million people in need.\r\n\r\nLIA exists to empower local churches in Africa and the Caribbean to serve the poor and vulnerable, creating sustainable transformation in their communities."
2,ABUNDANTE LIMITED,https://www.linkedin.com/company/abundanteseguros/about,Insurance,http://www.abundanteseguros.com.br,Abundante Corretora de Seguros - Todos os Ramos
3,ACCRELIST LTD,https://www.linkedin.com/company/accrelist-ai-tech/about,IT Services and IT Consulting,http://www.weh.sg/,"Our Business scope:\r\n1. -Aesthestics Platform \r\n2. -Digital Marketing (FB, IG,Wechat)\r\n3. -Door Access System\r\n4. -Unmanned Store"
4,ACESIAN PARTNERS LIMITED,https://www.linkedin.com/company/acesiansystems/about,Information Technology & Services,http://www.acesiansystems.com,"Created in 2016 as part of the longer-established Acesian Partners group, Acesian Systems is a collective of personnel dedicated to serving the technical needs of the AV and IT Industries."


In [300]:
# Step 1: Remove leading and trailing whitespace
df['Text Content'] = df['Text Content'].str.strip()

# Step 2: Replace newline characters with spaces
df['Text Content'] = df['Text Content'].str.replace('\r\n', ' ', regex=False)
df['Text Content'] = df['Text Content'].str.replace('\n', ' ', regex=False)

# Step 3: Fill missing values with a placeholder
# df['Text Content'] = df['Text Content'].fillna('')

# Optional Step 4: Normalize text by converting to lowercase (if needed)
df['Text Content'] = df['Text Content'].str.lower()

df.head()

Unnamed: 0,Company,Linkedin Page,Org Category,Org Website,Text Content
0,ABR HOLDINGS LIMITED,https://www.linkedin.com/company/abr-holdings-limited/about,Food & Beverages,http://www.abr.com.sg/,"abr holdings limited (“abr”) began as the owner and operator of the first full-service swensen’s ice cream restaurant in singapore back in 1979. the swensen’s brand, with over 25 restaurants in singapore, remains one of the market leaders in the western casual dining category and one of the preferred choices in good value family dining. beyond swensen’s and earle swensen's, we also manage and develop a portfolio of well-known food and beverage companies and brands. these include season confectionary & bakery, season’s café, tip top curry puffs, yogen früz, and chilli padi nonya catering."
1,ABUNDANCE INTERNATIONAL LIMITED,https://www.linkedin.com/company/life-in-abundance-international/about,Non-profit Organizations,https://lifeinabundance.org,"life in abundance international (lia) is an african-founded, african-led, and faith-based organization with over 25 years of experience bringing hope to more than a million people in need. lia exists to empower local churches in africa and the caribbean to serve the poor and vulnerable, creating sustainable transformation in their communities."
2,ABUNDANTE LIMITED,https://www.linkedin.com/company/abundanteseguros/about,Insurance,http://www.abundanteseguros.com.br,abundante corretora de seguros - todos os ramos
3,ACCRELIST LTD,https://www.linkedin.com/company/accrelist-ai-tech/about,IT Services and IT Consulting,http://www.weh.sg/,"our business scope: 1. -aesthestics platform 2. -digital marketing (fb, ig,wechat) 3. -door access system 4. -unmanned store"
4,ACESIAN PARTNERS LIMITED,https://www.linkedin.com/company/acesiansystems/about,Information Technology & Services,http://www.acesiansystems.com,"created in 2016 as part of the longer-established acesian partners group, acesian systems is a collective of personnel dedicated to serving the technical needs of the av and it industries."


In [301]:
ref.rename(columns = {'entity_name': 'Company'}, inplace = True)
ref = ref.sort_values(by = ['Company', 'ssic_code']).drop_duplicates(subset = 'Company', keep = 'first')
ref = ref[['Company', 'ssic_code']]

test = pd.merge(df, ref, how = 'left', on = 'Company')

In [303]:
# NOTE There's discrepency in PDFs vs 'List of 90 Coy and SSIC' company names (12 companies missing) ..
# .. [Most were resolved by putting a '.' to List of 90 Coy and SSIC's df]
ref['Company'] = ref['Company'].apply(lambda x: x[:-1] if x[-1] == '.' else x)

test2 = pd.merge(test[test.ssic_code.isnull()], ref, how = 'left', on = 'Company')
test2.drop(columns='ssic_code_x', inplace = True)
test2.rename(columns={'ssic_code_y':'ssic_code'}, inplace = True)

In [305]:
# NOTE Web scrapping codes is taking the first profile in the search page, which may be the wrong company. Hence, scrapped info may be wrong.
## ACCRELIST LTD linkedin info seems wrong, but pdf and List of 90 Coy and SSIC's df seems correct (misaligned names!) ..
## TODO Probably should webscrape using the names from List of 90 Coy and SSIC's df instead of pdf names (since it's the source of truth for SSIC codes)?
df = pd.concat([test[~ test.ssic_code.isnull()], test2], axis = 0)

In [306]:
# NOTE Out of 90 coys, ..
## .. 18 has no LinkedIn content, ..
## .. 1 has PDF but not in List of 90 Coy and SSIC's df (ACMA LTD), ..
## .. 1 has PDF and in List of 90 Coy and SSIC's df (ACMA LTD) but partially different name (ACCRELIST LTD) [hence can't match to get SSIC codes!]
df = df[['Company', 'ssic_code', 'Text Content']][(df.ssic_code.notnull()) & (df['Text Content'].notnull())].reset_index(drop = True)

# just predicting up to Division level in the SSIC code
# df['ssic_code'] = df['ssic_code'].apply(lambda x: str(x)[0:2])

# 70-30 Train-Test split (randomised selection):
train = df.sample(round(df.shape[0]*0.7)).reset_index(drop = True)
test = df[~df.index.isin(train.index)].reset_index(drop = True)

In [307]:
df.shape

(70, 3)

In [308]:
train.shape

(49, 3)

In [309]:
test.shape

(21, 3)

In [310]:
train.head(1)

Unnamed: 0,Company,ssic_code,Text Content
0,KS ENERGY LIMITED,46541.0,"every day, we install the pipes and cables that power our lives—from lighting offices to heating homes. while this paints a pretty picture, our work is just pretty tough. we dig. we cut. we build. every time we climb into a protected hole, we’re taking a risk. the smallest mistake may cause an accident. this work demands safety, perfection, and efficiency. it’s mission: impossible. at ks, we live for it. we build energy infrastructure that you depend on. in this industry, everyone is good. we are better!”"


In [311]:
test.head(1)

Unnamed: 0,Company,ssic_code,Text Content
0,SERIAL SYSTEM LTD,46522.0,"serial system ltd started operations in 1988 as a distributor of electronics components in singapore. the company was incorporated as a private limited entity in april 1992 and listed in sesdaq in 1997. serial system ltd became a singapore exchange securities trading limited (sgx-st) mainboard listed company on 10 july 2000. the group has network of oversea offices, operations and representations in china, hong kong, south korea, taiwan, india, malaysia, thailand, philippines and vietnam. the group is a leading semiconductors/ components distributor in the asia pacific region, offering design supports, technology solutions and services, materials planning and inventory management. the group distributes a full range of active and passive components to original equipment manufacturers and sub-contractors in various industries and has forged strong business and technologies links with its customers. customers are provided with innovative value-added services such as turnkey design, warehousing and logistics support. the group's suppliers include well-established names such as texas instruments, on semiconductor, telegent system hk ltd, analog devices inc, nxp semiconductors singapore, silicon storage technology ltd, avago technologies international, tyco electronics, hitachi global storage technologies, micron semiconductor asia and mars."


In [312]:
for column in ssic.columns:
    ssic.loc[((ssic[column] == '<Blank>') | (ssic[column].isnull())), column] = ''
ssic = ssic[~ssic['SSIC 2020'].str.match(r'^[A-Z]$')]

# just predicting up to Division level in the SSIC code
# ssic['SSIC 2020'] = ssic['SSIC 2020'].apply(lambda x: str(x)[0:2])

# Define a cleaning function
def clean_text(text):
    if isinstance(text, str):
        # Replace newline characters with space
        text = text.replace('\n', ' ')
        # Replace special characters (bullet points) with a consistent delimiter
        text = text.replace('•', '- ')
        # Strip leading and trailing whitespace
        text = text.strip()
        # Replace multiple spaces with a single space
        text = ' '.join(text.split())
    return text

# Apply the cleaning function to the 'Info' column
ssic['Detailed Definitions'] = ssic['Detailed Definitions'].apply(clean_text)
ssic['Examples of Activities Classified Under this Code'] = ssic['Examples of Activities Classified Under this Code'].apply(clean_text)

ssic['textonly'] = ssic['SSIC 2020 Title'] + '.' + ssic['Detailed Definitions'] + '.' + ssic['Examples of Activities Classified Under this Code']
ssic = ssic[['SSIC 2020', 'textonly']]
ssic.columns = ['ssic_code', 'Text Content']
ssic = ssic[ssic['Text Content'].notnull()]

In [313]:
# NOTE include line below if want to add original SSIC text reference into training data:
train = pd.concat([train, ssic], axis = 0).reset_index(drop = True)

In [314]:
# Model training

textonly = train['Text Content']
text_tokens = [word_tokenize(t.lower()) for t in textonly]
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(text_tokens)]

model = Doc2Vec(vector_size=64, min_count=2, epochs=40)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=40)

In [315]:
# Parsing model outputs into a df

recsys_df = pd.DataFrame(columns = ["Company", "actualSSIC", "recSSIC", "score"])

for index, series in test.iterrows():

    test_doc = word_tokenize(series['Text Content'].lower())
    test_vec = model.infer_vector(test_doc)
    results = model.docvecs.most_similar(positive=[test_vec], topn=50)

    for i, (index, similarity_score) in enumerate(results):
        coy = series["Company"]
        actualssic = series['ssic_code']
        recssic = train["ssic_code"][index]
        score = similarity_score
        row = [coy, actualssic, recssic, score]
        recsys_df.loc[len(recsys_df)] = row

recsys_df.drop_duplicates(subset = ['Company', 'recSSIC'], keep = 'first', inplace = True)
recsys_df['rank'] = recsys_df.groupby('Company').cumcount()+1
recsys_df = recsys_df[recsys_df['rank'] <= 3]
recsys_df['actualSSIC'] = recsys_df['actualSSIC'].astype('Int64')

  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_similar(positive=[test_vec], topn=50)
  results = model.docvecs.most_sim

In [374]:
recsys_df

Unnamed: 0,Company,actualSSIC,recSSIC,score,rank
0,SERIAL SYSTEM LTD,46522,46522,0.940736,1
1,SERIAL SYSTEM LTD,46522,62021,0.738619,2
2,SERIAL SYSTEM LTD,46522,46523,0.73741,3
50,SHANAYA LIMITED,82110,82110,0.927824,1
51,SHANAYA LIMITED,82110,20131,0.607722,2
52,SHANAYA LIMITED,82110,42909,0.569224,3
100,SIA ENGINEERING COMPANY LIMITED,30302,30302,0.939599,1
101,SIA ENGINEERING COMPANY LIMITED,30302,95303,0.693227,2
102,SIA ENGINEERING COMPANY LIMITED,30302,26119,0.674959,3
150,SING INVESTMENTS & FINANCE LIMITED,64150,64150,0.937493,1


In [375]:
# For accuracy calculation (up to Division level)

recsysAccuracy_df = recsys_df.copy()

recsysAccuracy_df['actualSSIC'] = recsysAccuracy_df['actualSSIC'].apply(lambda x: str(x)[0:2])
recsysAccuracy_df['recSSIC'] = recsysAccuracy_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysAccuracy_df.loc[recsysAccuracy_df.actualSSIC == recsysAccuracy_df.recSSIC, 'match'] = 1
recsysAccuracy_df.loc[recsysAccuracy_df.match != 1, 'match'] = 0

recsysAccuracy_df = recsysAccuracy_df.sort_values(by = ['Company', 'match'], ascending=False).groupby('Company').head(1).reset_index(drop = True)
matchAccuracy = len(recsysAccuracy_df[recsysAccuracy_df.match == 1].match)/recsysAccuracy_df.shape[0]

print(f'Accuracy of Recommendation Model: {round(matchAccuracy*100,1)}%')

Accuracy of Recommendation Model: 76.2%


In [376]:
"""
<Company's correct SSIC code (up to Division level) found in top 3 recommendations>

Prediction Accuracy (with original SSIC text reference): 76.2% [better choice, proceed to analyse similarity score and overall stats below]
Prediction Accuracy (without original SSIC text reference): 61.9%
"""

"\n<Company's correct SSIC code (up to Division level) found in top 3 recommendations>\n\nPrediction Accuracy (with original SSIC text reference): 76.2% [better choice, proceed to analyse similarity score and overall stats below]\nPrediction Accuracy (without original SSIC text reference): 61.9%\n"

In [377]:
print(f"Overall Similarity Score: {round(recsys_df.score.mean(),2)} out of 1.00")

Overall Similarity Score: 0.72 out of 1.00


In [378]:
recsysStats_df = recsys_df.copy()

recsysStats_df['actualSSIC'] = recsysStats_df['actualSSIC'].apply(lambda x: str(x)[0:2])
recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].astype('int64')
recsysStats_df = recsysStats_df.groupby('Company')['recSSIC'].agg(['mean', 'std']).reset_index()
recsysStats_df['mean'] = round(recsysStats_df['mean'],1)
recsysStats_df['std'] = round(recsysStats_df['std'],1)
print(f"Overall Stats (99 Divisions in Total):\n{round(recsysStats_df.mean(),1)}")
recsysStats_df

Overall Stats (99 Divisions in Total):
mean    52.3
std     17.5
dtype: float64


Unnamed: 0,Company,mean,std
0,CHEMICAL INDUSTRIES (FAR EAST) LIMITED,45.7,25.5
1,CREATIVE TECHNOLOGY LTD,58.0,20.8
2,HOE LEONG CORPORATION LTD,34.3,11.2
3,MANUFACTURING INTEGRATION TECHNOLOGY LTD,34.0,10.6
4,NOEL GIFTS INTERNATIONAL LTD,46.3,12.7
5,PSC CORPORATION LTD,51.3,9.2
6,SAKAE HOLDINGS LTD,61.7,18.2
7,SERIAL SYSTEM LTD,51.3,9.2
8,SHANAYA LIMITED,48.0,31.4
9,SIA ENGINEERING COMPANY LIMITED,50.3,38.7


In [381]:
# For accuracy calculation (all levels)

recsysAccuracy_df = recsys_df.copy()

# recsysAccuracy_df['actualSSIC'] = recsysAccuracy_df['actualSSIC'].apply(lambda x: str(x)[0:2])
# recsysAccuracy_df['recSSIC'] = recsysAccuracy_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysAccuracy_df.loc[recsysAccuracy_df.actualSSIC == recsysAccuracy_df.recSSIC, 'match'] = 1
recsysAccuracy_df.loc[recsysAccuracy_df.match != 1, 'match'] = 0

recsysAccuracy_df = recsysAccuracy_df.sort_values(by = ['Company', 'match'], ascending=False).groupby('Company').head(1).reset_index(drop = True)
matchAccuracy = len(recsysAccuracy_df[recsysAccuracy_df.match == 1].match)/recsysAccuracy_df.shape[0]

print(f'Accuracy of Recommendation Model: {round(matchAccuracy*100,1)}%')

print(f"Overall Similarity Score: {round(recsys_df.score.mean(),2)} out of 1.00")
recsysStats_df = recsys_df.copy()

# recsysStats_df['actualSSIC'] = recsysStats_df['actualSSIC'].apply(lambda x: str(x)[0:2])
# recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].apply(lambda x: str(x)[0:2])

recsysStats_df['recSSIC'] = recsysStats_df['recSSIC'].astype('int64')
recsysStats_df = recsysStats_df.groupby('Company')['recSSIC'].agg(['mean', 'std']).reset_index()
recsysStats_df['mean'] = round(recsysStats_df['mean'],1)
recsysStats_df['std'] = round(recsysStats_df['std'],1)
print(f"Overall Stats (99,090 Sub-classes in total):\n{round(recsysStats_df.mean(),1)}")
recsysStats_df

Accuracy of Recommendation Model: 71.4%
Overall Similarity Score: 0.72 out of 1.00
Overall Stats (99,090 Sub-classes in total):
mean    52167.0
std     18244.7
dtype: float64


Unnamed: 0,Company,mean,std
0,CHEMICAL INDUSTRIES (FAR EAST) LIMITED,45955.0,25502.8
1,CREATIVE TECHNOLOGY LTD,58648.7,21088.1
2,HOE LEONG CORPORATION LTD,34617.3,11369.9
3,MANUFACTURING INTEGRATION TECHNOLOGY LTD,34314.3,10777.0
4,NOEL GIFTS INTERNATIONAL LTD,36777.3,29183.6
5,PSC CORPORATION LTD,51814.7,8841.0
6,SAKAE HOLDINGS LTD,62113.3,18627.2
7,SERIAL SYSTEM LTD,51688.7,8948.1
8,SHANAYA LIMITED,48383.3,31350.0
9,SIA ENGINEERING COMPANY LIMITED,50574.7,38792.3
