# Creation of Work Layer

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
from tqdm.auto import tqdm
pd.set_option('display.max_columns', None)

### Dataset
###### Variables: 
- PersonNr
- ArbstId
- CfarNr_LISA
- AstNr_LISA
- KU1CfarNr
- ArbTid
- YrkStalln

In [2]:
#csv_data = 'work_eco_data.csv' #about 66 000
#csv_data_small = 'work_eco_data_s.csv' #about 200 000
csv_data_medium = 'work_eco_data_m.csv' #about 4M

# Read the CSV file into a DataFrame
data = pd.read_csv(csv_data_medium)
data.head()

Unnamed: 0,PersonNr,CfarNr_LISA,ArbstId,AstNr_LISA,AstKommun,AstLan,KU1PeOrgNr,KU1CfarNr,KU1AstNr,KU1AstKommun,KU1AstLan,KU1YrkStalln,KU2PeOrgNr,KU2CfarNr,KU2AstNr,KU2AstKommun,KU2AstLan,KU2YrkStalln,KU3PeOrgNr,KU3CfarNr,KU3AstNr,KU3AstKommun,KU3AstLan,KU3YrkStalln,SyssStat,ArbTid,YrkStalln,KU1lnk,KU2lnk,KU3lnk,Raks_SummaInk,Raks_Huvudanknytning,Raks_EtablGrad,Raks_Forvink
0,19461004-7721,858393992,85839399270172Kalmar490821-3993,70172,Kalmar,Kalmar county,490821-3993,858393992,70172,Kalmar,Kalmar county,1,851103-2089,779393687,94846,Mörbylånga,Kalmar county,1,-,-,-,0,0,-,5,2.0,2.0,1150,447,0,3370,1,1.0,0
1,19690213-8710,-,--0000-,-,0000,00,-,-,-,0000,00,-,-,-,-,0000,00,-,-,-,-,0,0,-,6,,,0,0,0,3208,6,,0
2,19741231-1992,-,--0000-,-,0000,00,-,-,-,0000,00,-,-,-,-,0000,00,-,-,-,-,0,0,-,6,,,0,0,0,5202,5,,0
3,19371201-5418,881535518,88153551863062Sundbyberg921231-3036,63062,Sundbyberg,Stockholm county,921231-3036,881535518,63062,Sundbyberg,Stockholm county,1,-,-,-,0000,00,-,-,-,-,0,0,-,1,3.0,0.0,2506,0,0,2506,7,,0
4,19411130-2845,169250916,16925091600512Sundbyberg660511-4964,00512,Sundbyberg,Stockholm county,660511-4964,169250916,00512,Sundbyberg,Stockholm county,1,660517-6177,592438972,67231,Upplands-Bro,Stockholm county,1,-,-,-,0,0,-,1,5.0,4.0,1834,1292,0,3126,6,,0


Start by connecting people using just 'CfarNr'
- 'CfarNr' gives us all the people who work for the same company.
- Although this alone cannot be used to determine if people know each other, it may be useful for analysis in later stages e.g. how well certain companies pay...

In [3]:
def same_company(data): #check if people work for the same company

    start_time = time.time()

    #remove rows where 'CfarNr_LISA' is missing (equal to '-')
    data = data[data['CfarNr_LISA'] != '-']

    #dictionary mapping 'CfarNr_LISA' to a set of 'PersonNr'
    company_persons = data.groupby('CfarNr_LISA')['PersonNr'].apply(set).to_dict()

    results = []

    #tqdm progress bar
    progress_bar = tqdm(total=len(company_persons), desc='Processing companies')

    for company, persons in company_persons.items():
        #get pairs of 'PersonNr' working for the same company
        pairs = [(person1, person2) for person1 in persons for person2 in persons if person1 < person2]
        results.extend(pairs)
        #update progress bar
        progress_bar.update(1)


    progress_bar.close()
    end_time = time.time()

    results_data = pd.DataFrame(results, columns=['Person1', 'Person2'])
    results_data['Connection'] = 'works for same company'


    print(f"Time elapsed: {end_time - start_time:.2f} seconds") #takes about 30 seconds for 4M dataset

    return results_data


In [4]:
same_company(data)

Processing companies:   0%|          | 0/718091 [00:00<?, ?it/s]

Time elapsed: 34.17 seconds


Unnamed: 0,Person1,Person2,Connection
0,19740905-6696,19840212-6176,works for same company
1,19740905-6696,19840101-7291,works for same company
2,19740905-6696,19781101-8234,works for same company
3,19610316-3976,19740905-6696,works for same company
4,19610316-3976,19731202-0979,works for same company
...,...,...,...
14771763,19570628-9070,19750727-4959,works for same company
14771764,19570628-9070,19630821-1804,works for same company
14771765,19700422-0706,19831207-6596,works for same company
14771766,19631126-2797,19700422-0706,works for same company


### Working Ties Approach 1:
###### Variables: 'PersonNr','ArbstId'
- 'ArbstId' is an identifier which combines 4 variables: 'CfarNr_LISA', 'KU1AstNr', 'AstKommun' and 'KU1PerOrgNr'

In [5]:
def work_mates(data): #check ArbstId
    start_time = time.time()

    #remove rows where 'ArbstId' is missing (equal to '--0000-')
    data = data[data['ArbstId'] != '--0000-']

    #dictionary mapping 'ArbstId' to a set of 'PersonNr'
    persons = data.groupby('ArbstId')['PersonNr'].apply(set).to_dict()

    results = []

    #tqdm progress bar
    progress_bar = tqdm(total=len(persons), desc='Processing')

    for company, persons in persons.items():
        #get pairs of 'PersonNr' with the same ArbstId
        pairs = [(person1, person2) for person1 in persons for person2 in persons if person1 < person2]
        results.extend(pairs)
        #update progress bar
        progress_bar.update(1)


    progress_bar.close()
    


    results_data = pd.DataFrame(results, columns=['Person1', 'Person2'])
    results_data['Connection'] = 'workmates'

    end_time = time.time()
    print(f"Time elapsed: {end_time - start_time:.2f} seconds") #takes about 100 seconds for 4M dataset


    return results_data

In [6]:
work_mates(data)

Processing:   0%|          | 0/3777620 [00:00<?, ?it/s]

Time elapsed: 107.94 seconds


Unnamed: 0,Person1,Person2,Connection
0,19360827-2238,19471026-6937,workmates
1,19641225-0860,19760314-4678,workmates
2,19290117-7366,19300115-7276,workmates
3,19300221-0638,19631231-1502,workmates
4,19520419-7929,19630708-1057,workmates
5,19410201-6628,19790914-8219,workmates
6,19290218-4636,19571118-1123,workmates
7,19530719-2518,19801231-8441,workmates
8,19590109-9572,19760403-2507,workmates
9,19660310-5998,19771014-8153,workmates


In [7]:
#save results in CSV
results_one = work_mates(data)
csv_file_path = 'working_ties_one.csv'
results_one.to_csv(csv_file_path, index=False)

Processing:   0%|          | 0/3777620 [00:00<?, ?it/s]

Time elapsed: 107.24 seconds


### Working Ties Approach 2:
###### Variables: 'PersonNr', 'CfarNr_LISA', 'AstNr_LISA', 'KU1CfarNr', 'ArbTid' and 'YrkStalln'
- If two people work for the same company (CfarNr_LISA), at the same branch (AstNr_LISA), spend around the same hours at work (ArbTid) and have the same occupation (YrkStalln), get their main income from the same company (KU1CfarNr) they are workmates.

In [8]:
def group_map(data):
    #remove rows where 'CfarNr_LISA', 'AstNr_LISA', 'KU1CfarNr', 'ArbTid' and 'YrkStalln' are missing (equal to '-')
    data = data[(data['CfarNr_LISA'] != '-') & (data['AstNr_LISA'] != '-') 
                & (data['KU1CfarNr'] != '-') & (data['ArbTid'] != ' ')
                & (data['YrkStalln'] != ' ')]

    #group by the variables and create dictionary mapping to sets of 'PersonNr'
    grouped_persons = data.groupby(['CfarNr_LISA', 'KU1CfarNr', 'AstNr_LISA', 'ArbTid','YrkStalln'])['PersonNr'].apply(set).to_dict()

    return grouped_persons

In [9]:
# same company, branch, primary source of income, work time and occupation
def workmates(data):
    
    start_time = time.time()

    company_persons = group_map(data)
    results = []

    #tqdm progress bar
    progress_bar = tqdm(total=len(company_persons), desc='Processing')

    for (company, branch, ku_one, work_time, occupation), persons in company_persons.items():
        #pairs of 'PersonNr' working for same company and branch
        pairs = [(person1, person2) for person1 in persons for person2 in persons if person1 < person2]
        results.extend(pairs)
        progress_bar.update(1)


    progress_bar.close()
    end_time = time.time()


    results_data = pd.DataFrame(results, columns=['Person1', 'Person2'])
    results_data['Connection'] = 'workmates'


    print(f"Time elapsed: {end_time - start_time:.2f} seconds") #takes about 150 seconds for 4M dataset

    return results_data


In [10]:
workmates(data)

Processing:   0%|          | 0/3777464 [00:00<?, ?it/s]

Time elapsed: 158.70 seconds


Unnamed: 0,Person1,Person2,Connection
0,19381130-5137,19440921-3581,workmates
1,19610722-4992,19751231-3565,workmates
2,19590329-3538,19670310-2286,workmates
3,19391103-1631,19700416-7931,workmates
4,19600913-3042,19870505-2640,workmates
...,...,...,...
164,19560602-3888,19860508-7716,workmates
165,19511125-4057,19650508-4111,workmates
166,19710130-5988,19860207-1785,workmates
167,19330104-9422,19850703-4158,workmates


In [11]:
results_two = workmates(data)
csv_file_path = 'working_ties_two.csv'
results_two.to_csv(csv_file_path, index=False)

Processing:   0%|          | 0/3777464 [00:00<?, ?it/s]

Time elapsed: 163.20 seconds
