In [1]:
import pandas as pd
import random
import numpy as np

# Building Professional contacts

For children, we assign a number of contacts in school.
Source: Donnees_Emploi_par_taille_unite_legale_et_secteur_2009_et_2010.xls


For adults, we assign a line of business and a number of contacts at work, or `(NaN,0)` if they don't have a job.
Source: https://www.insee.fr/fr/statistiques/2582785?sommaire=2587886 (population active, emploi par activité)

En France, en 2015, 66 453 600 est la population totale. Population active: 28,7 millions de personnes de 15 ans et plus, dont 25,8 millions d’actifs ayant un emploi.
Raccourci : si nous considérons comme actifs les personnes de [20,60] ans, qui représentent 50% de la population totale, nous pouvons en déduire que les "actifs" (selon notre définition) correspondent à 67% des adultes. (big approximation...)

For adults, we simulate how they are distributed among companies:
* 33% don't work (no job, elderly people, etc.)
* 67% have a job


In [2]:
# Global variables
NB_WORKING_ADULTS=67 # percentage of adults with jobs
SCHOOL_CONTACT_MIN, SCHOOL_CONTACT_MAX = 10,100 # bounds of the uniform draw of school contacts per child
POP_TOTALE=66453600 # French people (total population)
PROPORTION_OF_CONTACTS_AMONG_COLLEAGUES=0.3 # share of a company's size counted as contacts of a worker

# Every draw in this notebook goes through `random` or `np.random`;
# seeding both makes a Restart & Run All reproduce the same population.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Load the household table (one row per household) from the working directory
df = pd.read_csv('households.csv')

In [4]:
# The first CSV column has no header; name it household_id
df = df.rename(columns={'Unnamed: 0': 'household_id'})
df

Unnamed: 0,household_id,nb_children,nb_adults,type,size
0,0,2,2,two_parent_family,4
1,1,1,2,two_parent_family,3
2,2,0,3,two_parent_family,3
3,3,2,2,two_parent_family,4
4,4,2,2,two_parent_family,4
5,5,0,3,two_parent_family,3
6,6,0,3,two_parent_family,3
7,7,1,2,two_parent_family,3
8,8,1,2,two_parent_family,3
9,9,2,2,two_parent_family,4


In [5]:
# Population P: head count (children + adults) summed over all households
P = df[['nb_children', 'nb_adults']].to_numpy().sum()

## From households to adults (nodes)

In [6]:
# One row per adult: each household_id is repeated nb_adults times.
# Columns are addressed by name, so the result does not depend on the
# column order of the CSV file.
household = df['household_id'].repeat(df['nb_adults']).tolist()
# adult ids are consecutive integers starting at 0
adults = list(range(len(household)))
df_adults = pd.DataFrame({'household_id' : pd.Series(household),
     'adult_id' : pd.Series(adults)})

In [7]:
# Show the last rows: the final adult_id and the household it belongs to
df_adults.tail()

Unnamed: 0,household_id,adult_id
6955,3797,6955
6956,3798,6956
6957,3799,6957
6958,3800,6958
6959,3801,6959


## Adults with a job

In [8]:
# random.sample() picks distinct items, so no adult can be selected twice.
# The sample size is NB_WORKING_ADULTS percent of the adults, rounded.
nb_working = round(len(adults)*NB_WORKING_ADULTS/100)
working = random.sample(adults, nb_working)
len(working)

4663

* job: we assign a line of business according to the proportion of the French working population employed in each line of business
   [insee](https://www.insee.fr/fr/statistiques/2569348?sommaire=2587886),Emploi par activité, Personnels et équipements de santé
   
* size_company: drawn from a continuous uniform distribution whose bounds depend on the type of company (cf. their size in our xls file)
   `Donnees_Emploi_par_taille_unite_legale_et_secteur_2009_et_2010.xls`
   
* contacts_number: a proportion PROPORTION_OF_CONTACTS_AMONG_COLLEAGUES of size_company

* company_id: Starting from the number of companies in each line of business in France, we scale it down to our population P. Then we randomly assign (uniform distribution) a company_id to each worker
   Nb_companies per business line : https://www.insee.fr/fr/statistiques/1893274#consulter 
   For food shop https://www.insee.fr/fr/statistiques/2015051
   & files in directory Sources: `base-etablissements par secteur d'activite-2014.xls`, `equip-serv-sante-com-2018.xls`


In [9]:
# Totals working people in France: 25 844 000
#
# One row per line of business:
#   (upper bound of the sector's slice of the dice, in %,
#    job label,
#    smallest company size, largest company size,
#    number of companies of that sector in France)
# The slices are contiguous: a sector's share of the workers is the gap
# between its upper bound and the previous one.
SECTORS = [
    # Agriculture, forestry and fishing
    (2.7, 'Agriculture_fishing', 1, 10, 638739),
    # Industry: food sub-category
    (5.3, 'Indus_food', 50, 250, 68826),
    # Industry: all the other sub-categories together
    (16.7, 'Indus_other', 50, 250, 262885),
    # Construction
    (23.1, 'Construction', 1, 10, 557809),
    # Tertiary sector.
    # Wholesale and retail trade, transport, accommodation and catering:
    # non-food shops, mechanics
    (35.6, 'Shops_other', 1, 50, 825371),
    # food: grocery stores (45031 shops) and market places (86074 shops)
    # https://www.insee.fr/fr/statistiques/2015051
    # 64 502 + 13 120 workers = 0.3% of working people
    (35.9, 'Shops_market_food', 1, 5, 131105),
    # accommodation and catering
    (39.7, 'Hotel_Restaurant', 10, 500, 304033),
    # Public administration, education, human health and social work,
    # with the health establishments (counted in the next row) subtracted
    (62.0, 'Administration_schools', 20, 500, 584348-51322),
    # health sector, same parent category as the previous row (slice of 8.9 points)
    (70.9, 'Health', 20, 1000, 51322),
    # transport and storage; 5000? Airports?
    (76.5, 'Transportation', 5, 5000, 132287),
    # the other tertiary categories, remote working possible (approximation);
    # 5000? CNRS? Orange? etc.
    (100.0, 'Services_other', 1, 5000, 1661828),
]

job=[]
size_company=[]
contacts_number=[]
company_id=[]
for _ in working:
    # dice is uniform over 0.1, 0.2, ..., 100.0
    dice = random.randint(1, 1000)/10
    # first sector whose upper bound is not exceeded; the last bound (100.0) always matches
    _, label, size_min, size_max, nb_companies_france = next(s for s in SECTORS if dice <= s[0])

    # company size: continuous uniform draw, rounded to an integer
    size = round(np.random.uniform(size_min, size_max))

    # number of companies of this sector scaled down to our population P;
    # at least one, so that a company can always be drawn
    nb_companies = max(1, int(round(P*nb_companies_france/POP_TOTALE)))

    job.append(label)
    size_company.append(size)
    contacts_number.append(round(size*PROPORTION_OF_CONTACTS_AMONG_COLLEAGUES))
    # ids 0 .. nb_companies-1, all equally likely. Rounding a continuous
    # uniform draw instead would create one extra id and make both end ids
    # half as likely as the others.
    # NOTE: ids restart at 0 in every sector, so a company is identified by
    # the pair (job label, company_id), not by company_id alone.
    company_id.append(random.randrange(nb_companies))
    

In [10]:
# One row per working adult; the four lists were filled in the same order as `working`
job_columns = {
    'job_cat': pd.Series(job),
    'adult_id': pd.Series(working),
    'pro_contacts': pd.Series(contacts_number),
    'company_id': pd.Series(company_id),
}
df_job = pd.DataFrame(job_columns)

In [11]:
# Outer join on adult_id: every record of both tables is kept, matched where
# possible. Where one side has no match, its columns are filled with null --
# this is how adults without a job get NaN in the job columns.
df_r = pd.merge(df_adults, df_job, how='outer', on='adult_id')
df_r

Unnamed: 0,household_id,adult_id,job_cat,pro_contacts,company_id
0,0,0,Indus_other,52.0,6.0
1,0,1,Hotel_Restaurant,114.0,21.0
2,1,2,Shops_other,14.0,45.0
3,1,3,Administration_schools,33.0,10.0
4,2,4,Services_other,770.0,34.0
5,2,5,Shops_other,5.0,1.0
6,2,6,,,
7,3,7,Shops_other,9.0,20.0
8,3,8,Administration_schools,147.0,57.0
9,4,9,Services_other,273.0,68.0


In [12]:
# replace NaN in the pro_contacts Series by a zero.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df_r['pro_contacts'] acts on a temporary object and leaves df_r unchanged
# when pandas Copy-on-Write is active.
df_r['pro_contacts'] = df_r['pro_contacts'].fillna(0)

In [13]:
# Inspect the column dtypes of the merged table
df_r.dtypes

household_id      int64
adult_id          int64
job_cat          object
pro_contacts    float64
company_id      float64
dtype: object

In [14]:
# convert to integer dtypes
# These three columns hold no missing value at this point: downcast them to
# the smallest integer dtype that fits.
for column in ('household_id', 'adult_id', 'pro_contacts'):
    df_r[column] = pd.to_numeric(df_r[column], downcast='integer')
# company_id is NaN for adults without a job, so a plain downcast would leave
# it as float64 (written as e.g. 6.0 in the CSV). The nullable 'Int64' dtype
# stores integers and keeps the missing values.
df_r['company_id'] = df_r['company_id'].astype('Int64')

In [15]:
# Save the adults table; index=False leaves the DataFrame index out of the file
df_r.to_csv('pro_contacts_adults.csv', index=False)

## Children

In [16]:
# One row per child: each household_id is repeated nb_children times.
# Columns are addressed by name, so the result does not depend on the
# column order of the CSV file.
household = df['household_id'].repeat(df['nb_children']).tolist()
# ids of children start after the last adult id
a = len(adults)
children = list(range(a, a + len(household)))
df_children = pd.DataFrame({'household_id' : pd.Series(household),
     'child_id' : pd.Series(children)})

In [17]:
# Number of children generated
len(children)

3085

In [18]:
# Display the children table (one row per child, with its household)
df_children

Unnamed: 0,household_id,child_id
0,0,6960
1,0,6961
2,1,6962
3,3,6963
4,3,6964
5,4,6965
6,4,6966
7,7,6967
8,8,6968
9,9,6969


### Number of contacts in school

[Source](https://www.insee.fr/fr/statistiques/2569394?sommaire=2587886) 
Schools have various sizes, but... we simplify...

* Hypothesis: Continuous uniform distribution of contacts in schools between a min and a max

In [19]:
# One independent draw per child: a continuous uniform value between
# SCHOOL_CONTACT_MIN and SCHOOL_CONTACT_MAX, rounded to an integer
n_pupils = len(df_children['child_id'])
schools_size = []
for _ in range(n_pupils):
    drawn = np.random.uniform(SCHOOL_CONTACT_MIN, SCHOOL_CONTACT_MAX)
    schools_size.append(round(drawn))

In [20]:
# Attach the drawn number of contacts to each child (aligned on the row index)
df_children = df_children.assign(school_contacts=pd.Series(schools_size))

In [21]:
# Display the children table, which now includes the school_contacts column
df_children

Unnamed: 0,household_id,child_id,school_contacts
0,0,6960,35
1,0,6961,58
2,1,6962,92
3,3,6963,25
4,3,6964,85
5,4,6965,15
6,4,6966,18
7,7,6967,72
8,8,6968,31
9,9,6969,16


### Assign school id to nodes

[Source](https://www.insee.fr/fr/statistiques/2569394?sommaire=2587886)
En 2015: 51 745 établissements en primaire et 11 331 en secondaire, 75 universités, une centaine de grandes écoles -> ~ 63000 établissements pour une population de 66 millions d'habitants.

* Hypothèse 1 : équi-répartition des établissements + approximation des chiffres. Pour une population `P`, `P*63000/66000000`
* Hypothèse 2 : Répartition uniforme des âges parmi les enfants -> autant de chance d'être à l'université qu'en maternelle.
* Hypothèse 3: on ne tient pas compte des fratries

In [22]:
# ~63000 schools for ~66000000 inhabitants, scaled down to our population P.
# Keep at least one school: for a small P the ratio rounds to 0, and no
# school could then be drawn for the children.
nb_schools = max(1, round(P*63000/66000000))
print("Population: %d, Nb schools: %d" %(P,nb_schools))

Population: 10045, Nb schools: 10


In [23]:
# Draw a school uniformly among ids 0 .. nb_schools-1, independently for each child
school_ids = range(nb_schools)
assignment = [random.choice(school_ids) for _ in df_children.index]


In [24]:
# Attach the drawn school to each child (aligned on the row index)
df_children = df_children.assign(school_id=pd.Series(assignment))

In [25]:
# Downcast every column to the smallest integer dtype that can hold its values
for column in ('household_id', 'child_id', 'school_contacts', 'school_id'):
    df_children[column] = pd.to_numeric(df_children[column], downcast='integer')
df_children

Unnamed: 0,household_id,child_id,school_contacts,school_id
0,0,6960,35,2
1,0,6961,58,6
2,1,6962,92,5
3,3,6963,25,9
4,3,6964,85,1
5,4,6965,15,9
6,4,6966,18,1
7,7,6967,72,7
8,8,6968,31,2
9,9,6969,16,2


In [26]:
# Save the children table; index=False leaves the DataFrame index out of the file
df_children.to_csv('pro_contacts_children.csv', index=False)

# Let's see



In [27]:
# Number of workers per line of business (count of non-null company_id)
df_r.groupby('job_cat')[['company_id']].count()

Unnamed: 0_level_0,company_id
job_cat,Unnamed: 1_level_1
Administration_schools,1039
Agriculture_fishing,123
Construction,324
Health,432
Hotel_Restaurant,178
Indus_food,134
Indus_other,518
Services_other,1082
Shops_market_food,15
Shops_other,587


* Seems quite realistic, doesn't it?

In [28]:
# Summary statistics of the number of professional contacts, per line of business
df_r.groupby('job_cat')[['pro_contacts']].describe()

Unnamed: 0_level_0,pro_contacts,pro_contacts,pro_contacts,pro_contacts,pro_contacts,pro_contacts,pro_contacts,pro_contacts
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
job_cat,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Administration_schools,1039.0,77.205005,41.558295,6.0,40.0,76.0,114.0,150.0
Agriculture_fishing,123.0,1.747967,0.892533,0.0,1.0,2.0,2.0,3.0
Construction,324.0,1.728395,0.779177,0.0,1.0,2.0,2.0,3.0
Health,432.0,159.361111,85.495793,6.0,83.75,159.0,234.5,300.0
Hotel_Restaurant,178.0,73.88764,42.510999,4.0,41.25,71.0,113.5,149.0
Indus_food,134.0,45.350746,17.450793,15.0,31.25,41.0,61.75,75.0
Indus_other,518.0,45.137066,17.01821,15.0,30.0,44.0,60.0,75.0
Services_other,1082.0,767.502773,417.692773,1.0,421.5,769.5,1123.75,1500.0
Shops_market_food,15.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Shops_other,587.0,7.640545,4.288479,0.0,4.0,8.0,11.0,15.0


* The mean of contacts number may be too large for some large companies.... 
* Maybe, we should add customers in shops?

> **Or maybe this way to compute professional contacts doesn't make sense!!!**

Feel free to propose something else, or just use a graph generator such as preferential attachment ;-)