# Preparation for User validation. Obtaining User's Job Titles and Income 

- In this notebook, I am preparing the validation data. This includes:
    - fetching the job titles and income of users via N-gram frequency analysis. 
    - subsequent data cleaning of manually inspected job title files to prep for model comparison and user estimate validation
- Final result here is the finished file of users and their job titles and income

In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
import matplotlib
import spacy
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

import importlib
# Local application imports
sys.path.insert(0, '../Utility files')
import utils2
from utils2 import *

### N-grams + frequency analysis

In [2]:
# # Load the dimension data

path = '/home/livtollanes/NewData/coordinates/dim_X/'
file = '/m1_dimx_users_bios.csv'
bios = pd.read_csv(path+file, index_col=0)

#Sort the df from high to low according to the first dimenion
bios = bios.sort_values(by='0', ascending=False)
#bios['follower_id'].nunique() #115079

115079

#### Obtain the most frequent n-grams in bios
Save to separate files. These are manually inspected in a separate file. Job related ngras are saved, and later manually given PCS-ESE codes

In [None]:
importlib.reload(utils2)
all_descriptions = ' '.join(bios['description_cleantext'])
ngram_freq = utils2.get_ngram_freq(all_descriptions,n=1)


# Get the most common bigrams
#most_common_unigrams = ngram_freq.most_common(4000)

#Write the ngrams to csv
# importlib.reload(utils2)
# utils2.write_ngrams_to_csv(most_common_unigrams, 'unigrams.csv')
# utils2.write_ngrams_to_csv(most_common_bigrams, 'bigrams.csv')
# utils2.write_ngrams_to_csv(most_common_trigrams, 'trigrams.csv')
# utils2.write_ngrams_to_csv(unigrams_last1000, 'uni_last1000.csv')

### Data soure 1: Filter bios based on ngrams from bios

- Here, I am filtering the user bios by looking for overlapping ngrams. The ngrams I use here stem from people's bios, so identifying the common ways of referring to a job title. 
- This is the ngram matching approach that yields the most amount of users, since it reflects people's everyday language. 
- Eventually, the final df from this part needs to be annotated, deleting poorly identified bios. 

In [2]:
#Load the file containing all relevant ngrams
path = '/home/livtollanes/NewData/annotations/'
file = 'keywords_INSEE.csv'
job_titles = pd.read_csv(path+file, header=0)

In [10]:
#Preprocessing of the key word file

#rename columns
job_titles = job_titles.rename(columns={'n_gram': 'key_word', 'PCS_ESE_code': 'PCS_ESE'})

#Sort job titles alphabetically and drop unnecessary cols
job_titles = job_titles.sort_values('key_word')
job_titles = job_titles.drop('ngram', axis=1)
job_titles = job_titles.drop('count', axis=1)

#dropping the ocs category just for now. might add it later - if income validation is not enough
job_titles = job_titles.drop('pcs_category_numbers', axis=1)
job_titles = job_titles.drop('pcs_category_name', axis=1)

#Syarting with 547 rows (key words of different token size)

#Reformat NA cells and drop rows with no PCS_ESE code 
job_titles['PCS_ESE_name'] = job_titles['PCS_ESE_name'].replace('Na', np.nan)
job_titles['PCS_ESE'] = job_titles['PCS_ESE'].replace('Na', np.nan)

job_titles = job_titles.dropna(subset=['PCS_ESE']) #Now we have 226 key words left. We started with 547 key words. 321 could not be matched with INSEE data. 


#Remove newline characters in the 'PCS_ESE_name' column
job_titles['PCS_ESE'] = job_titles['PCS_ESE'].replace('\n', ' ', regex=True)
job_titles['PCS_ESE'] = job_titles['PCS_ESE'].str.strip()


job_titles['PCS_ESE_name'] = job_titles['PCS_ESE_name'].replace('\n', ' ', regex=True)
job_titles['PCS_ESE_name'] = job_titles['PCS_ESE_name'].str.strip()


In [283]:
#Load the merged income data

#income
path = '/home/livtollanes/NewData/annotations/'
file = 'INSEE_merged.csv'
income = pd.read_csv(path+file, sep=",")

#make the codes lower case, to match between data sets
income = income.drop(income.columns[0], axis=1)
income['PCS_ESE'] = income['PCS_ESE'].str.lower()
income = income.sort_values('Salaire moyen en EQTP')

In [3]:
#Get the user bios - that later wil get added income
# # Load the data

path = '/home/livtollanes/NewData/coordinates/dim_X/'
file = '/m1_dimx_users_bios.csv'
bios = pd.read_csv(path+file, index_col=0)

#Sort the df according to the first dimenion
bios = bios.sort_values(by='0', ascending=False)

In [285]:
#Tokenize our bios and save all tokens in a new column
nltk.download('stopwords')
stop_words = set(stopwords.words('french'))

bios = utils2.tokenize_bios(bios, stop_words)

# Convert key words from strings to tuples in order to filter based on our predefined n-gram list
job_titles['titles'] = job_titles['key_word'].apply(lambda x: tuple(x.split()))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/livtollanes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [286]:
# Create a list of job titles from which we will filter the bios
titles_list = job_titles['titles'].tolist()

# Include only the users that have bios with n_grams occurring in our job title list
filtered_bios = bios[bios['total_n_grams'].apply(lambda x: any(ng in x for ng in titles_list))].copy()

# Add a new column 'titles'
filtered_bios.loc[:, 'titles'] = filtered_bios['total_n_grams'].apply(lambda x: [ng for ng in titles_list if ng in x])

In [287]:
# Split the detected titles into their respective n size
importlib.reload(utils2)
filtered_bios['unigrams_detected'] = filtered_bios['titles'].apply(lambda x: utils2.separate_ngrams(x)['unigrams_detected'])
filtered_bios['bigrams_detected'] = filtered_bios['titles'].apply(lambda x: utils2.separate_ngrams(x)['bigrams_detected'])
filtered_bios['trigrams_detected'] = filtered_bios['titles'].apply(lambda x: utils2.separate_ngrams(x)['trigrams_detected']) 

With our own key words, we end up with 17 782 rows (users) that have a bio that is in our list

In [293]:
# Add PCS_ESE codes to the file

#multiple rows per users, one per suggested title
import ast

# Convert string representation of list of tuples into actual list of tuples
#filtered_bios['titles'] = filtered_bios['titles'].apply(ast.literal_eval)

#Explode the 'titles' column
bios_exploded = filtered_bios.explode('titles')


In [295]:
#add  a PCS_ESE code per suggested title
# Convert all titles to lowercase
job_titles['titles'] = job_titles['titles'].apply(lambda x: tuple([i.lower() for i in x]))
bios_exploded['titles'] = bios_exploded['titles'].apply(lambda x: tuple([i.lower() for i in x]))

bios_exploded = bios_exploded.merge(job_titles, on='titles', how='left')

In [299]:
selected_columns = bios_exploded[['follower_id', '0', 'screen_name', 'description_cleantext', 'titles', 'key_word', 'PCS_ESE', 'PCS_ESE_name']]
selected_columns.head()

Unnamed: 0,follower_id,0,screen_name,description_cleantext,titles,key_word,PCS_ESE,PCS_ESE_name
0,4808491882,2.183514,bone_avocat,Avocat associé Moyersoen Avocats /droit du sport membre fondateur de l'AIAF & l'Association des Avocats en Droit du Sport # AADS,"(avocat,)",avocat,312a,Avocats
1,186103608,2.141388,Son_and_graf,"Graphiste freelance de métier, j'ai décidé de mettre à mon panel de créations le design de kit sportif afin de vous les partager ici","(graphiste,)",graphiste,354a,Artistes plasticiens
2,712739777515233281,2.104645,makunesss,"coach sportif, ancien footballeur professionnel formé a @OM_Officiel , passé par @fciopofficiel @nimesolympique @uscl_football @athleticomars @fcannecy","(coach, sportif)",coach sportif,424a,"Moniteurs et éducateurs sportifs, sportifs professionnels"
3,1175402702836248578,2.09548,MaleckArts,"Communication chez @PartoucheSport Graphiste freelance, collab avec @estac_officiel @OpenSuddeFrance @CABCLRUGBY @QRM @Sport360..","(graphiste,)",graphiste,354a,Artistes plasticiens
4,68796663,2.092969,LGClequipe,journaliste @lequipe @Ligue2BKT @ToulouseFC et un peu de @MotoGP,"(journaliste,)",journaliste,352a,Journalistes (y c. rédacteurs en chef)


In [300]:
#write to csv
#bios_exploded.to_csv('/home/livtollanes/NewData/annotations/source1_titles.csv', index = False)
#filtered_bios = pd.read_csv('/home/livtollanes/NewData/annotations/source1_titles.csv')


#Write to annotation csv
# # Select specific columns
# selected_columns = bios_exploded[['follower_id', '0', 'screen_name', 'description_cleantext', 'titles', 'key_word', 'PCS_ESE', 'PCS_ESE_name']]
# # Save to CSV
# selected_columns.to_csv('/home/livtollanes/NewData/annotations/source1.csv', index=False)

Now, the next step is to go over the bios, manually annotate them. Do the added titles match?

### Data source 2: Add suggested PCS_ESE codes based on INSEE n_grams

- Here, I am using the Ngram matching approach based on ngrams from the income files. I only use this for the bios that have not already been matched with the outher ngram match approach (data source 1). this is because I assume they'll yield similar result and only add to the complexuty of the manual annotation process. 
- Them pla was for this df to be manually inspected too, removing rows with unrealistic suggestions. However, it was deemed more fruitful to just use the source1 bios, so this step was overlooked in inspection.

In [215]:
#Obtain ngrams on the Insee data
#income
path = '/home/livtollanes/NewData/annotations/'
file = 'INSEE_merged.csv'
income = pd.read_csv(path+file, sep=",")

#make the codes lower case, to match between data sets
income = income.drop(income.columns[0], axis=1)
income['PCS_ESE'] = income['PCS_ESE'].str.lower()
income = income.sort_values('Salaire moyen en EQTP')

In [217]:
# Preprocess the job labels in the INSEE data and return tokens 
# We only want isngulars in the job titles
# Download the necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load the French language model
nlp = spacy.load('fr_core_news_sm')

#Apply the function to the 'total_n_grams' column
income['label_processed'] = income['label'].apply(lambda text: utils2.preprocess_text(text, nlp))

[nltk_data] Downloading package punkt to
[nltk_data]     /home/livtollanes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/livtollanes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/livtollanes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [218]:
#filter the bios df to only include the rows where follower_ids is not already in filtered_bios
#this is in order to time optimise the process, and see if we can add some extra 
bios_non_captured = bios[~bios['follower_id'].isin(filtered_bios['follower_id'])]

bios_non_captured.shape # 97 250

In [221]:
#Get ngrams
income['ngrams'] = income['label_processed'].apply(lambda x: utils2.get_ngrams(x, 3))

# Convert n-grams to lists when creating the DataFrames
income['ngrams'] = income['ngrams'].apply(lambda x: [' '.join(tup) for tup in x])
bios_non_captured['total_n_grams'] = bios_non_captured['total_n_grams'].apply(lambda x: [' '.join(tup) for tup in x])

# Apply the function to each row in the 'bios' DataFrame
importlib.reload(utils2)
bios_non_captured['INSEE_suggested_PCS_ESE'] = bios_non_captured['total_n_grams'].apply(utils2.find_all_matches2, income_df=income)

#include only the ones that have a suggestion
bios_non_captured2 = bios_non_captured[bios_non_captured['INSEE_suggested_PCS_ESE'].apply(len) > 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bios_non_captured['total_n_grams'] = bios_non_captured['total_n_grams'].apply(lambda x: [' '.join(tup) for tup in x])


In [318]:
s2 = pd.read_csv('/home/livtollanes/NewData/annotations/source2.csv')
# Rename 'INSEE_suggested_PCS_ESE' to 'PCS ESE'
s2_test = s2_test.rename(columns={'PCS_ESE': 'INSEE_suggested_PCS_ESE'})

In [316]:
#add  a PCS_ESE code per suggested title

# Rename 'INSEE_suggested_PCS_ESE' to 'PCS ESE'
s2 = s2.rename(columns={'INSEE_suggested_PCS_ESE': 'PCS_ESE'})
# Select 'PCS_ESE' and 'label' columns from 'income'
income_selected = income[['PCS_ESE', 'label']]

# Merge 's2' with the selected columns from 'income'
s2_test = s2.merge(income_selected, on='PCS_ESE', how='left')# Select 'PCS_ESE' and 'label' columns from 'income'

# Rename 'INSEE_suggested_PCS_ESE' to 'PCS ESE'
s2_test = s2_test.rename(columns={'PCS_ESE': 'INSEE_suggested_PCS_ESE'})

In [None]:
#add  a PCS_ESE code per suggested title
# Convert all titles to lowercase
job_titles['titles'] = job_titles['titles'].apply(lambda x: tuple([i.lower() for i in x]))
bios_exploded['titles'] = bios_exploded['titles'].apply(lambda x: tuple([i.lower() for i in x]))

bios_exploded = bios_exploded.merge(job_titles, on='titles', how='left')

In [237]:
#add one row per suggestion per user
bios_non_captured2 = bios_non_captured2.explode('INSEE_suggested_PCS_ESE')

#save source2 data to csv
bios_non_captured2.to_csv('/home/livtollanes/NewData/annotations/Data/source2_titles.csv', index = False)

In [321]:
# Select specific columns
#selected_columns = s2[['follower_id', '0', 'screen_name', 'description_cleantext', 'INSEE_suggested_PCS_ESE']]

# Save to CSV
s2_test.to_csv('/home/livtollanes/NewData/annotations/source2.csv', index=False)

### Source 3: Manually identified users

Here, it is intended that I add the users that resulted from the manual inspection. These need to be properly annotated though

#### Manual inspection of bios - lower end focus

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
bios[['0', 'follower_id', 'screen_name', 'description_cleantext']].iloc[5000:5100]

Unnamed: 0,0,follower_id,screen_name,description_cleantext
109295,1.562961,2247559753,FrostPlayer,"Luis Enriqué : ""Si une équipe peut nous mettre 4 buts alors on peut leur en mettre 6 avec l'aide de l'arbitre.."""
106286,1.56296,734408245,MaeGameuZe,32 ans Baigne dans les jeux vidéo depuis pitchoune
109103,1.562946,790830293540278272,Archange341,Bienvenue iciendroit sympa : https://urlz.fr/i0SNhttps://urlz.fr/i0SQ
108888,1.562859,1499167418,n_attend,Mon rêve c'était d'être connu en faisant trembler San Siro - #ACMilan #HeatNation #RavensFlock
112008,1.562833,747896821,Potiron_Vert,Supporter des verts !
112072,1.562673,939675719310946304,vallhe10,"19 ans, fan de foot et de l'OM #TeamOM #TeamParieur Fan de #StrangerThings et de l'actrice Isabelle NantySnapchat : vlhermitte1"
106569,1.562664,1031426233,_TacleALaGorge_,Point faible: trop fort.
112132,1.562521,262686512,charloo44,"Auxiliaire de viePetite fille d'agriculteursSupportrice du FC Nantes, abonnée en Loire.Team Déplacements #ActivNantes"
108927,1.562433,142225944,dezphilippe,Rien à déclarer..
112507,1.56243,895736725418385409,sofloer,IDE complètement à l’Ouest . coup de foudre pour l’Andalousie et aussi le Portugal team rando .


### Cleaning the final annotated job title df

In [10]:
#Load data
path = '/home/livtollanes/NewData/annotations/'
file = 'onlygreens1.csv'
onlygreens = pd.read_csv(path+file)

#income data
path = '/home/livtollanes/NewData/annotations/'
file = 'INSEE_merged.csv'

income = pd.read_csv(path+file, sep=",")

In [12]:
#Make the codes lower case, to match between data sets
income = income.drop(income.columns[0], axis=1)
income = income.drop('Effectifs en EQTP', axis=1)
income['PCS_ESE'] = income['PCS_ESE'].str.lower()

onlygreens['PCS_ESE'] = onlygreens['PCS_ESE'].str.lower()

#merge the two dfs 
onlygreens = onlygreens.merge(income, on='PCS_ESE', how='left')

In [13]:
#Check for NA in income codes
na_salaries = onlygreens['Salaire moyen en EQTP'].isna()
associated_pcs_ese = onlygreens.loc[na_salaries, 'PCS_ESE'].unique()
print(associated_pcs_ese) #['100x' '352b' '335a' '22d6' '464b']

#remove rows where income is NA
onlygreens = onlygreens.dropna(subset=['Salaire moyen en EQTP'])

#inspect the rows where duplicates don't have the same job code
duplicates = onlygreens[onlygreens.duplicated('screen_name', keep=False)]
sorted_duplicates = duplicates.sort_values('screen_name')

unique_pcs_ese = sorted_duplicates.groupby('screen_name')['PCS_ESE'].nunique() > 1
screen_names_with_multiple_pcs_ese = sorted_duplicates['screen_name'].isin(unique_pcs_ese.index[unique_pcs_ese])
rows_with_multiple_pcs_ese = sorted_duplicates[screen_names_with_multiple_pcs_ese]
rows_with_multiple_pcs_ese.shape #none. Clear for duplicate deletion

#For the rest of the duplicates, remove keep only the first version
onlygreens = onlygreens.drop_duplicates(subset=['screen_name'], keep='first')

['100x' '22d6' '335a' '352b' '464b']


In [15]:
#Create a title column that merges variants of the same key words into its neutral form

# Create a new column 'title' from 'key_word'
onlygreens['title'] = onlygreens['key_word']

# Define a dictionary of words to replace
replace_dict = {'actrice': 'acteur',
'adjointe maire': 'adjoint maire',
'aide soignante': 'aide soignant',
'animateur enfance' : 'animateur socioculturel',
'animateur jeunesse': 'animateur socioculturel',
'animateur périscolaire': 'animateur socioculturel',
'animateur sportif': 'animateur socioculturel',
'animatrice radio': 'animateur radio',
'artiste céramiste': 'artiste plasticien',
'artiste plasticienne': 'artiste plasticien',
'artiste peintre': 'artiste plasticien',
'auteure compositrice interprète': 'auteur compositeur interprète',
'avocate': 'avocat',
'barmaid': 'barman',
'chanteuse': 'chanteur',
'chargé com': 'chargée communication',
'chargée com': 'chargée communication',
'chauffeur poid lourd': 'chauffeur poids lourds',
'chauffeur poids lourd': 'chauffeur poids lourds',
'chercheuse cnrs': 'chercheur',
'chercheur cnrs': 'chercheur',
'chroniqueuse': 'chroniqueur',
'coach sportive': 'coach sportif',
'coach handball': 'coach sportif',
'coach basket': 'coach sportif',
'tennis coach': 'coach sportif',
'entraineur coach': 'coach sportif',
'coiffure': 'coiffeur',
'compositrice': 'compositeur',
'comédienne': 'comédien',
'comédienne metteur scène': 'comédien metteur scène',
'conducteur métro': 'conducteur train métro',
'conducteur train': 'conducteur train métro',
'conducteur sncf': 'conducteur train métro',
'danseuse': 'danseur',
'doctorante': 'doctorant',
'doctorant droit public': 'doctorant',
'doctorant droit': 'doctorant',
'doctorant histoire': 'doctorant',
'doctorant droit': 'doctorant',
'développer': 'développeur',
'développeur web mobile': 'développeur',
'illustratrice': 'illustrateur',
'ingénieure recherche': 'ingénieur recherche',
'musicienne': 'musicien',
'médecin santé publique': 'médecin',
'médecin éducation nationale': 'médecin',
'médecine': 'médecin',
'ouvrière': 'ouvrier',
'responsable com': 'responsable communication',
'rédac chef': 'rédacteur chef',
'red chef': 'rédacteur chef',
'rédac chef adjoint': 'rédacteur chef adjoint',
'rédacteur chef adjoint': 'rédacteur chef adjoint',
'red chef adjointe': 'rédacteur chef adjoint',
'rédactrice chef': 'rédacteur chef',
'rédactrice chef adjointe': 'rédacteur chef adjoint',
'soignante': 'soignant',
'responsable com': 'responsable communication',
'sapeur': 'pompier',
'journalist': 'journaliste',
'agrégé sciences économiques': 'agrégé',
'agrégé droit': 'agrégé',
'assistante direction': 'assistant direction',
'chargé recherche': 'chargée recherche',
'compositeur interprète': 'auteur compositeur interprète',
'chercheuse': 'chercheur'
}

# Use replace function
onlygreens['title'] = onlygreens['title'].replace(replace_dict)
onlygreens['title'] = onlygreens['title'].str.strip()


In [44]:
#There seems to be some errors in the onlygreens income code. some msmatches between the title and the income code.
#errors
#professeur universités: 1412681755 -delete this user
#onlygreens = onlygreens[onlygreens['follower_id'] != '1412681755']



# médecin: 181 are in category pcs_ese 344b, and 15 are in 344a. 
#onlygreens.loc[(onlygreens['title'] == 'médecin') & (onlygreens['Salaire moyen en EQTP'] == 6000.0), 'title'] = 'médecin hospitalier'

#artiste plasticien: 1100249761 and 476890899, 354b, title wrong. Should be musicien
#onlygreens.loc[(onlygreens['title'] == 'artiste plasticien') & (onlygreens['Salaire moyen en EQTP'] == 3200.0), 'title'] = 'musicien'


In [47]:
#save onlygreens to csv
#onlygreens.to_csv('//home/livtollanes/NewData/annotations/onlygreens_cleaned.csv', sep = ',')