In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
import os

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

HF_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

In [6]:
from langchain_community.graphs import Neo4jGraph

NEO4J_DATABASE = "graphrag"

# Neo4j
NEO4J_URI = os.getenv("LOCAL_NEO4J_URI")
NEO4J_USERNAME = os.getenv("LOCAL_NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("LOCAL_NEO4J_PASSWORD")

graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

### Data Pre-processing

In [7]:
import pandas as pd

DATASET = "data/linkedin-profiles.csv"
df = pd.read_csv(DATASET)
df.head()

Unnamed: 0,timestamp,id,name,city,country_code,region,current_company:company_id,current_company:name,position,following,...,people_also_viewed,educations_details,education,avatar,languages,certifications,recommendations,recommendations_count,volunteer_experience,сourses
0,2023-01-10,catherinemcilkenny,"Catherine Fitzpatrick (McIlkenny), B.A",Canada,CA,,,,Snr Business Analyst at Emploi et Développemen...,,...,"[{""profile_link"":""https://ca.linkedin.com/in/l...",Queen's University Belfast,"[{""degree"":""Bachelor of Arts (B.A.) Honours"",""...",https://media.licdn.com/dms/image/C4E03AQEcz_j...,,,,,,
1,2022-12-17,margot-bon-51a04624,Margot Bon,"The Randstad, Netherlands",NL,EU,gemeente-utrecht,Gemeente Utrecht,Communicatieadviseur Corporate & Strategie Gem...,,...,"[{""profile_link"":""https://nl.linkedin.com/in/j...",,"[{""degree"":""Scrum en Agile werken"",""end_year"":...",https://static.licdn.com/sc/h/244xhbkr7g40x6bs...,"[{""subtitle"":""Professional working proficiency...","[{""meta"":""Issued Jun 2013"",""subtitle"":""Van der...",Menno H. Poort “Ik werk al jaren prettig met M...,2.0,"[{""cause"":"""",""duration"":""Sep 2010 Jul 2020 9 y...",
2,2023-05-17,mike-dean-8509a193,Mike Dean,"England, United Kingdom",UK,,network-rail,Network Rail,Network Data Manager at Network Rail,,...,"[{""profile_link"":""https://uk.linkedin.com/in/g...",Brighton Polytechnic,"[{""degree"":""2:2"",""end_year"":""1991"",""field"":""El...",https://media.licdn.com/dms/image/C4D03AQHLj-Z...,,,,,,
3,2022-05-29,giovanna-panarella-99a0a4167,Giovanna Panarella,"Avellino, Campania, Italy",IT,EU,,Freelance,Architetto (Freelance),500.0,...,"[{""profile_link"":""https://it.linkedin.com/in/e...",Università di Camerino,"[{""degree"":""“Corso di aggiornamento profession...",https://media-exp1.licdn.com/dms/image/C4D03AQ...,,,,,"[{""cause"":""Arts and Culture"",""duration"":""Jan 2...",
4,2022-12-06,steve-latimer-3364327,Steve Latimer,"Ontario, Canada",CA,,mid-range-computer-group-inc.,Mid-Range Computer Group Inc.,Senior Account Executive at Mid-Range Computer...,,...,"[{""profile_link"":""https://ca.linkedin.com/in/d...",St. Michael's College School,"[{""degree"":"""",""end_year"":""1978"",""field"":"""",""me...",,,"[{""meta"":""Issued Jan 2022 See credential"",""sub...","Blake Reeves “If I was a customer, I would wan...",1.0,,


In [8]:
import json


def extract_industry(json_str):
    try:
        data = json.loads(json_str)
        return data.get("industry", None)
    except json.JSONDecodeError:
        return None


def extract_languages(json_list):
    try:
        languages = [entry["title"] for entry in json.loads(json_list)]
        return "|".join(languages)
    except:
        return None


def extract_country(string):
    if isinstance(string, str):
        elements = string.split(",")
        return elements[-1].strip()
    else:
        return None


df["industry"] = df["current_company"].apply(lambda x: extract_industry(x))
df["languages"] = df["languages"].apply(lambda x: extract_languages(x))
df["country"] = df["city"].apply(lambda x: extract_country(x))
df = df[
    [
        "id",
        "name",
        "current_company:name",
        "educations_details",
        "languages",
        "industry",
        "country",
    ]
].dropna()
industry_counts = df["industry"].value_counts()
df = df[df["industry"].isin(industry_counts[industry_counts > 2].index)].reset_index(
    drop=True
)
df = df.rename(
    columns={"current_company:name": "company", "educations_details": "education"}
)
df.head(10)

Unnamed: 0,id,name,company,education,languages,industry,country
0,paul-lukes-906608134,Paul Lukes,Toolbox Creative,California College of the Arts,English|Czech,Advertising Services,United States
1,roberto-merola-baa923103,Roberto Merola,Capgemini,Université libre de Bruxelles,English|Italian|French|Dutch|German,IT Services and IT Consulting,Belgium
2,minju-hong-bsn-rn-1a7801239,"Minju Hong, BSN, RN",University of Washington Medical Center,University of Washington School of Nursing,Korean|English,Hospitals and Health Care,United States
3,prateek-chitpur-710a1a12a,Prateek Chitpur,George Mason University,George Mason University Education George Mason...,English|Hindi|Marathi|Kannada|Telugu,Higher Education,United States
4,aadcampos,Alexandre Campos,Serpro - Serviço Federal de Processamento de D...,Unichristus,English,IT Services and IT Consulting,Brazil
5,gareth-reid-75966110b,Gareth Reid,Willis Towers Watson,University of Leicester,English|French|Spanish,Financial Services,United Kingdom
6,alaa-el-said-56740659,Alaa El-said,Microsoft,Mansoura University,Arabic|English,Software Development,Saudi Arabia
7,bagus-satya-mas,Bagus Satya Mas,Jatis Mobile,Universitas Udayana (UNUD),Indonesian|English|Japanese,Software Development,Indonesia
8,emrecruit,Emily S.,Dignity Health,Ottawa University,Spanish,Hospitals and Health Care,Greater Phoenix Area
9,giteshpatel21,Gitesh Patel,Baptist Health System KY & IN,Sullivan University,English|Hindi|Gujarati,Hospitals and Health Care,United States


### Load data on Neo4J

In [9]:
graph.refresh_schema()
print(graph.schema)

Node properties:

Relationship properties:

The relationships:



In [22]:
people_query = """
LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/martin-fabbri/graph-llm-agents/main/notebooks/data/profiles.csv'
AS row
MERGE (person:Person {name: row.name})
MERGE (company:Company {name: row.company})
MERGE (school:School {name: row.education})
MERGE (industry:Industry {name: row.industry})
MERGE (country:Country {name: row.country})

FOREACH (lang in split(row.languages, '|') | 
    MERGE (language:Language {name:trim(lang)})
    MERGE (person)-[:SPEAKS]->(language))

MERGE (person)-[:WORKS_IN]->(company)
MERGE (person)-[:LIVES_IN]->(country)
MERGE (person)-[:EDUCATED_AT]->(school)
MERGE (company)-[:IS_IN]->(industry)
"""

graph.query(people_query)

DatabaseError: {code: Neo.DatabaseError.Statement.ExecutionFailed} {message: At https://raw.githubusercontent.com/martin-fabbri/graph-llm-agents/main/notebooks/data/profiles.csv @ position 21241 -  there's a field starting with a quote and whereas it ends that quote there seems to be characters in that field after that ending quote. That isn't supported. This is what I read: '[{"cause":"","duration":"Sep 2010 Jul 2020 9 years 11 months","duration_short":"9 years 11 months","end_date":"Jul 2020","info":"Onderhouden FacebookpaginaSchrijven van wedstrijdverslagenFotografieOndersteuning wedstrijdzakenHuisstijl bewakenCommunicatiemiddelen verzorgenActiviteiten ledenwervingOndersteuning evenementen","start_date":"Sep 2010","subtitle":"Gymnastiekvereniging Vogel","title":"Redacteur en lid PR commissie","url":""},{"cause":"Education","duration":"Feb 2017 Jan 2019 2 years","duration_short":"2 years","end_date":"Jan 2019","info":"I am a volunteer at Library ""D""Jan 2019"",""info"":""I am a volunteer at Library \""D'}