In [2]:
import pandas as pd
from faker import Faker
import numpy as np
from sqlalchemy import create_engine

In [4]:
# Faker instance shared by every data-generation cell below.
# Seed the shared random source before creating the instance so that a
# Restart & Run All reproduces the same fake records. Values derived from
# relative dates such as 'today' still depend on the day the notebook runs.
Faker.seed(42)
fake = Faker()

In [6]:
# Row-count configuration for the generated dataset.
num_users = 1_000                  # rows in the Users table
# Child rows generated for each user in the related tables:
num_jobs_per_user = 3              # JobHistory
num_educations_per_user = 2        # Education
num_skills_per_user = 5            # Skills
num_recommendations_per_user = 1   # Recommendations
num_connections_per_user = 10      # Connections

In [7]:
# SQLAlchemy engine for the SQLite database file that every to_sql /
# read_sql call in this notebook goes through.
sqlite_url = 'sqlite:///linkedin_fake_data.db'
engine = create_engine(sqlite_url)

In [8]:
# Users data: built column by column, one list of num_users values each.
def _fake_column(generator, count=num_users):
    """Call ``generator`` ``count`` times and return the results as a list."""
    return [generator() for _ in range(count)]


users = {
    "user_id": [*range(1, num_users + 1)],  # sequential ids starting at 1
    "linkedin_id": _fake_column(fake.unique.uuid4),  # unique across the run
    "first_name": _fake_column(fake.first_name),
    "last_name": _fake_column(fake.last_name),
    "headline": _fake_column(fake.job),
    "summary": _fake_column(lambda: fake.text(max_nb_chars=500)),
    "profile_url": _fake_column(fake.url),
    "location": _fake_column(fake.city),
    "industry": _fake_column(fake.word),
    "email": _fake_column(fake.email),
}

In [9]:
# Materialise the column lists as a DataFrame and (re)create the Users table.
df_users = pd.DataFrame(data=users)
df_users.to_sql(
    name='Users',
    con=engine,
    if_exists='replace',  # drop and recreate the table if it already exists
    index=False,          # do not write the DataFrame index as a column
)

1000

In [11]:
# JobHistory data: one independent empty list per column; the generation
# loop appends one value to every list for each generated job.
job_history = {
    column: []
    for column in (
        "job_id",
        "user_id",
        "title",
        "company_name",
        "location",
        "start_date",
        "end_date",
        "description",
    )
}

In [12]:
# Fill the JobHistory columns with num_jobs_per_user jobs for every user id.
# Start from empty columns so that re-running this cell regenerates the data
# instead of appending a second copy of every row.
for column_values in job_history.values():
    column_values.clear()

for user_id in range(1, num_users + 1):
    for _ in range(num_jobs_per_user):
        # job_id is a running 1-based counter across all users.
        job_history["job_id"].append(len(job_history["job_id"]) + 1)
        job_history["user_id"].append(user_id)
        job_history["title"].append(fake.job())
        job_history["company_name"].append(fake.company())
        job_history["location"].append(fake.city())
        # The end date is drawn from [start_date, today], so it never
        # precedes the start date.
        start_date = fake.date_between(start_date='-10y', end_date='today')
        end_date = fake.date_between(start_date=start_date, end_date='today')
        job_history["start_date"].append(start_date)
        job_history["end_date"].append(end_date)
        job_history["description"].append(fake.text(max_nb_chars=200))

In [13]:
# Materialise the column lists as a DataFrame and (re)create the JobHistory table.
df_job_history = pd.DataFrame(data=job_history)
df_job_history.to_sql(
    name='JobHistory',
    con=engine,
    if_exists='replace',  # drop and recreate the table if it already exists
    index=False,          # do not write the DataFrame index as a column
)


3000

In [14]:
# Education data: one independent empty list per column; the generation
# loop appends one value to every list for each generated education entry.
education = {
    column: []
    for column in (
        "education_id",
        "user_id",
        "school_name",
        "degree",
        "field_of_study",
        "start_date",
        "end_date",
        "description",
    )
}

In [15]:
# Fill the Education columns with num_educations_per_user entries for every
# user id. Start from empty columns so that re-running this cell regenerates
# the data instead of appending a second copy of every row.
for column_values in education.values():
    column_values.clear()

for user_id in range(1, num_users + 1):
    for _ in range(num_educations_per_user):
        # education_id is a running 1-based counter across all users.
        education["education_id"].append(len(education["education_id"]) + 1)
        education["user_id"].append(user_id)
        education["school_name"].append(fake.company())
        education["degree"].append(fake.word())
        education["field_of_study"].append(fake.word())
        # The end date is drawn from [start_date, today], so it never
        # precedes the start date.
        start_date = fake.date_between(start_date='-10y', end_date='today')
        end_date = fake.date_between(start_date=start_date, end_date='today')
        education["start_date"].append(start_date)
        education["end_date"].append(end_date)
        education["description"].append(fake.text(max_nb_chars=200))

In [16]:
# Materialise the column lists as a DataFrame and (re)create the Education table.
df_education = pd.DataFrame(data=education)
df_education.to_sql(
    name='Education',
    con=engine,
    if_exists='replace',  # drop and recreate the table if it already exists
    index=False,          # do not write the DataFrame index as a column
)

2000

In [17]:

# Skills data: one independent empty list per column; the generation loop
# appends one value to every list for each generated skill.
skills = {
    column: []
    for column in (
        "skill_id",
        "user_id",
        "skill_name",
        "endorsements_count",
    )
}

In [18]:
# Fill the Skills columns with num_skills_per_user skills for every user id.
# Start from empty columns so that re-running this cell regenerates the data
# instead of appending a second copy of every row.
for column_values in skills.values():
    column_values.clear()

for user_id in range(1, num_users + 1):
    for _ in range(num_skills_per_user):
        # skill_id is a running 1-based counter across all users.
        skills["skill_id"].append(len(skills["skill_id"]) + 1)
        skills["user_id"].append(user_id)
        skills["skill_name"].append(fake.word())
        skills["endorsements_count"].append(fake.random_int(min=0, max=100))

In [19]:
# Materialise the column lists as a DataFrame and (re)create the Skills table.
df_skills = pd.DataFrame(data=skills)
df_skills.to_sql(
    name='Skills',
    con=engine,
    if_exists='replace',  # drop and recreate the table if it already exists
    index=False,          # do not write the DataFrame index as a column
)


5000

In [20]:
# Recommendations data: one independent empty list per column; the
# generation loop appends one value to every list for each recommendation.
recommendations = {
    column: []
    for column in (
        "recommendation_id",
        "user_id",
        "recommender_name",
        "recommender_position",
        "recommendation_text",
        "date",
    )
}


In [21]:
# Fill the Recommendations columns with num_recommendations_per_user entries
# for every user id. Start from empty columns so that re-running this cell
# regenerates the data instead of appending a second copy of every row.
for column_values in recommendations.values():
    column_values.clear()

for user_id in range(1, num_users + 1):
    for _ in range(num_recommendations_per_user):
        # recommendation_id is a running 1-based counter across all users.
        recommendations["recommendation_id"].append(len(recommendations["recommendation_id"]) + 1)
        recommendations["user_id"].append(user_id)
        recommendations["recommender_name"].append(fake.name())
        recommendations["recommender_position"].append(fake.job())
        recommendations["recommendation_text"].append(fake.text(max_nb_chars=300))
        recommendations["date"].append(fake.date_between(start_date='-5y', end_date='today'))

In [22]:
# Materialise the column lists as a DataFrame and (re)create the Recommendations table.
df_recommendations = pd.DataFrame(data=recommendations)
df_recommendations.to_sql(
    name='Recommendations',
    con=engine,
    if_exists='replace',  # drop and recreate the table if it already exists
    index=False,          # do not write the DataFrame index as a column
)


1000

In [24]:
# Connections data: one independent empty list per column; the generation
# loop appends one value to every list for each generated connection.
connections = {
    column: []
    for column in (
        "connection_id",
        "user_id",
        "connection_linkedin_id",
        "connection_name",
        "connection_profile_url",
        "connection_headline",
    )
}

In [25]:
# Fill the Connections columns with num_connections_per_user connections for
# every user id. Start from empty columns so that re-running this cell
# regenerates the data instead of appending a second copy of every row.
for column_values in connections.values():
    column_values.clear()

for user_id in range(1, num_users + 1):
    for _ in range(num_connections_per_user):
        # connection_id is a running 1-based counter across all users.
        connections["connection_id"].append(len(connections["connection_id"]) + 1)
        connections["user_id"].append(user_id)
        # fake.unique guarantees no UUID is handed out twice by this Faker instance.
        connections["connection_linkedin_id"].append(fake.unique.uuid4())
        connections["connection_name"].append(fake.name())
        connections["connection_profile_url"].append(fake.url())
        connections["connection_headline"].append(fake.job())

In [26]:
# Materialise the column lists as a DataFrame and (re)create the Connections table.
df_connections = pd.DataFrame(data=connections)
df_connections.to_sql(
    name='Connections',
    con=engine,
    if_exists='replace',  # drop and recreate the table if it already exists
    index=False,          # do not write the DataFrame index as a column
)

10000

In [28]:
# Read every table back from the database and keep only its leading rows
# (DataFrame.head() with its default row count) for display.
(
    df_users_sample,
    df_job_history_sample,
    df_education_sample,
    df_skills_sample,
    df_recommendations_sample,
    df_connections_sample,
) = (
    pd.read_sql(table_name, engine).head()
    for table_name in (
        'Users',
        'JobHistory',
        'Education',
        'Skills',
        'Recommendations',
        'Connections',
    )
)


In [29]:
# Print the sampled rows of every table under its own heading; headings after
# the first start with a newline so the sections are separated by a blank line.
labelled_samples = [
    ("Users Sample Data:", df_users_sample),
    ("\nJobHistory Sample Data:", df_job_history_sample),
    ("\nEducation Sample Data:", df_education_sample),
    ("\nSkills Sample Data:", df_skills_sample),
    ("\nRecommendations Sample Data:", df_recommendations_sample),
    ("\nConnections Sample Data:", df_connections_sample),
]
for heading, sample in labelled_samples:
    print(heading)
    print(sample)

Users Sample Data:
   user_id                           linkedin_id first_name  last_name  \
0        1  78bf89e6-f565-4d8e-8045-42210fba46ed      James   Gonzalez   
1        2  f1aa0a03-2487-4f8d-9b0c-11e0039d2c4d    Michael        Day   
2        3  c67b6f3b-f44c-4d39-8590-877e7f5f4a34    Valerie  Rodriguez   
3        4  3eb4ede7-b587-4b83-bd5c-e875f66ad1df    William     Howell   
4        5  e9b636c8-6b68-47a8-9edf-c2ef04368986     Audrey   Friedman   

                     headline  \
0  Regulatory affairs officer   
1           Librarian, public   
2    Local government officer   
3           Personnel officer   
4              Therapist, art   

                                             summary  \
0  View voice town teach long herself. Simple bet...   
1  Whom miss green early compare. Enjoy affect st...   
2  Entire attorney from tend while. College imagi...   
3  East American local change really resource dre...   
4  Marriage protect trade music scientist issue b...   



Users Table:
   user_id                           linkedin_id first_name  last_name  \
0        1  78bf89e6-f565-4d8e-8045-42210fba46ed      James   Gonzalez   
1        2  f1aa0a03-2487-4f8d-9b0c-11e0039d2c4d    Michael        Day   
2        3  c67b6f3b-f44c-4d39-8590-877e7f5f4a34    Valerie  Rodriguez   
3        4  3eb4ede7-b587-4b83-bd5c-e875f66ad1df    William     Howell   
4        5  e9b636c8-6b68-47a8-9edf-c2ef04368986     Audrey   Friedman   

                     headline  \
0  Regulatory affairs officer   
1           Librarian, public   
2    Local government officer   
3           Personnel officer   
4              Therapist, art   

                                             summary  \
0  View voice town teach long herself. Simple bet...   
1  Whom miss green early compare. Enjoy affect st...   
2  Entire attorney from tend while. College imagi...   
3  East American local change really resource dre...   
4  Marriage protect trade music scientist issue b...   

      