In [42]:
#importing necessary libraries
import os
from pprint import PrettyPrinter
from urllib.parse import quote_plus

import pandas as pd
import plotly.express as px
from country_converter import CountryConverter
from dotenv import load_dotenv
from pymongo import MongoClient
# Load environment variables from the dotenv-format file named ".venv";
# the call's return value is shown as the cell output.
load_dotenv(dotenv_path=".venv")

True

I. Connect

II. Explore

    1. Nationality
    2. Age
    3. Education

In [13]:
# Read the connection settings from the environment.
# Indexing os.environ (instead of .get) raises a KeyError naming the missing
# variable, rather than silently building a URI such as "mongodb+srv://None:None@None".
db_username = os.environ["db_username"]
db_password = os.environ["db_password"]
database = os.environ["database"]
cluster_url = os.environ["cluster_url"]
collection = os.environ["collection"]

# PyMongo requires the username and password embedded in a URI to be
# percent-escaped; otherwise characters such as "@", ":", "/" or "%" in a
# password make the URI invalid.
connection_string = (
    f"mongodb+srv://{quote_plus(db_username)}:{quote_plus(db_password)}@{cluster_url}"
)

client = MongoClient(connection_string)

In [14]:
# Pretty-printer with a two-space indent, used below to display nested results.
pp = PrettyPrinter(indent = 2)

In [15]:
# Show the databases that this client can see on the cluster.
pp.pprint([db_info for db_info in client.list_databases()])

[ {'empty': False, 'name': 'data_science', 'sizeOnDisk': 618496},
  {'empty': False, 'name': 'sample_mflix', 'sizeOnDisk': 124002304},
  {'empty': False, 'name': 'admin', 'sizeOnDisk': 368640},
  {'empty': False, 'name': 'local', 'sizeOnDisk': 5698060288}]


In [16]:
# Select the configured database and display the collections it contains.
db = client[database]
[collection_info for collection_info in db.list_collections()]

[{'name': 'statistical_analysis',
  'type': 'collection',
  'options': {},
  'info': {'readOnly': False,
   'uuid': Binary(b'f5C\xc5I\xe8Mj\x90\xa8\x046\x10\x05\x16\x8d', 4)},
  'idIndex': {'v': 2, 'key': {'_id': 1}, 'name': '_id_'}}]

In [18]:
# Get a handle on the collection named by the `collection` environment setting;
# every query below runs against this handle.
ds_applicants = db[collection]

### Exploratory Data Analysis

In [19]:
# Count the documents in the collection; the empty filter {} places no
# restriction on which documents are counted.
ds_applicants.count_documents({})

6000

In [41]:
# Peek at the first five documents to see which fields each applicant record has.
result = ds_applicants.find({}).limit(5)
pd.DataFrame(list(result))

Unnamed: 0,_id,Unnamed: 2,name,DOB,gender,email,admissionsQuiz,countryISO2,highestDegreeEarned
0,691dbd3cfa72dc4fe7e3f968,0,Donna Chapman DDS,1980-12-27,female,x8qzzhg1ku@gmail.com,incomplete,ET,Some College (1-3 years)
1,691dbd3cfa72dc4fe7e3f969,1,Xavier Chambers,1987-04-30,male,tzo61bn086@yahoo.com,complete,MV,Bachelor's degree
2,691dbd3cfa72dc4fe7e3f96a,2,Cheyenne Flores,2006-12-03,male,jpaqw5oc9t@yahoo.com,complete,TL,Some College (1-3 years)
3,691dbd3cfa72dc4fe7e3f96b,3,Elizabeth Garza,2003-01-31,female,fhm16eqaq9@yahoo.com,complete,MD,Bachelor's degree
4,691dbd3cfa72dc4fe7e3f96c,4,Carla Johnson,1993-06-30,male,p8kenu0sag@yahoo.com,complete,DE,High School or Baccalaureate


### Nationality

In [21]:
# Count applicants per country code on the server, one output document per code.
result = ds_applicants.aggregate(
    [{"$group": {"_id": "$countryISO2", "count": {"$count": {}}}}]
)

# Name the grouping key after the field it came from and order rows by
# ascending count, so the most common countries end up at the tail.
df_nationality = (
    pd.DataFrame(result)
    .rename(columns={"_id": "countryISO2"})
    .sort_values(by="count")
)
df_nationality.head()

Unnamed: 0,countryISO2,count
177,WF,14
91,CZ,14
50,NL,14
61,LK,14
244,EC,15


In [22]:
# Translate the two-letter country codes into short country names.
cc = CountryConverter()
df_nationality = df_nationality.assign(
    country_name=cc.convert(df_nationality["countryISO2"], to="name_short")
)
df_nationality.head()

Unnamed: 0,countryISO2,count,country_name
177,WF,14,Wallis and Futuna Islands
91,CZ,14,Czechia
50,NL,14,Netherlands
61,LK,14,Sri Lanka
244,EC,15,Ecuador


In [23]:
# Horizontal bar chart of the last ten rows of df_nationality (the frame is
# sorted by ascending count, so these are the ten largest), by raw count.
fig = px.bar(
    df_nationality.tail(10),
    x="count",
    y="country_name",
    orientation="h",
    title="DS Applicants: Nationality",
).update_layout(
    xaxis_title="Frequency [Count]",
    yaxis_title="Country",
)
fig.show()

In [24]:
# Express each country's count as a percentage of the total across all countries.
df_nationality["count_pct"] = (
    df_nationality["count"].div(df_nationality["count"].sum()).mul(100)
)
df_nationality.shape

(249, 4)

In [25]:
# Same ten rows as the count chart, but plotted as a share of all applicants.
fig = px.bar(
    df_nationality.tail(10),
    x="count_pct",
    y="country_name",
    orientation="h",
    title="DS Applicants: Nationality",
).update_layout(
    xaxis_title="Frequency [%]",
    yaxis_title="Country",
)
fig.show()

In [26]:
# Add three-letter country codes; the choropleth below uses this column as its locations.
df_nationality = df_nationality.assign(
    country_iso3=cc.convert(df_nationality["countryISO2"], to="ISO3")
)
df_nationality.head()

Unnamed: 0,countryISO2,count,country_name,count_pct,country_iso3
177,WF,14,Wallis and Futuna Islands,0.233333,WLF
91,CZ,14,Czechia,0.233333,CZE
50,NL,14,Netherlands,0.233333,NLD
61,LK,14,Sri Lanka,0.233333,LKA
244,EC,15,Ecuador,0.25,ECU


In [28]:
def global_choropleth_map():
    """Build a world choropleth of applicant share by country.

    Reads the module-level ``df_nationality`` frame, locating countries by
    its ``country_iso3`` column and colouring them by ``count_pct``.

    Returns
    -------
    The figure object produced by ``px.choropleth``.
    """
    return px.choropleth(
        df_nationality,
        locations="country_iso3",
        color="count_pct",
        projection="natural earth",
        color_continuous_scale=px.colors.sequential.Oranges,
    )

In [17]:
# Build the choropleth of applicant share by country and render it.
national_repr_fig = global_choropleth_map()
national_repr_fig.show()

### Age

In [29]:
# Compute each applicant's age in completed years on the server.
#
# `$dateDiff` with unit "year" counts how many calendar-year boundaries lie
# between the two dates (31 Dec -> 1 Jan counts as 1), so on its own it
# overstates the age of anyone whose birthday has not yet occurred this year.
# To correct that, add the boundary count back onto DOB with `$dateAdd`; if the
# resulting "birthday" is still in the future, subtract one year.
result = ds_applicants.aggregate(
    [
        {
            "$project": {
                "years": {
                    "$let": {
                        "vars": {
                            "boundary_years": {
                                "$dateDiff": {
                                    "startDate": "$DOB",
                                    "endDate": "$$NOW",
                                    "unit": "year",
                                }
                            }
                        },
                        "in": {
                            "$cond": [
                                {
                                    "$gt": [
                                        {
                                            "$dateAdd": {
                                                "startDate": "$DOB",
                                                "unit": "year",
                                                "amount": "$$boundary_years",
                                            }
                                        },
                                        "$$NOW",
                                    ]
                                },
                                {"$subtract": ["$$boundary_years", 1]},
                                "$$boundary_years",
                            ]
                        },
                    }
                }
            }
        }
    ]
)
# Keep only the computed ages; note that despite its name this is a Series.
df_age = pd.DataFrame(result)["years"]
df_age.tail()

5995    35
5996    29
5997    43
5998    30
5999    22
Name: years, dtype: int64

In [30]:
def global_age_distribution():
    """Build a 20-bin histogram of applicant ages.

    Reads the module-level ``df_age`` values.

    Returns
    -------
    The histogram figure with its axis titles set.
    """
    histogram = px.histogram(
        x=df_age,
        nbins=20,
        title="DS Applicants: Distribution of ages.",
    )
    # update_layout returns the figure it was called on.
    return histogram.update_layout(
        xaxis_title="Age",
        yaxis_title="Frequency [count]",
    )

In [31]:
# Build the age histogram and render it.
age_fig = global_age_distribution()
age_fig.show()

### Education

In [33]:
# Count applicants per highest degree earned on the server.
result = ds_applicants.aggregate(
    [{"$group": {"_id": "$highestDegreeEarned", "count": {"$count": {}}}}]
)

# Index the counts by degree label and collapse the remaining single "count"
# column; despite its name, `df_education` is then a Series.
df_education = (
    pd.DataFrame(result)
    .rename(columns={"_id": "highest_degree_earned"})
    .set_index("highest_degree_earned")
    .squeeze()
)

df_education.head()

highest_degree_earned
High School or Baccalaureate    1192
Bachelor's degree               1163
Some College (1-3 years)        1203
Doctorate (e.g. PhD)            1241
Master's degree                 1201
Name: count, dtype: int64

In [34]:
def ed_sort(counts):
    """Return sort keys that order education labels from lowest to highest degree.

    Intended for use as a ``key`` callable (for example with ``sort_index``):
    it receives the labels and returns one integer rank per label.

    Parameters
    ----------
    counts : iterable
        Education labels to rank.

    Returns
    -------
    list of int
        The rank of each label, in input order. Labels that are not one of the
        five known degrees (including ``None``) get a rank one past the highest
        known degree, so they sort last instead of raising ``KeyError``.
    """
    degrees = [
        "High School or Baccalaureate",
        "Some College (1-3 years)",
        "Bachelor's degree",
        "Master's degree",
        "Doctorate (e.g. PhD)",
    ]
    rank_by_degree = {degree: rank for rank, degree in enumerate(degrees)}
    # Unknown labels share one rank past the end of the known ordering.
    unknown_rank = len(degrees)

    return [rank_by_degree.get(label, unknown_rank) for label in counts]

In [38]:
# Reorder the degree counts from lowest to highest degree using ed_sort's ranking.
df_education = df_education.sort_index(key=ed_sort)

In [37]:
def edu_plot_distribution():
    """Build a horizontal bar chart of applicant counts per highest degree earned.

    Reads the module-level ``df_education`` object, using its values for bar
    length and its index for the bar labels.

    Returns
    -------
    The bar-chart figure with its axis titles set.
    """
    bar_chart = px.bar(
        df_education,
        x=df_education,
        y=df_education.index,
        orientation="h",
    )
    # update_layout returns the figure it was called on.
    return bar_chart.update_layout(
        xaxis_title="Frequency [count]",
        yaxis_title="Highest Degree Earned",
    )

In [39]:
# Build the education bar chart and render it.
fig = edu_plot_distribution()
fig.show()

In reality, many people hold a High School or Baccalaureate diploma or have Some College, a moderate number hold a bachelor's degree, and only a few hold a master's degree or a PhD. The counts above are roughly equal across degrees because this is synthetic data. Bear with me: I haven't done the GANs yet. I will improve my synthetic data as time goes on.