# Observations and Insights

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Relative location of the merged job/company dataset
job_data_path = "../Output/job_company_merged_data.csv"

# Load the CSV into a DataFrame and show it as the cell output
job_data = pd.read_csv(filepath_or_buffer=job_data_path)
job_data

Unnamed: 0.1,Unnamed: 0,job id,job level,location,job name,post date,category,company id,company name,content,...,state,Job Title,excel,machine,sql,python,javascript,tableau,html,css
0,0,2649786.0,Senior Level,"New York, NY",Data Analyst,2020-05-03T13:01:30.848085Z,Data Science,1274.0,Viventium,,...,NY,Data Analyst,True,,,,,,,
1,1,2723310.0,Management,"Seattle, WA",Tanzu Infrastructure Technical Program Manager,2020-06-21T11:14:26.914893Z,Data Science,11784.0,VMware Carbon Black,,...,WA,Other,,True,,,,,,
2,2,2235742.0,Senior Level,"New York, NY",Senior Data Analyst,2020-03-30T23:06:09.549139Z,Data Science,281.0,BounceX,,...,NY,Data Analyst,,,True,True,,,,
3,3,243501.0,Mid Level,"New York, NY",Algo Strategy Developer,2020-03-20T23:04:01.202760Z,Data Science,1292.0,Hudson River Trading,,...,NY,Data Analyst,,,,,,,,
4,4,2724616.0,Senior Level,"New York, NY",Tax Associate,2020-06-15T23:25:20.121105Z,Data Science,11909.0,Kforce,,...,NY,Other,True,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,588,3133197.0,Senior Level,"Chicago, IL","Senior Software Engineer, Hadoop",2020-06-16T11:34:42.063438Z,Data Science,838.0,Epsilon,,...,IL,Data Scientist,,True,,,,,,
582,589,3094974.0,Management,"Philadelphia, PA",Director Supply Chain Data Science & Network O...,2020-06-09T11:20:21.159744Z,Data Science,954.0,GSK,,...,PA,Data Scientist,,True,,,,,,
583,590,2711903.0,Senior Level,"New York, NY","Data Scientist/Engineer (SAS, R code, Python &...",2020-04-24T23:04:10.712494Z,Data Science,1309.0,Medidata,,...,NY,Other,,,True,True,,,,
584,591,3125033.0,Management,"New York, NY","Vice President, CIMD4554143",2020-06-13T11:20:01.292952Z,Data Science,850.0,Goldman Sachs,,...,NY,Other,,,,True,,,,


In [3]:
# Keep only the rows whose "Job Title" is "Data Scientist", restricted to the
# location and job-id columns, and give those columns report-friendly names.
ds_df = (
    job_data
    .loc[job_data["Job Title"] == "Data Scientist", ["location", "job id"]]
    .rename(columns={"location": "Cities", "job id": "Job Postings"})
)
ds_df

Unnamed: 0,Cities,Job Postings
5,"New York, NY",2554994.0
6,"New York, NY",2588786.0
7,"San Francisco, CA",3142544.0
8,"New York, NY",2723681.0
11,"New York, NY",2751238.0
...,...,...
575,"Chicago, IL",2296201.0
579,"San Francisco, CA",3131918.0
581,"Chicago, IL",3133197.0
582,"Philadelphia, PA",3094974.0


In [4]:
# Number of data-scientist rows per city, as returned by value_counts()
dsjobCount = ds_df.loc[:, "Cities"].value_counts()
dsjobCount

New York, NY         84
San Francisco, CA    63
Seattle, WA          28
Boston, MA           20
Chicago, IL          16
Washington, DC       12
Flexible / Remote    11
Philadelphia, PA      8
Los Angeles, CA       3
Atlanta, GA           3
Houston, TX           1
Name: Cities, dtype: int64

In [5]:
# Turn the per-city counts into a two-column frame: the counts become
# "Job Postings" and the city labels move out of the index into a column.
# The rename covers the case where that column comes out named "index".
dsPostCount = (
    dsjobCount
    .to_frame(name="Job Postings")
    .reset_index()
    .rename(columns={"index": "Cities"})
)
dsPostCount

Unnamed: 0,Cities,Job Postings
0,"New York, NY",84
1,"San Francisco, CA",63
2,"Seattle, WA",28
3,"Boston, MA",20
4,"Chicago, IL",16
5,"Washington, DC",12
6,Flexible / Remote,11
7,"Philadelphia, PA",8
8,"Los Angeles, CA",3
9,"Atlanta, GA",3


In [6]:
# Short, consistent city labels used in the ranking tables.
# They are paired positionally with the data-scientist ranks, so the order is
# intended to follow the row order of dsPostCount.
dsCities = [
    "NYC",
    "San Francisco",
    "Seattle",
    "Boston",
    "Chicago",
    "Washington DC",
    "Flexible / Remote",
    "Philadelphia",
    "Los Angeles",
    "Atlanta",
    "Houston",
]

In [7]:
# Rank each city's data-scientist posting count into descending bands.
#
# Each pair is (minimum count, rank). The table is scanned from the highest
# floor down and the first floor a count reaches decides its rank, so no upper
# bound is needed. The earlier conditions of the form `x >= 70 < 79` were
# chained comparisons, i.e. `(x >= 70) and (70 < 79)`: the second half only
# compared two constants and never limited x. The ranks produced are unchanged.
dsRankFloors = [
    (80, 1),
    (70, 2),
    (60, 3),
    (50, 4),
    (40, 5),
    (30, 6),
    (20, 7),
    (10, 8),
]
dsLowestRank = 9  # any count below 10 postings

jobDSPostList = dsPostCount['Job Postings']

# One rank per row of dsPostCount, in the same row order.
rankDSPostCount = [
    next((rank for floor, rank in dsRankFloors if count >= floor), dsLowestRank)
    for count in jobDSPostList
]
rankDSPostCount

[1, 3, 7, 7, 8, 8, 8, 9, 9, 9, 9]

In [8]:
# Pair each short city label with its data-scientist posting rank.
# Both lists are matched by position and must have the same length.
rankDSPost_df = pd.DataFrame.from_dict(
    {"Cities": dsCities, "Rank DS Posts": rankDSPostCount}
)
rankDSPost_df

Unnamed: 0,Cities,Rank DS Posts
0,NYC,1
1,San Francisco,3
2,Seattle,7
3,Boston,7
4,Chicago,8
5,Washington DC,8
6,Flexible / Remote,8
7,Philadelphia,9
8,Los Angeles,9
9,Atlanta,9


In [9]:
# Save the data-scientist ranking table as CSV for the later ranking step
outFile = "../Output/rankDSPost.csv"
rankDSPost_df.to_csv(path_or_buf=outFile)

In [10]:
# Keep only the rows whose "Job Title" is "Data Analyst", restricted to the
# location and title columns, then rename them for the report.
# Note that "Job Postings" holds the job-title text here, not a count.
da_df = (
    job_data
    .loc[job_data["Job Title"] == "Data Analyst", ["location", "Job Title"]]
    .rename(columns={"location": "Cities", "Job Title": "Job Postings"})
)
da_df


Unnamed: 0,Cities,Job Postings
0,"New York, NY",Data Analyst
2,"New York, NY",Data Analyst
3,"New York, NY",Data Analyst
10,"Philadelphia, PA",Data Analyst
15,"Washington, DC",Data Analyst
...,...,...
565,"Atlanta, GA",Data Analyst
568,"San Francisco, CA",Data Analyst
570,"New York, NY",Data Analyst
571,"New York, NY",Data Analyst


In [11]:
# Number of data-analyst rows per city, as returned by value_counts()
dajobCount = da_df.loc[:, "Cities"].value_counts()
dajobCount

New York, NY         77
San Francisco, CA    35
Chicago, IL          15
Seattle, WA          15
Boston, MA           14
Atlanta, GA          12
Los Angeles, CA       9
Washington, DC        6
Philadelphia, PA      4
Flexible / Remote     4
Houston, TX           2
Name: Cities, dtype: int64

In [12]:
# Turn the per-city counts into a two-column frame: the counts become
# "Job Postings" and the city labels move out of the index into a column.
# The rename covers the case where that column comes out named "index".
daPostCount = (
    dajobCount
    .to_frame(name="Job Postings")
    .reset_index()
    .rename(columns={"index": "Cities"})
)
daPostCount

Unnamed: 0,Cities,Job Postings
0,"New York, NY",77
1,"San Francisco, CA",35
2,"Chicago, IL",15
3,"Seattle, WA",15
4,"Boston, MA",14
5,"Atlanta, GA",12
6,"Los Angeles, CA",9
7,"Washington, DC",6
8,"Philadelphia, PA",4
9,Flexible / Remote,4


In [13]:
# Short, consistent city labels for the data-analyst ranking table.
# They are paired positionally with the data-analyst ranks, so the order is
# intended to follow the row order of daPostCount.
#
# Only daCities is bound here. The previous `daCities = dsCities = [...]`
# also overwrote dsCities with this ordering, which would mislabel the
# data-scientist table if that cell were run again afterwards.
#
# NOTE(review): "Flexible / Remote" and "Philadelphia" are listed in the
# opposite order to the displayed daPostCount rows; both have the same count,
# so their ranks are identical - confirm the intended order.
daCities = [
    "NYC",
    "San Francisco",
    "Chicago",
    "Seattle",
    "Boston",
    "Atlanta",
    "Los Angeles",
    "Washington DC",
    "Flexible / Remote",
    "Philadelphia",
    "Houston",
]

In [14]:
# Rank each city's data-analyst posting count into descending bands.
#
# Each pair is (minimum count, rank). The table is scanned from the highest
# floor down and the first floor a count reaches decides its rank, so no upper
# bound is needed; any count of 70 or more gets rank 1. The earlier conditions
# of the form `x >= 70 < 79` were chained comparisons, i.e.
# `(x >= 70) and (70 < 79)`: the second half only compared two constants and
# never limited x. The ranks produced are unchanged.
daRankFloors = [
    (70, 1),
    (60, 2),
    (50, 3),
    (40, 4),
    (30, 5),
    (20, 6),
    (10, 7),
]
daLowestRank = 8  # any count below 10 postings

jobDAPostList = daPostCount['Job Postings']

# One rank per row of daPostCount, in the same row order.
rankDAPostCount = [
    next((rank for floor, rank in daRankFloors if count >= floor), daLowestRank)
    for count in jobDAPostList
]
rankDAPostCount

[1, 5, 7, 7, 7, 7, 8, 8, 8, 8, 8]

In [15]:
# Pair each short city label with its data-analyst posting rank.
# Both lists are matched by position and must have the same length.
rankDAPost_df = pd.DataFrame.from_dict(
    {"Cities": daCities, "Rank DA Posts": rankDAPostCount}
)
rankDAPost_df

Unnamed: 0,Cities,Rank DA Posts
0,NYC,1
1,San Francisco,5
2,Chicago,7
3,Seattle,7
4,Boston,7
5,Atlanta,7
6,Los Angeles,8
7,Washington DC,8
8,Flexible / Remote,8
9,Philadelphia,8


In [16]:
# Save the data-analyst ranking table as CSV for the later ranking step
outFile = "../Output/rankDAPost.csv"
rankDAPost_df.to_csv(path_or_buf=outFile)

In [17]:
# Take the location and python-skill columns, discard rows with a missing
# value in either one, and rename the location column to "Cities".
python_df = (
    job_data[["location", "python"]]
    .dropna()
    .rename(columns={"location": "Cities"})
)

# Number of remaining (python-flagged) rows per city
pythonCount = python_df.loc[:, "Cities"].value_counts()
pythonCount

New York, NY         112
San Francisco, CA     56
Seattle, WA           26
Boston, MA            20
Chicago, IL           13
Los Angeles, CA        7
Washington, DC         7
Philadelphia, PA       6
Atlanta, GA            6
Flexible / Remote      3
Houston, TX            1
Name: Cities, dtype: int64

In [18]:
# Turn the per-city counts into a two-column frame: the counts become
# "Python Count" and the city labels move out of the index into a column.
# The rename covers the case where that column comes out named "index".
pythonCount_df = (
    pythonCount
    .to_frame(name="Python Count")
    .reset_index()
    .rename(columns={"index": "Cities"})
)
pythonCount_df

Unnamed: 0,Cities,Python Count
0,"New York, NY",112
1,"San Francisco, CA",56
2,"Seattle, WA",26
3,"Boston, MA",20
4,"Chicago, IL",13
5,"Los Angeles, CA",7
6,"Washington, DC",7
7,"Philadelphia, PA",6
8,"Atlanta, GA",6
9,Flexible / Remote,3


In [19]:
# Short, consistent city labels for the python-skill ranking table.
# They are paired positionally with the python ranks, so the order is intended
# to follow the row order of pythonCount_df.
# NOTE(review): cities with tied counts (Los Angeles / Washington DC and
# Philadelphia / Atlanta) are listed in a different order than the displayed
# counts; tied cities share a rank - confirm the intended order.
pythonCities = [
    "NYC",
    "San Francisco",
    "Seattle",
    "Boston",
    "Chicago",
    "Washington DC",
    "Los Angeles",
    "Atlanta",
    "Philadelphia",
    "Flexible / Remote",
    "Houston",
]

In [20]:
# Rank each city's python-skill count into descending bands.
#
# Each pair is (minimum count, rank). The table is scanned from the highest
# floor down and the first floor a count reaches decides its rank, so no upper
# bound is needed. The bands are not uniform: 110 and above is rank 1, and
# everything from 80 up to 109 is rank 2. The earlier conditions of the form
# `x >= 80 < 99` were chained comparisons, i.e. `(x >= 80) and (80 < 99)`:
# the second half only compared two constants and never limited x. The ranks
# produced are unchanged.
pythonRankFloors = [
    (110, 1),
    (80, 2),
    (60, 3),
    (40, 4),
    (20, 5),
]
pythonLowestRank = 6  # any count below 20

pythonCountList = pythonCount_df['Python Count']

# One rank per row of pythonCount_df, in the same row order.
rankPythonCount = [
    next((rank for floor, rank in pythonRankFloors if count >= floor), pythonLowestRank)
    for count in pythonCountList
]
rankPythonCount

[1, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6]

In [21]:
# Pair each short city label with its python-skill rank.
# Both lists are matched by position and must have the same length.
rankPython_df = pd.DataFrame.from_dict(
    {"Cities": pythonCities, "Rank Python": rankPythonCount}
)
rankPython_df

Unnamed: 0,Cities,Rank Python
0,NYC,1
1,San Francisco,4
2,Seattle,5
3,Boston,5
4,Chicago,6
5,Washington DC,6
6,Los Angeles,6
7,Atlanta,6
8,Philadelphia,6
9,Flexible / Remote,6


In [22]:
# Save the python-skill ranking table as CSV for the later ranking step
outFile = "../Output/rankPythonCount.csv"
rankPython_df.to_csv(path_or_buf=outFile)