# Import Libraries and CSV

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import re
import nltk

In [2]:
df = pd.read_csv('alldata.csv')

In [3]:
new = df["location"].str.split(", ", n=1,expand = True)
df['city'] = new[0]
df['state'] = new[1].str.split(" ",n=1,expand = True)[0]
df = df.drop(['location', 'reviews', 'company'], axis=1)
df = df.drop_duplicates()
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [4]:
df.head()

Unnamed: 0,position,description,city,state
0,Development Director,Development Director\nALS Therapy Development ...,Atlanta,GA
1,An Ostentatiously-Excitable Principal Research...,"Job Description\n\n""The road that leads to acc...",Atlanta,GA
2,Data Scientist,"Growing company located in the Atlanta, GA are...",Atlanta,GA
3,Data Analyst,DEPARTMENT: Program OperationsPOSITION LOCATIO...,Atlanta,GA
4,Assistant Professor -TT - Signal Processing & ...,DESCRIPTION\nThe Emory University Department o...,Atlanta,GA


# Position

## Checking Values

In [5]:
## cleaning Position values
df['position'] = df['position'].apply(lambda x: re.sub(r'[^\w\s]','', x))
df['position'] = df['position'].apply(lambda x: re.sub(r'\r\n',' ', x))
df['position'] = df['position'].apply(lambda x: re.sub(r'\s+', ' ', x))

## putting all the Position words in the same list
position_check = ""
for job in df['position']:
    position_check += " " + job

## tokenizing
from nltk.tokenize import word_tokenize
all_position_low = position_check.lower()
position_tokens = word_tokenize(str(all_position_low))

df.head()

Unnamed: 0,position,description,city,state
0,Development Director,Development Director\nALS Therapy Development ...,Atlanta,GA
1,An OstentatiouslyExcitable Principal Research ...,"Job Description\n\n""The road that leads to acc...",Atlanta,GA
2,Data Scientist,"Growing company located in the Atlanta, GA are...",Atlanta,GA
3,Data Analyst,DEPARTMENT: Program OperationsPOSITION LOCATIO...,Atlanta,GA
4,Assistant Professor TT Signal Processing Machi...,DESCRIPTION\nThe Emory University Department o...,Atlanta,GA


## Standardising Position Names

In [7]:
lower_position = [position.lower() for position in df['position']]
category = []
for position in lower_position:
    if "anal" in position or "database" in position or "research anal" in position:
        category.append('data analyst')
    elif "data scien" in position or "machine learning" in position or "scien" in position:
        category.append('data scientist')
    elif "engineer" in position or "artificial intelligence" in position or "ai" in position or "software" in position:
        category.append('data engineer')
    else:
        category.append('other')
df['position'] = category

In [8]:
df['position'].value_counts().head(20).sort_values(ascending = True)

data engineer     1011
data analyst      1227
other             1656
data scientist    3014
Name: position, dtype: int64

# Description

## Checking Values

In [9]:
## cleaning description values
df['description'] = df['description'].apply(lambda x: re.sub(r'[^\w\s]','', x))
df['description'] = df['description'].apply(lambda x: re.sub(r'\r\n',' ', x))
df['description'] = df['description'].apply(lambda x: re.sub(r'\s+', ' ', x))

In [10]:
from nltk.tokenize import word_tokenize
descriptions = []
for desc in df['description']:
    desc = desc.lower()
    desc = word_tokenize(str(desc))
    descriptions.append(desc)

df['description'] = descriptions

In [27]:
matrix = {'data analyst': {},
         'data engineer': {},
         'data scientist': {}}
positions = ['data analyst', 'data engineer', 'data scientist']
tool_list = ['python','sql','java','excel',
               'aws','linux','tableau','sas','oracle',
               'mysql','matlab','html','tensorflow','spss']
skill_list = ['machine learning','data analysis','visualization',
            'modeling','statistical analysis','research','deep learning',
            'optimization','decision tree','logistic','random forest',
            'ab testing','web scraping','neural network','interpersonal skills',
            'communication skills','artificial intelligence']
degree_list = ['master',"masters",'phd','bachelor',"bachelors",
             'high school','No degree requirement']
major_list = ['computer science','data science','statistics','quantitative',
            'mathematics','economics','liberal arts','science','buisness',
            'engineer','engineering','psychology','biology','medicine',
            'marketing','business analysis','chemical engineering','chemistry',
            'public health','public relationship']
var_list_of_lists = [tool_list, skill_list, degree_list, major_list]

for pos in positions:
        for var_list in var_list_of_lists:
            for var in var_list:
                n = 0
                for i in list(df.loc[df['position'] == pos].index):
                    p = df.iloc[i,1]
                    if len(var) == 1:
                        if var in p:
                            n += 1
                    else:
                        if var in ' '.join(str(e) for e in p):
                            n += 1
                matrix[pos].update({var: n})

epic_df = pd.DataFrame.from_dict(matrix, orient='columns')

In [29]:
epic_df.head(20)

Unnamed: 0,data analyst,data engineer,data scientist
python,427,478,1678
sql,472,324,925
java,134,405,693
excel,753,467,1353
aws,149,287,481
linux,48,176,241
tableau,203,34,202
sas,269,67,397
oracle,54,40,73
mysql,30,58,64


In [18]:
df_copy.head(50)

Unnamed: 0,position,description,city,state,python
0,other,"[development, director, als, therapy, developm...",Atlanta,GA,
1,data scientist,"[job, description, the, road, that, leads, to,...",Atlanta,GA,
2,data scientist,"[growing, company, located, in, the, atlanta, ...",Atlanta,GA,
3,data analyst,"[department, program, operationsposition, loca...",Atlanta,GA,1.0
4,data scientist,"[description, the, emory, university, departme...",Atlanta,GA,
5,data engineer,"[qualifications, bachelors, degree, in, comput...",Atlanta,GA,
6,other,"[qualifications, bachelors, degree, 57, years,...",Atlanta,GA,
7,data scientist,"[overview, responsibilities, wood, environment...",Atlanta,GA,
8,data analyst,"[works, closely, with, senior, cib, profession...",Atlanta,GA,0.0
9,data scientist,"[known, for, being, a, great, place, to, work,...",Atlanta,GA,


## Description by Position - WordCloud

In [11]:
da = df.loc[df['position'] == "data analyst"]
ds = df.loc[df['position'] == "data scientist"]
de = df.loc[df['position'] == "data engineer"]
ot = df.loc[df['position'] == "other"]

da.head(2)

Unnamed: 0,position,description,city,state
3,data analyst,DEPARTMENT Program OperationsPOSITION LOCATION...,Atlanta,GA
8,data analyst,Works closely with senior CIB professionals Pe...,Atlanta,GA


In [12]:
ds.head(2)

Unnamed: 0,position,description,city,state
1,data scientist,Job Description The road that leads to accompl...,Atlanta,GA
2,data scientist,Growing company located in the Atlanta GA area...,Atlanta,GA


In [13]:
de.head(2)

Unnamed: 0,position,description,city,state
5,data engineer,Qualifications Bachelors degree in Computer Sc...,Atlanta,GA
16,data engineer,Description Executes complex assignments requi...,Atlanta,GA


In [14]:
ot.head(2)

Unnamed: 0,position,description,city,state
0,other,Development Director ALS Therapy Development I...,Atlanta,GA
6,other,Qualifications Bachelors degree 57 years of ex...,Atlanta,GA


### Data Analyst

In [15]:
## merge all the description together in one variable
analyst_descrip = ""
for descrip in da['description']:
    analyst_descrip += " " + descrip

## import english stop_words (I, am, in, the...)
from nltk.corpus import stopwords
extra_stopword = ['data','experience','work','team','will','skill','year','skills']
stop_words1 = set((stopwords.words('english')))
stop_words = stop_words1.union(extra_stopword)

## transform the description in a tokenized list of words
from nltk.tokenize import word_tokenize
analyst_descrip_low= analyst_descrip.lower()
tokens = word_tokenize(str(analyst_descrip_low))

## count the frequency of witch tokenized word (without stop_words)
filtered_descrip_analyst = []
for word in tokens:
    if word not in list(stop_words):
        filtered_descrip_analyst += [word]
        
from collections import Counter
Counter(filtered_descrip_analyst).most_common()

[('research', 3384),
 ('business', 2514),
 ('analysis', 2111),
 ('management', 1819),
 ('ability', 1727),
 ('analytics', 1572),
 ('support', 1395),
 ('including', 1311),
 ('new', 1243),
 ('strong', 1221),
 ('development', 1219),
 ('years', 1215),
 ('required', 1179),
 ('degree', 1165),
 ('information', 1140),
 ('analyst', 1136),
 ('clients', 1101),
 ('knowledge', 1098),
 ('working', 1063),
 ('position', 975),
 ('related', 919),
 ('projects', 895),
 ('tools', 877),
 ('analytical', 875),
 ('financial', 865),
 ('requirements', 860),
 ('quantitative', 857),
 ('provide', 851),
 ('teams', 832),
 ('project', 832),
 ('develop', 810),
 ('technical', 805),
 ('amp', 803),
 ('solutions', 794),
 ('environment', 792),
 ('opportunity', 777),
 ('us', 743),
 ('status', 737),
 ('qualifications', 735),
 ('job', 733),
 ('communication', 729),
 ('must', 727),
 ('across', 727),
 ('company', 720),
 ('reports', 711),
 ('preferred', 709),
 ('services', 707),
 ('health', 701),
 ('statistical', 687),
 ('insights

### Data Scientist

In [16]:
## merge all the description together in one variable
scient_descrip = ""
for descrip in ds['description']:
    scient_descrip += " " + descrip

## import english stop_words (I, am, in, the...)
from nltk.corpus import stopwords
extra_stopword = ['data','experience','work','team','will','skill','year','skills']
stop_words1 = set((stopwords.words('english')))
stop_words = stop_words1.union(extra_stopword)

## transform the description in a tokenized list of words
from nltk.tokenize import word_tokenize
scient_descrip_low= scient_descrip.lower()
tokens = word_tokenize(str(scient_descrip_low))

## count the frequency of witch tokenized word (without stop_words)
filtered_descrip_scient = []
for word in tokens:
    if word not in list(stop_words):
        filtered_descrip_scient += [word]
        
from collections import Counter
Counter(filtered_descrip_scient).most_common()

[('learning', 7196),
 ('research', 5956),
 ('machine', 5734),
 ('science', 5601),
 ('development', 5535),
 ('business', 5245),
 ('analysis', 4476),
 ('years', 4194),
 ('new', 4166),
 ('ability', 4126),
 ('strong', 3439),
 ('including', 3390),
 ('working', 3303),
 ('scientist', 3204),
 ('technical', 3176),
 ('knowledge', 3124),
 ('design', 3113),
 ('analytics', 3064),
 ('develop', 3020),
 ('product', 3017),
 ('models', 2988),
 ('engineering', 2981),
 ('solutions', 2835),
 ('related', 2796),
 ('support', 2770),
 ('teams', 2754),
 ('company', 2701),
 ('software', 2668),
 ('environment', 2637),
 ('required', 2590),
 ('using', 2557),
 ('statistical', 2551),
 ('techniques', 2516),
 ('opportunity', 2505),
 ('qualifications', 2489),
 ('status', 2400),
 ('degree', 2374),
 ('computer', 2371),
 ('projects', 2361),
 ('management', 2344),
 ('systems', 2328),
 ('information', 2294),
 ('products', 2294),
 ('technology', 2273),
 ('help', 2223),
 ('field', 2185),
 ('us', 2175),
 ('responsibilities', 21

### Data Engineer

In [17]:
## merge all the description together in one variable
engine_descrip = ""
for descrip in de['description']:
    engine_descrip += " " + descrip

## import english stop_words (I, am, in, the...)
from nltk.corpus import stopwords
extra_stopword = ['data','experience','work','team','will','skill','year','skills']
stop_words1 = set((stopwords.words('english')))
stop_words = stop_words1.union(extra_stopword)

## transform the description in a tokenized list of words
from nltk.tokenize import word_tokenize
engine_descrip_low= engine_descrip.lower()
tokens = word_tokenize(str(engine_descrip_low))

## count the frequency of witch tokenized word (without stop_words)
filtered_descrip_engine = []
for word in tokens:
    if word not in list(stop_words):
        filtered_descrip_engine += [word]
        
from collections import Counter
Counter(filtered_descrip_engine).most_common()

[('software', 2639),
 ('design', 2352),
 ('systems', 2336),
 ('development', 2315),
 ('engineering', 2229),
 ('technical', 1893),
 ('required', 1609),
 ('working', 1515),
 ('years', 1461),
 ('business', 1455),
 ('engineers', 1418),
 ('new', 1364),
 ('ability', 1313),
 ('science', 1287),
 ('support', 1273),
 ('us', 1263),
 ('product', 1257),
 ('including', 1246),
 ('environment', 1218),
 ('degree', 1209),
 ('technologies', 1193),
 ('scientists', 1133),
 ('solutions', 1129),
 ('system', 1126),
 ('related', 1118),
 ('products', 1115),
 ('requirements', 1109),
 ('engineer', 1069),
 ('strong', 1063),
 ('learning', 1051),
 ('management', 1042),
 ('knowledge', 1022),
 ('teams', 999),
 ('may', 976),
 ('tools', 974),
 ('computer', 959),
 ('technology', 953),
 ('qualifications', 950),
 ('test', 949),
 ('build', 930),
 ('customers', 910),
 ('develop', 902),
 ('research', 897),
 ('opportunity', 875),
 ('analysis', 875),
 ('building', 873),
 ('help', 862),
 ('position', 848),
 ('must', 844),
 ('com

### Other

In [18]:
## merge all the description together in one variable
other_descrip = ""
for descrip in ot['description']:
    other_descrip += " " + descrip

## import english stop_words (I, am, in, the...)
from nltk.corpus import stopwords
extra_stopword = ['data','experience','work','team','will','skill','year','skills']
stop_words1 = set((stopwords.words('english')))
stop_words = stop_words1.union(extra_stopword)

## transform the description in a tokenized list of words
from nltk.tokenize import word_tokenize
other_descrip_low= other_descrip.lower()
tokens = word_tokenize(str(other_descrip_low))

## count the frequency of witch tokenized word (without stop_words)
filtered_descrip_other = []
for word in tokens:
    if word not in list(stop_words):
        filtered_descrip_other += [word]
        
from collections import Counter
Counter(filtered_descrip_other).most_common()

[('research', 4219),
 ('development', 3313),
 ('ability', 2977),
 ('management', 2835),
 ('business', 2533),
 ('including', 2523),
 ('clinical', 2404),
 ('product', 2278),
 ('new', 2255),
 ('required', 2244),
 ('years', 2211),
 ('support', 2172),
 ('working', 2007),
 ('technical', 1901),
 ('information', 1833),
 ('related', 1737),
 ('design', 1735),
 ('position', 1730),
 ('teams', 1713),
 ('degree', 1712),
 ('scientists', 1689),
 ('environment', 1689),
 ('knowledge', 1637),
 ('strong', 1627),
 ('qualifications', 1558),
 ('must', 1556),
 ('requirements', 1552),
 ('project', 1536),
 ('opportunity', 1512),
 ('medical', 1494),
 ('science', 1478),
 ('health', 1466),
 ('analysis', 1458),
 ('develop', 1448),
 ('technology', 1448),
 ('status', 1445),
 ('software', 1403),
 ('systems', 1337),
 ('provide', 1337),
 ('program', 1335),
 ('products', 1333),
 ('projects', 1333),
 ('communication', 1323),
 ('solutions', 1315),
 ('responsibilities', 1299),
 ('us', 1282),
 ('preferred', 1276),
 ('company