In [1]:
import pandas as pd

In [11]:
filepath = "Resources/data-scientist-job-market-in-the-us/alldata.csv"
all_df = pd.read_csv(filepath)
all_df.head()

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"


In [12]:
#Dropped and filled in all the NaN values because they were messing with our code later on.

all_df['reviews'] = all_df['reviews'].fillna(value=0)
all_df = all_df.dropna()
all_df.head()

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,0.0,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",0.0,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",0.0,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"


In [14]:
#A lot of the values in the "location" column had single spaces at the end, which were also messing with our code.
#We slice the strings in the column by index, so the spaces messed that up.

for i in range(len(all_df)):
    if all_df.iloc[i,4][-1] == ' ':
        all_df.iloc[i,4] = all_df.iloc[i,4][:-1]

In [15]:
#Made another column separating cities and zipcodes.

all_df['zipcode'] = pd.Series({'zipcode':[]})
for i in range(len(all_df)):
    location = all_df.iloc[i,4]
    if any(char.isdigit() for char in location):
        all_df.iloc[i,5] = location[-5:]
        all_df.iloc[i,4] = location[:-6]
    else:
        all_df.iloc[i,5] = "None"

all_df.head()

Unnamed: 0,position,company,description,reviews,location,zipcode
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,0.0,"Atlanta, GA",30301.0
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",0.0,"Atlanta, GA",
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",0.0,"Atlanta, GA",
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA",30303.0
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA",


In [18]:
#Made a DataFrame of the different companies, tracking the number of job postings they made and the number of reviews they have.

company_df = pd.DataFrame({'Company': all_df.groupby('company').count().index,
                           'Positions': all_df.groupby('company').count()['position'],
                            'Reviews': all_df.groupby('company').mean()['reviews']})
company_df.index = [i for i in range(len(company_df))]
company_df.head()

Unnamed: 0,Company,Positions,Reviews
0,10x Genomics,1,0.0
1,1199SEIU Family of Funds,1,133.0
2,1871,1,4.0
3,23andMe,17,4.0
4,24 Hour Fitness,1,2090.0


In [19]:
#Made a DataFrame of the different job locations and their frequencies.

location_df = pd.DataFrame({'Location': all_df.groupby('location').count().index,
                       'Count': all_df.groupby('location').count()['company']})
location_df.index = [i for i in range(len(location_df))]
location_df.head()

Unnamed: 0,Location,Count
0,"Alameda, CA",10
1,"Allendale, NJ",2
2,"Atlanta, GA",269
3,"Austin, TX",213
4,"Bedminster, NJ",1


In [20]:
#Made a DataFrame of how many times different words popped up in job descriptions

word_counts = {}
df_column = all_df['description']
for i in range(len(df_column)):
    word_list = df_column.iloc[i].split()
    for i in word_list:
        if i.lower() in word_counts:
            word_counts[i.lower()] += 1
        else:
            word_counts[i.lower()] = 1

word_list = list(word_counts.keys())
word_counts_list = []
for i in word_list:
    word_counts_list.append(word_counts[i])

word_df = pd.DataFrame({'Word':word_list, 'Count':word_counts_list})
word_df.head()

Unnamed: 0,Word,Count
0,development,9910
1,director,782
2,als,55
3,therapy,246
4,institute,494


In [21]:
#Edited word count DataFrame to be more relevant (removing a lot of the words that didn't appear a lot,
#as well as the words that appeared the most because they were common prepositions, articles, etc.)

words_by_count = word_df.sort_values('Count',ascending=False)

words_by_count.index = [i for i in range(len(words_by_count))]

words_by_count_top = words_by_count.loc[words_by_count['Count'] >= 300]
words_by_count_upper = words_by_count_top.loc[words_by_count_top['Count'] <= 7373]

words_by_count_upper.index = [i for i in range(len(words_by_count_upper))]
words_by_count_upper.to_csv('wordcount.csv')

In [33]:
#Created a function to search all job descriptions for specific terms or phrases.

def wordsearch(searchterm):
    occurrences = 0
    frequency = 0
    for i in range(len(all_df)):
        if searchterm.lower() in all_df.iloc[i,2].lower():
            frequency += 1
            occurrences += all_df.iloc[i,2].lower().count(searchterm.lower())
    print(f'Total number of times "{searchterm}" occurred in job descriptions: {occurrences}')
    print(f'Total number of job descriptions in which "{searchterm}" occurred: {frequency}')    

In [37]:
wordsearch('degree')

Total number of times "degree" occurred in job descriptions: 6735
Total number of job descriptions in which "degree" occurred: 4215


In [31]:
type_df = pd.DataFrame({'position':[], 'company':[], 'description':[], 'reviews':[], 'location':[], 'zipcode':[], 'type':[]})

for i in range(len(all_df)):
    if 'analyst' in all_df.iloc[i,0].lower() or 'analysis' in all_df.iloc[i,0]:
        type_df = type_df.append(all_df.iloc[i])
        type_df.iloc[-1,6] = 'Data Analyst'
    if 'data scientist' in all_df.iloc[i,0].lower() or 'data science' in all_df.iloc[i,0]:
        type_df = type_df.append(all_df.iloc[i])
        type_df.iloc[-1,6] = 'Data Scientist'
    if 'engineer' in all_df.iloc[i,0].lower():
        type_df = type_df.append(all_df.iloc[i])
        type_df.iloc[-1,6] = 'Engineer'
    if 'engineer' not in all_df.iloc[i,0].lower() and 'data scientist' not in all_df.iloc[i,0].lower() and 'data science' not in all_df.iloc[i,0] and 'analyst' not in all_df.iloc[i,0].lower() and 'analysis' not in all_df.iloc[i,0]:
        type_df = type_df.append(all_df.iloc[i])
        type_df.iloc[-1,6] = 'Misc.'


In [32]:
type_df['type'].value_counts()

Misc.             3528
Engineer          1354
Data Scientist    1261
Data Analyst       885
Name: type, dtype: int64