<p style="text-align:center;">
<img src="https://github.com/digital-futures-academy/DataScienceMasterResources/blob/main/Resources/datascience-notebook-header.png?raw=true"
     alt="DigitalFuturesLogo"
     style="float: center; margin-right: 10px;" />
</p>

### EDA

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Clean DataFrame

In [3]:
# load the job postings scraped from reed.co.uk into a DataFrame
df = pd.read_csv(filepath_or_buffer='reed_data.csv')

In [4]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,salary,title,company,location,contract,remote
0,"£30,000 per annum",Data Scientist,EG Group,Guide,"Permanent, full-time",No
1,"£50,000 - £65,000 per annum",Data Scientist,Adecco,Bristol,"Permanent, full-time",No
2,"£40,000 - £70,000 per annum",Data Scientist,IO Sphere,City of London,"Permanent, full-time or part-time",No
3,"£40,000 - £50,000 per annum",Data Scientist,ADLIB,Bath,"Permanent, full-time",No
4,£450 - £500 per day,Data Scientist,Pontoon,Crewe,"Contract, full-time",No


In [5]:
# preview the last five rows
df.tail(n=5)

Unnamed: 0,salary,title,company,location,contract,remote
7568,"£60,000 - £65,000 per annum",Fullstack Developer,SR2,Cambridgeshire,"Permanent, full-time",Work from home
7569,"£50,000 - £60,000 per annum","Health, Safety & Environment Manager - Food",SRG,Chipping Campden,"Permanent, full-time",No
7570,"£25,000 - £28,000 per annum",Site Chemist,Mandeville Recruitment Group,Walsall,"Permanent, full-time",No
7571,"£25,000 - £28,000 per annum",Chemistry Graduate - Site Chemist,Mandeville Recruitment Group,Stoke-on-Trent,"Permanent, full-time",No
7572,£25 - £28 per hour,Project Manager,Kelly Services,Reading,"Contract, full-time",No


In [6]:
# check dimensions: (number of rows, number of columns)
df.shape

(7573, 6)

In [7]:
# check the data type pandas inferred for each column
df.dtypes

salary      object
title       object
company     object
location    object
contract    object
remote      object
dtype: object

In [8]:
# count the missing values in each column
df.isna().sum()

salary      0
title       0
company     0
location    0
contract    0
remote      0
dtype: int64

#### Remove duplicate job posts

In [9]:
# list the rows that repeat an earlier job post (the first occurrence is not flagged)
df.loc[df.duplicated(keep='first')]

Unnamed: 0,salary,title,company,location,contract,remote
19,"£60,000 - £70,000 per annum",Data Scientist,Harnham - Data & Analytics Recruitment,Manchester,"Permanent, full-time",No
26,"£30,000 per annum",Data Scientist,EG Group,Guide,"Permanent, full-time",No
27,"£50,000 - £65,000 per annum",Data Scientist,Adecco,Bristol,"Permanent, full-time",No
28,"£40,000 - £50,000 per annum",Data Scientist,ADLIB,Bath,"Permanent, full-time",No
29,"£40,000 - £70,000 per annum",Data Scientist,IO Sphere,City of London,"Permanent, full-time or part-time",No
...,...,...,...,...,...,...
7562,"£38,000 - £45,000 per annum",Research Scientist - Biomarkers,VRS Recruitment,Slough,"Permanent, full-time",No
7563,"£19,620 - £27,250 per annum",Scientist,Broughton Group,Skipton,"Permanent, full-time",No
7566,"£23,000 - £28,000 per annum, inc benefits",Technical Surveyor,Reed,Great Yarmouth,"Permanent, full-time",No
7567,£1 - £19.38 per hour,Scientist - Upstream Processing,CK GROUP,Slough,"Contract, full-time",No


In [10]:
# drop exact duplicate rows, keeping the first occurrence of each job post;
# rebinding df (rather than inplace=True) avoids mutating the frame in place
# and leaves the original index labels of the surviving rows untouched
df = df.drop_duplicates()

In [11]:
# check df dimensions correspond with the number of duplicate rows dropped
df.shape

(4443, 6)

#### Inspect columns

In [12]:
# preview the first five rows of the de-duplicated df
df.head(n=5)

Unnamed: 0,salary,title,company,location,contract,remote
0,"£30,000 per annum",Data Scientist,EG Group,Guide,"Permanent, full-time",No
1,"£50,000 - £65,000 per annum",Data Scientist,Adecco,Bristol,"Permanent, full-time",No
2,"£40,000 - £70,000 per annum",Data Scientist,IO Sphere,City of London,"Permanent, full-time or part-time",No
3,"£40,000 - £50,000 per annum",Data Scientist,ADLIB,Bath,"Permanent, full-time",No
4,£450 - £500 per day,Data Scientist,Pontoon,Crewe,"Contract, full-time",No


In [13]:
# check number of unique values in each column (baseline before text normalisation)
df.nunique()

salary      1234
title       2871
company     1060
location     696
contract       9
remote         2
dtype: int64

In [14]:
# normalise every column: trim surrounding whitespace and convert to lower-case
# (every column is object dtype, so the .str accessor is applied to each one)
df = df.apply(lambda text_column: text_column.str.strip().str.lower())

# rows that differed only by letter case or stray whitespace are identical now,
# and the earlier duplicate removal ran before this normalisation, so it could
# not have caught them - de-duplicate again on the cleaned values
df = df.drop_duplicates()

In [15]:
# re-count unique values per column to observe what the strip/lower-case step merged
df.nunique()

salary      1234
title       2796
company     1060
location     696
contract       9
remote         2
dtype: int64

In [16]:
# preview the first five rows after normalising the text
df.head(n=5)

Unnamed: 0,salary,title,company,location,contract,remote
0,"£30,000 per annum",data scientist,eg group,guide,"permanent, full-time",no
1,"£50,000 - £65,000 per annum",data scientist,adecco,bristol,"permanent, full-time",no
2,"£40,000 - £70,000 per annum",data scientist,io sphere,city of london,"permanent, full-time or part-time",no
3,"£40,000 - £50,000 per annum",data scientist,adlib,bath,"permanent, full-time",no
4,£450 - £500 per day,data scientist,pontoon,crewe,"contract, full-time",no


In [17]:
# filter out job titles that aren't relevant to data science
# (titles were lower-cased in an earlier cell, so the keywords are lower-case too)
relevant_title_keywords = [
    'data analyst',
    'data scientist',
    'data engineer',
    'data science',
    'machine learning',
    'data specialist',
    'business analyst',
    'business intelligence',
    'power bi analyst',
    'senior analyst',
    'sql analyst',
    'quantitative',
    'data manager',
]
# none of the keywords contain regex metacharacters, so joining them with '|'
# yields a plain "match any of these" alternation
title_pattern = '|'.join(relevant_title_keywords)

# .copy() makes the filtered frame independent of the original, so any later
# column assignment on df does not trigger a SettingWithCopyWarning
df = df[df['title'].str.contains(title_pattern)].copy()

In [18]:
# check dimensions after keeping only the data-science-related titles
df.shape

(837, 6)

In [19]:
# check number of unique values in each column of the filtered df
df.nunique()

salary      320
title       397
company     279
location    200
contract      6
remote        2
dtype: int64

##### Salary column clean-up

In [20]:
# preview the postings whose salary is not quoted "per annum" (e.g. day rates)
quoted_per_annum = df['salary'].str.contains('per annum')
df[~quoted_per_annum].head()

Unnamed: 0,salary,title,company,location,contract,remote
4,£450 - £500 per day,data scientist,pontoon,crewe,"contract, full-time",no
5,£700 - £800 per day,data scientist,guidant global,cheltenham,"contract, full-time",no
57,£350 - £400 per day,data scientist,advanced resource managers limited,london,"contract, full-time",work from home
60,£650 - £700 per day,data scientist,experis ltd,london,"contract, full-time",work from home
76,£250 - £300 per day,junior data scientist,harnham - data & analytics recruitment,london,"contract, full-time",no
