# 1 - Look at the Big Picture and Frame the Problem

### Frame the Problem
Hello
### Look at the Big Picture
Hello

### Required Imports
Per the documentation, we need the first four imports below (especially `sklearn`); `seaborn` was added as well.
> Although `import sklearn` may be refined to `from sklearn... import...` later on.

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sb

# 2 - Load the Dataset
Given the dataset's URL, load the CSV file using pandas.

> We might add options on how to see the dataset.

In [35]:
# Uncomment the line below to display all columns of the data, then restart the kernel and re-run the notebook
# pd.set_option('display.max_columns', None)

In [36]:
# Location of the salaries CSV that is loaded into a DataFrame.
url = "https://raw.githubusercontent.com/SCI-co-Newb/datasets/main/ds_salaries.csv"
data = pd.read_csv(url)

# Keep an independent copy of the freshly loaded data. A plain assignment
# (`data_backup = data`) would only bind a second name to the same DataFrame,
# so any in-place change to `data` would silently change the "backup" as well.
data_backup = data.copy()

### Displaying the data

In [37]:
# Display the data. pandas abbreviates long frames when rendering, so only the
# first and last rows are shown, separated by a `...` row.
data

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


In [38]:
# Display the first 5 rows of the data (5 is the default for head()) as a quick
# check of the column names and sample values
data.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [39]:
# Displays summary statistics (count, mean, std, min, quartiles, max) for the
# numerical columns in the data
data.describe()

Unnamed: 0.1,Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,607.0,607.0,607.0,607.0,607.0
mean,303.0,2021.405272,324000.1,112297.869852,70.92257
std,175.370085,0.692133,1544357.0,70957.259411,40.70913
min,0.0,2020.0,4000.0,2859.0,0.0
25%,151.5,2021.0,70000.0,62726.0,50.0
50%,303.0,2022.0,115000.0,101570.0,100.0
75%,454.5,2022.0,165000.0,150000.0,100.0
max,606.0,2022.0,30400000.0,600000.0,100.0


In [44]:
# Displays the summary of the numerical values; it should exclude the first column
# data.loc[:, data.columns != ''].describe()

In [45]:
# Displays a quick description of the data: the total number of rows, each
# attribute's type, and the number of non-null values per column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          607 non-null    int64 
 1   work_year           607 non-null    int64 
 2   experience_level    607 non-null    object
 3   employment_type     607 non-null    object
 4   job_title           607 non-null    object
 5   salary              607 non-null    int64 
 6   salary_currency     607 non-null    object
 7   salary_in_usd       607 non-null    int64 
 8   employee_residence  607 non-null    object
 9   remote_ratio        607 non-null    int64 
 10  company_location    607 non-null    object
 11  company_size        607 non-null    object
dtypes: int64(5), object(7)
memory usage: 57.0+ KB


In [46]:
# Randomly split the data with 80% in train and 20% in test.
# A fixed random_state makes the sample (and therefore the split) identical on
# every run, so results are reproducible after a kernel restart and the test
# rows do not change between runs.
train_set = data.sample(frac=0.8, random_state=42)
# Everything that was not sampled into the training set becomes the test set.
test_set = data.drop(train_set.index)

In [47]:
# Display the training set (the 80% random sample of `data`)
train_set

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
37,37,2020,EN,FT,Machine Learning Engineer,250000,USD,250000,US,50,US,L
5,5,2020,EN,FT,Data Analyst,72000,USD,72000,US,100,US,L
546,546,2022,SE,FT,Data Engineer,110500,USD,110500,US,100,US,M
473,473,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
542,542,2022,MI,FT,Data Engineer,206699,USD,206699,US,0,US,M
...,...,...,...,...,...,...,...,...,...,...,...,...
227,227,2021,MI,FT,Data Scientist,75000,EUR,88654,DE,50,DE,L
342,342,2022,EX,FT,Head of Data Science,224000,USD,224000,US,100,US,M
271,271,2021,SE,FT,Computer Vision Engineer,102000,BRL,18907,BR,0,BR,M
199,199,2021,EN,FT,Data Science Consultant,90000,USD,90000,US,100,US,S


In [48]:
# Display the test set (the rows of `data` that were not sampled into train_set)
test_set

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
7,7,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
12,12,2020,EN,FT,Data Scientist,35000,EUR,39916,FR,0,FR,M
13,13,2020,MI,FT,Lead Data Analyst,87000,USD,87000,US,100,US,L
17,17,2020,SE,FT,Big Data Engineer,100000,EUR,114047,PL,100,GB,S
22,22,2020,SE,FT,Data Engineer,42000,EUR,47899,GR,50,GR,L
...,...,...,...,...,...,...,...,...,...,...,...,...
569,569,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
577,577,2022,SE,FT,Data Analyst,150075,USD,150075,US,100,US,M
585,585,2022,SE,FT,Data Analyst,110925,USD,110925,US,100,US,M
592,592,2022,SE,FT,Data Scientist,230000,USD,230000,US,100,US,M
