In [1]:
import pandas as pd

## Reading data from files

- https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html 

What we need to check when creating a DataFrame:
- do we have a column that can serve as the index of the DataFrame?
- do we have any columns that contain dates, so that we can parse them into a proper date data type?

In [9]:
# Location of the employees CSV file (semicolon-separated, UTF-8 encoded).
url = 'https://raw.githubusercontent.com/piotrgradzinski/dap_20230114/main/day_6_pgg/emps.csv'

# Load the file, using employee_id as the index and parsing hire_date
# into a datetime column while reading.
emps = pd.read_csv(
    url,
    sep=';',
    encoding='utf-8',
    index_col='employee_id',
    parse_dates=['hire_date'],
)
emps

Unnamed: 0_level_0,first_name,last_name,job_title,salary,hire_date,department_name,address,postal_code,city,country
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100,Steven,King,President,24000,1997-06-17,Executive,2004 Charade Rd,98199,Seattle,United States of America
101,Neena,Kochhar,Administration Vice President,17000,1999-09-21,Executive,2004 Charade Rd,98199,Seattle,United States of America
102,Lex,De Haan,Administration Vice President,17000,2003-01-13,Executive,2004 Charade Rd,98199,Seattle,United States of America
103,Alexander,Hunold,Programmer,9000,2000-01-03,IT,2014 Jabberwocky Rd,26192,Southlake,United States of America
104,Bruce,Ernst,Programmer,6000,2001-05-21,IT,2014 Jabberwocky Rd,26192,Southlake,United States of America
...,...,...,...,...,...,...,...,...,...,...
202,Pat,Fay,Marketing Representative,6000,2007-08-17,Marketing,147 Spadina Ave,M5V 2L7,Toronto,Canada
203,Susan,Mavris,Human Resources Representative,6500,2004-06-07,Human Resources,8204 Arthur St,,London,United Kingdom
204,Hermann,Baer,Public Relations Representative,10000,2004-06-07,Public Relations,Schwanthalerstr. 7031,80925,Munich,Germany
205,Shelley,Higgins,Accounting Manager,12000,2004-06-07,Accounting,2004 Charade Rd,98199,Seattle,United States of America


## Data exploration

What we should explore:
- what data type does each column have - are the types correct?
- do we have `null`s or `NaN` values in any column - it's possible that we'll have to deal with them somehow.

In [7]:
# Check what kind of object read_csv returned (a pandas DataFrame).
type(emps)

pandas.core.frame.DataFrame

In [11]:
# Data type of every column; the index (employee_id) is not listed here.
emps.dtypes

first_name                 object
last_name                  object
job_title                  object
salary                      int64
hire_date          datetime64[ns]
department_name            object
address                    object
postal_code                object
city                       object
country                    object
dtype: object

In [12]:
# Summary of the frame: index range, per-column non-null counts and dtypes,
# and memory usage. A non-null count lower than the number of entries means
# the column contains missing values.
emps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 100 to 206
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   first_name       107 non-null    object        
 1   last_name        107 non-null    object        
 2   job_title        107 non-null    object        
 3   salary           107 non-null    int64         
 4   hire_date        107 non-null    datetime64[ns]
 5   department_name  106 non-null    object        
 6   address          106 non-null    object        
 7   postal_code      105 non-null    object        
 8   city             106 non-null    object        
 9   country          106 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 9.2+ KB


In [15]:
# Descriptive statistics; in the recorded output only `salary` is summarised.
# The 25th percentile (the `25%` row) means that 25% of salaries are
# less than or equal to 3,100.
emps.describe()

Unnamed: 0,salary
count,107.0
mean,6461.682243
std,3909.365746
min,2100.0
25%,3100.0
50%,6200.0
75%,8900.0
max,24000.0


In [16]:
# include='all' summarises every column, not only the numeric one: the recorded
# output adds count/unique/top/freq for the text columns and first/last for hire_date.
# NOTE(review): the recorded output echoes this line, which looks like the tail of a
# pandas warning (presumably the FutureWarning about datetime columns in describe) -
# confirm against the installed pandas version.
emps.describe(include='all')

  emps.describe(include='all')


Unnamed: 0,first_name,last_name,job_title,salary,hire_date,department_name,address,postal_code,city,country
count,107,107,107,107.0,107,106,106,105.0,106,106
unique,91,102,19,,99,11,7,6.0,7,4
top,Peter,King,Sales Representative,,2004-06-07 00:00:00,Shipping,2011 Interiors Blvd,99236.0,South San Francisco,United States of America
freq,3,2,30,,4,45,45,45.0,45,68
first,,,,,1987-09-17 00:00:00,,,,,
last,,,,,2011-02-06 00:00:00,,,,,
mean,,,,6461.682243,,,,,,
std,,,,3909.365746,,,,,,
min,,,,2100.0,,,,,,
25%,,,,3100.0,,,,,,


In [17]:
# Labels of the data columns; the index column (employee_id) is not among them.
emps.columns

Index(['first_name', 'last_name', 'job_title', 'salary', 'hire_date',
       'department_name', 'address', 'postal_code', 'city', 'country'],
      dtype='object')

In [18]:
# (rows, columns) tuple: number of rows is .shape[0], number of columns is .shape[1].
emps.shape

(107, 10)

In [20]:
# len() gives the number of rows; .size gives the total number of cells (rows * columns).
len(emps), emps.size

(107, 1070)