In [85]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
import yaml

In [88]:

# Locations of the pre-split train/test CSV files.
train_data_path, test_data_path = 'artifact/train_data.csv', 'artifact/test_data.csv'

# Read both splits (train first, then test) into DataFrames.
train_df, test_df = map(pd.read_csv, (train_data_path, test_data_path))

In [None]:
# Removing unwanted columns from BOTH splits so train and test keep the same
# feature set. Dropping from train_df twice would raise KeyError (the columns
# are already gone after the first call) and leave test_df unchanged.
# NOTE: inplace drops are not idempotent; reload the frames before re-running.
train_df.drop(['case_id', 'yr_of_estab'], axis=1, inplace=True)
test_df.drop(['case_id', 'yr_of_estab'], axis=1, inplace=True)

In [None]:
# Remove fully repeated rows from each split in place; the first occurrence
# of every duplicated row is the one that is kept.
train_df.drop_duplicates(subset=None, keep='first', inplace=True)
test_df.drop_duplicates(subset=None, keep='first', inplace=True)

In [92]:
train_df.head(2)


Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV2404,Asia,High School,Y,Y,1411,2009,West,75107.37,Year,Y,Denied
1,EZYV6831,North America,Master's,Y,N,781,2012,West,102403.56,Year,Y,Denied


In [None]:
## Columns to drop: case_id, yr_of_estab

In [None]:
'''
1. removing duplicates across the whole dataset
2. removing the columns that are not necessary
3. splitting the data into numerical and categorical columns
    1. replacing null values across both categorical and numerical columns
    2. scaling of numerical columns
    3. ordinal encoding and one-hot encoding for categorical columns
'''

In [93]:
# Frequency of each establishment year in the training split.
# NOTE(review): this needs the 'yr_of_estab' column, which the
# "Removing unwanted columns" cell drops from train_df in place -- run this
# before that cell, or reload train_df first.
train_df['yr_of_estab'].value_counts()

yr_of_estab
1998    906
2005    857
2007    808
2001    793
1999    708
       ... 
1846      3
1824      2
1822      2
1810      2
1830      2
Name: count, Length: 199, dtype: int64

In [94]:
train_df['unit_of_wage'].value_counts()

unit_of_wage
Year     18386
Hour      1704
Week       222
Month       72
Name: count, dtype: int64

In [None]:
# Path of the YAML schema file, relative to the working directory.
schema_file_path = 'config/schema.yaml'

# Parse the schema with safe_load and keep the result in `schema`.
with open(schema_file_path, 'r') as file:
    schema = yaml.safe_load(file)

In [100]:
# Pull each column group out of the schema; a key that is absent yields [].
drop_cols, numerical_cols, ordinal_cols, one_hot_cols, transformer_cols = (
    schema.get(group_key, [])
    for group_key in (
        'drop_columns',
        'num_features',
        'or_columns',
        'oh_columns',
        'transform_columns',
    )
)

In [105]:
drop_cols

['case_id', 'yr_of_estab']

In [104]:
train_df.head(3)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV2404,Asia,High School,Y,Y,1411,2009,West,75107.37,Year,Y,Denied
1,EZYV6831,North America,Master's,Y,N,781,2012,West,102403.56,Year,Y,Denied
2,EZYV879,Asia,High School,Y,N,958,2005,South,89991.95,Year,Y,Certified


In [101]:
numerical_cols

['no_of_employees', 'prevailing_wage', 'company_age']

In [103]:
transformer_cols

['no_of_employees', 'company_age']

In [102]:
drop_cols

['case_id', 'yr_of_estab']

In [None]:

# File locations read by load_data() and load_schema() below.
# train_data_path, test_data_path and schema_file_path are re-declared here
# with the same values they were given in earlier cells.
raw_data_path = 'artifact/raw_data.csv'
train_data_path = 'artifact/train_data.csv'
test_data_path = 'artifact/test_data.csv'
schema_file_path = 'config/schema.yaml'


def load_data():
    """Read the raw, train and test CSV files into DataFrames.

    The file locations come from the module-level ``raw_data_path``,
    ``train_data_path`` and ``test_data_path`` variables.

    Returns:
        tuple: ``(raw_df, train_df, test_df)``, in that order.
    """
    csv_paths = (raw_data_path, train_data_path, test_data_path)
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

def load_schema():
    """Parse the YAML file at the module-level ``schema_file_path``.

    Returns:
        Whatever ``yaml.safe_load`` produces for that file.
    """
    with open(schema_file_path, mode='r') as schema_stream:
        return yaml.safe_load(schema_stream)



In [79]:
# Read the raw CSV into its own frame and preview the first rows.
raw = pd.read_csv(raw_data_path)
raw.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [81]:
# Re-read the training split from disk; this rebinds train_df, so any
# in-place edits made to the previous frame are discarded.
train_df = pd.read_csv(train_data_path)
train_df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV2404,Asia,High School,Y,Y,1411,2009,West,75107.37,Year,Y,Denied
1,EZYV6831,North America,Master's,Y,N,781,2012,West,102403.56,Year,Y,Denied
2,EZYV879,Asia,High School,Y,N,958,2005,South,89991.95,Year,Y,Certified
3,EZYV24062,Asia,Master's,N,N,900,1962,West,128104.61,Year,Y,Denied
4,EZYV2110,Europe,High School,N,N,3533,1993,West,12647.14,Year,Y,Denied


In [80]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,2403,EZYV2404,Asia,High School,Y,Y,1411,2009,West,75107.37,Year,Y,Denied
1,6830,EZYV6831,North America,Master's,Y,N,781,2012,West,102403.56,Year,Y,Denied
2,878,EZYV879,Asia,High School,Y,N,958,2005,South,89991.95,Year,Y,Certified
3,24061,EZYV24062,Asia,Master's,N,N,900,1962,West,128104.61,Year,Y,Denied
4,2109,EZYV2110,Europe,High School,N,N,3533,1993,West,12647.14,Year,Y,Denied


In [78]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,16745,EZYV16746,Asia,Doctorate,Y,N,1337,1989,South,199777.59,Year,Y,Certified
1,9526,EZYV9527,Asia,Master's,Y,N,1212,1909,West,69370.58,Year,Y,Certified
2,8358,EZYV8359,Asia,Bachelor's,Y,N,2253,1990,West,60749.34,Year,Y,Certified
3,18347,EZYV18348,Asia,Bachelor's,N,Y,14083,1914,West,70048.73,Year,Y,Denied
4,2173,EZYV2174,Asia,Bachelor's,Y,N,2940,1840,Northeast,127180.67,Year,Y,Certified


In [37]:
# Load all three frames and the schema through the helper functions.
# This rebinds train_df and test_df to freshly read copies.
raw_df, train_df,test_df = load_data()
schema = load_schema()

In [39]:
len(raw_df) 

25480

In [69]:
raw_df.head(2)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified


In [45]:
# Each schema['columns'] entry is a mapping; its first key is taken as the column name.
schema_columns = [list(column_entry)[0] for column_entry in schema['columns']]

In [49]:
def validate_columns():
    """Check that ``raw_df`` has the columns listed in ``schema['columns']``.

    Reads the module-level ``schema`` and ``raw_df``. Each entry of
    ``schema['columns']`` is a mapping whose first key is the column name.
    Every problem found is printed; the success message is printed only when
    no problem was found.

    Returns:
        bool: True when the column counts match and every schema column is
        present in the data, otherwise False.
    """
    schema_columns = [list(col.keys())[0] for col in schema['columns']]
    data_columns = list(raw_df.columns)
    is_valid = True

    if len(data_columns) != len(schema_columns):
        print(f"column mismatch occured, schema columns : {len(schema_columns)}, data columns : {len(data_columns)}")
        is_valid = False

    for column in schema_columns:
        if column not in data_columns:
            print(f"columns:{column}, not found in the data")
            is_valid = False

    # Report success only when nothing above flagged a problem.
    if is_valid:
        print('Validation succesfull!')
    else:
        print('Validation failed!')
    return is_valid

In [50]:
validate_columns()

Validation succesfull!


In [51]:
raw_df.head(2)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified


In [56]:
# Names of the numeric-dtype columns actually present in the raw data ...
data_numerical_columns = list(raw_df.select_dtypes(include='number').columns)
# ... and the numeric columns the schema declares.
schema_numerical_columns = [column_name for column_name in schema['numerical_columns']]


In [57]:
schema_numerical_columns

['no_of_employees', 'prevailing_wage', 'yr_of_estab']

In [55]:
data_numerical_columns

['no_of_employees', 'yr_of_estab', 'prevailing_wage']

In [None]:
# Report every schema-declared numeric column that is not numeric in the data.
# (The loop previously had no body, which is a SyntaxError.)
for column in schema_numerical_columns:
    if column not in data_numerical_columns:
        print(f"columns:{column}, not found in the data")

In [None]:
def validate_numerical_columns():
    """Check the numeric columns of ``raw_df`` against ``schema['numerical_columns']``.

    Reads the module-level ``schema`` and ``raw_df``. The data side is the
    list of column names selected by ``select_dtypes(include=['number'])``.
    Every problem found is printed; the success message is printed only when
    no problem was found.

    Returns:
        bool: True when the counts match and every schema numeric column is
        numeric in the data, otherwise False.
    """
    data_numerical_columns = list(raw_df.select_dtypes(include=['number']))
    schema_numerical_columns = list(schema['numerical_columns'])
    is_valid = True

    if len(data_numerical_columns) != len(schema_numerical_columns):
        # data_numerical_columns is a plain list, so use len() on it directly
        # (it has no .columns attribute).
        print(f"column mismatch occured, schema columns has: {len(schema_numerical_columns)} columns, data columns : {len(data_numerical_columns)} columns")
        is_valid = False

    for column in schema_numerical_columns:
        if column not in data_numerical_columns:
            print(f"columns:{column}, not found in the data")
            is_valid = False

    # Report success only when nothing above flagged a problem.
    if is_valid:
        print('Validation for numerical columns succesfull!')
    else:
        print('Validation for numerical columns failed!')
    return is_valid

In [62]:
data_numerical_columns

['no_of_employees', 'yr_of_estab', 'prevailing_wage']

In [63]:
schema_numerical_columns

['no_of_employees', 'prevailing_wage', 'yr_of_estab']

In [67]:
def validate_categorical_columns():
    """Check the object-dtype columns of ``raw_df`` against ``schema['categorical_columns']``.

    Reads the module-level ``schema`` and ``raw_df``. The data side is the
    list of column names selected by ``select_dtypes(include=['object'])``.
    Every problem found is printed; the success message is printed only when
    no problem was found.

    Returns:
        bool: True when the counts match and every schema categorical column
        is an object-dtype column in the data, otherwise False.
    """
    data_categorical_columns = list(raw_df.select_dtypes(include=['object']))
    schema_categorical_columns = list(schema['categorical_columns'])
    is_valid = True

    if len(data_categorical_columns) != len(schema_categorical_columns):
        # data_categorical_columns is a plain list, so use len() on it directly
        # (it has no .columns attribute).
        print(f"column mismatch occured, schema columns has: {len(schema_categorical_columns)} columns, data columns : {len(data_categorical_columns)} columns")
        is_valid = False

    for column in schema_categorical_columns:
        if column not in data_categorical_columns:
            print(f"columns:{column}, not found in the data")
            is_valid = False

    # Report success only when nothing above flagged a problem.
    if is_valid:
        print('Validation for categorical columns succesfull!')
    else:
        print('Validation for categorical columns failed!')
    return is_valid
    

In [68]:
validate_categorical_columns()

Validation for categorical columns succesfull!


In [None]:
# NOTE(review): despite its name, this rebinds data_numerical_columns to the
# object-dtype column names, replacing the numeric list built in an earlier cell.
data_numerical_columns = list(raw_df.select_dtypes(include=['object']))

In [65]:
data_numerical_columns

['case_id',
 'continent',
 'education_of_employee',
 'has_job_experience',
 'requires_job_training',
 'region_of_employment',
 'unit_of_wage',
 'full_time_position',
 'case_status']

In [66]:
raw_df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified
