# Data Wrangling

## Set up Function Library

In [None]:
github_name = 'maguirr4-uo'
repo_name = 'cis423'
source_file = 'library.py'
url = f'https://raw.githubusercontent.com/{github_name}/{repo_name}/main/{source_file}'
!rm $source_file
!wget $url
%run -i $source_file

## Bring in the dataset

In [None]:
# Read the EmployeeFuture CSV directly from the course repository.
url = (
    'https://raw.githubusercontent.com/'
    'maguirr4-uo/cis423/main/EmployeeFuture.csv'
)
future_df = pd.read_csv(url)
future_df  # last expression of the cell, so the notebook renders the frame

In [None]:
# Fraction of rows whose LeaveOrNot value equals 1.
leave_flags = future_df['LeaveOrNot'].to_list()
leave_flags.count(1) / len(leave_flags)

## Break down into features and labels

In [None]:
# Pull the target column out as a plain Python list, then keep every other
# column as the feature frame.
labels = future_df['LeaveOrNot'].tolist()
future_features = future_df.drop('LeaveOrNot', axis=1)
labels[0:5]  # peek at the first few labels

In [None]:
# Display the feature frame (every column of future_df except LeaveOrNot).
future_features

# Defining the wrangling pipeline and transforming the features

In [None]:
# Wrangling pipeline: ordinal-encode the categorical columns, one-hot encode
# City, apply the Tukey transformer to the two numeric columns, then min-max scale.
future_transformer = Pipeline(steps=[
    ('education', MappingTransformer('Education', {'Bachelors': 0, 'Masters': 1, 'PHD': 2})),
    # Keys are ints, not strings: assuming JoiningYear holds bare years such as
    # 2012, pd.read_csv infers an integer dtype, so string keys like '2012'
    # would never match any value in the column.
    # The encoding counts years back from 2018 (2018 -> 0, ..., 2012 -> 6).
    ('year', MappingTransformer('JoiningYear', {
        2012: 6,
        2013: 5,
        2014: 4,
        2015: 3,
        2016: 2,
        2017: 1,
        2018: 0,
    })),
    ('gender', MappingTransformer('Gender', {'Male': 0, 'Female': 1})),
    ('benched', MappingTransformer('EverBenched', {'No': 0, 'Yes': 1})),
    ('ohe', OHETransformer('City')),
    ('age', TukeyTransformer('Age', 'outer')),
    ('exp', TukeyTransformer('ExperienceInCurrentDomain', 'outer')),
    ('scale', MinMaxTransformer()),
    ], verbose=True)

# Fit every step on the full feature frame and keep the transformed result.
future_transformed_df = future_transformer.fit_transform(future_features)

In [None]:
# Display the output of future_transformer.fit_transform for inspection.
future_transformed_df

## Find a random state value to save

In [None]:
# find_random_state is not defined in this notebook; presumably it comes from
# the library.py loaded in the first cell. It is given the transformed features
# and the labels, and its result is reused below as the split's random_state.
# NOTE(review): the trailing "#77" looks like the value a previous run
# produced; confirm before relying on it.
rs = find_random_state(future_transformed_df, labels)  #77
rs

## Store the test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the *untransformed* features, stratified on the target
# column; rs fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    future_features,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=rs,
    stratify=future_df['LeaveOrNot'],
)

# index=False is the documented boolean for leaving the row index out of the
# file (the earlier index=None only worked because None is falsy).
X_test.to_csv('test_df.csv', index=False)

In [None]:
# Preview the first rows of the held-out feature frame written to test_df.csv.
X_test.head()