### EDA2

Exploring new combinations of features.

In [1]:
import pandas as pd
import numpy as np

import ocifs

import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
# Global configuration.
# Not every dataset column is needed: this is the subset of columns that will be used.
MY_COLUMNS = [
    'Age',
    'Attrition',
    'EnvironmentSatisfaction',
    'MaritalStatus',
    'TravelForWork',
    'SalaryLevel',
    'JobFunction',
    'CommuteLength',
    'EducationalLevel',
    'EducationField',
    'MonthlyIncome',
    'OverTime',
    'StockOptionLevel',
    'TrainingTimesLastYear',
    'YearsSinceLastPromotion',
    'WorkLifeBalance',
]

In [3]:
def read_from_object_storage(prefix, file_name):
    """Read a CSV object from Object Storage and return it as a pandas DataFrame.

    The object location is the plain string concatenation ``prefix + file_name``.

    Parameters
    ----------
    prefix : str
        Leading part of the object path.
    file_name : str
        Name appended to ``prefix`` to form the full path.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    # Object Storage is accessed as a filesystem through ocifs.
    # Per the original note, config={} is assumed to select resource_principal auth.
    oci_fs = ocifs.OCIFileSystem(config={})

    object_path = prefix + file_name

    # open in binary mode and let pandas parse the stream
    with oci_fs.open(object_path, 'rb') as handle:
        return pd.read_csv(handle)

def compute_distinct(data, columns):
    """Count the distinct values of each requested column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to inspect.
    columns : iterable of str
        Names of the columns to count; any iterable is accepted.

    Returns
    -------
    pandas.DataFrame
        One row per requested column, with the column name in ``col`` and the
        result of ``Series.nunique()`` in ``count`` (missing values are not
        counted, as per the ``nunique`` default).
    """
    # Materialize once: `columns` is iterated for the counts AND stored as the
    # 'col' column, so a one-shot iterable would otherwise be exhausted.
    columns = list(columns)

    return pd.DataFrame({
        'col': columns,
        'count': [data[col].nunique() for col in columns],
    })

### Reading the dataset from the Object Storage (using OCIFS)

In [4]:
# Location of the input CSV: the full path is PREFIX + FILE_NAME.
# NOTE(review): presumably an oci://<bucket>@<namespace>/ URI — confirm it is fine to keep hard-coded.
PREFIX = "oci://data_input@fr95jjtqbdhh/"
FILE_NAME = "orcl_attrition.csv"

# load the raw dataset with read_from_object_storage (defined in the helper cell above)
data_orig = read_from_object_storage(prefix=PREFIX, file_name=FILE_NAME)

# NOTE(review): the rendered preview includes the `name` column with employee names;
# confirm the data is synthetic before sharing the notebook with outputs.
data_orig.head()

Unnamed: 0,Age,Attrition,TravelForWork,SalaryLevel,JobFunction,CommuteLength,EducationalLevel,EducationField,Directs,EmployeeNumber,...,WeeklyWorkedHours,StockOptionLevel,YearsinIndustry,TrainingTimesLastYear,WorkLifeBalance,YearsOnJob,YearsAtCurrentLevel,YearsSinceLastPromotion,YearsWithCurrManager,name
0,42,Yes,infrequent,5054,Product Management,2,L2,Life Sciences,1,1,...,80,0,8,0,1,6,4,0,5,Tracy Moore
1,50,No,often,1278,Software Developer,9,L1,Life Sciences,1,2,...,80,1,10,3,3,10,7,1,7,Andrew Hoover
2,38,Yes,infrequent,6296,Software Developer,3,L2,Other,1,4,...,80,0,7,3,3,0,0,0,0,Julie Bell
3,34,No,often,6384,Software Developer,4,L4,Life Sciences,1,5,...,80,0,8,3,3,8,7,3,0,Thomas Adams
4,28,No,infrequent,2710,Software Developer,3,L1,Medical,1,7,...,80,1,6,3,3,2,2,2,2,Johnathan Burnett


In [5]:
# list all column names of the raw dataset
data_orig.columns

Index(['Age', 'Attrition', 'TravelForWork', 'SalaryLevel', 'JobFunction',
       'CommuteLength', 'EducationalLevel', 'EducationField', 'Directs',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'WeeklyWorkedHours', 'StockOptionLevel',
       'YearsinIndustry', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsOnJob', 'YearsAtCurrentLevel', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'name'],
      dtype='object')

In [6]:
# per-column dtype and non-null count
data_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   TravelForWork             1470 non-null   object
 3   SalaryLevel               1470 non-null   int64 
 4   JobFunction               1470 non-null   object
 5   CommuteLength             1470 non-null   int64 
 6   EducationalLevel          1470 non-null   object
 7   EducationField            1470 non-null   object
 8   Directs                   1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [7]:
# number of distinct values for every column, in column order
for col, n_unique in data_orig.nunique().items():
    print(col, 'Unique values:', n_unique)

Age Unique values: 43
Attrition Unique values: 2
TravelForWork Unique values: 3
SalaryLevel Unique values: 886
JobFunction Unique values: 3
CommuteLength Unique values: 29
EducationalLevel Unique values: 5
EducationField Unique values: 6
Directs Unique values: 1
EmployeeNumber Unique values: 1470
EnvironmentSatisfaction Unique values: 4
Gender Unique values: 2
HourlyRate Unique values: 71
JobInvolvement Unique values: 4
JobLevel Unique values: 5
JobRole Unique values: 9
JobSatisfaction Unique values: 4
MaritalStatus Unique values: 3
MonthlyIncome Unique values: 1349
MonthlyRate Unique values: 1427
NumCompaniesWorked Unique values: 10
Over18 Unique values: 1
OverTime Unique values: 2
PercentSalaryHike Unique values: 15
PerformanceRating Unique values: 2
RelationshipSatisfaction Unique values: 4
WeeklyWorkedHours Unique values: 1
StockOptionLevel Unique values: 4
YearsinIndustry Unique values: 40
TrainingTimesLastYear Unique values: 7
WorkLifeBalance Unique values: 4
YearsOnJob Unique va

In [8]:
# First removal pass: drop columns that carry no usable signal.
# In the unique-value report above, Directs, Over18 and WeeklyWorkedHours have a
# single value, and EmployeeNumber has one distinct value per row.
to_remove = ['Directs', 'EmployeeNumber', 'name', 'Over18', 'WeeklyWorkedHours']

# remaining column names, alphabetically sorted
columns1 = sorted(set(data_orig.columns).difference(to_remove))

print('Rimangono:', len(columns1))

Rimangono: 31


In [9]:
# keep only the columns left after the first removal pass
data1 = data_orig[columns1]

In [10]:
# cross correlation matrix
# data1 still contains object (string) columns. Older pandas silently dropped them
# in DataFrame.corr(), but pandas >= 2.0 raises instead, so the numeric (and
# boolean) columns are selected explicitly; this works on every pandas version.
tab1 = data1.select_dtypes(include=['number', 'bool']).corr()

tab1

Unnamed: 0,Age,CommuteLength,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,...,RelationshipSatisfaction,SalaryLevel,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsAtCurrentLevel,YearsOnJob,YearsSinceLastPromotion,YearsWithCurrManager,YearsinIndustry
Age,1.0,-0.001686,0.010146,0.024287,0.02982,0.509604,-0.004892,0.497855,0.028051,0.299635,...,0.053535,0.010661,0.03751,-0.019621,-0.02149,0.212901,0.311309,0.216513,0.202089,0.680381
CommuteLength,-0.001686,1.0,-0.016075,0.031131,0.008783,0.005303,-0.003669,-0.017014,0.027473,-0.029251,...,0.006557,-0.004994,0.044872,-0.036942,-0.026556,0.018845,0.009508,0.010029,0.014406,0.004628
EnvironmentSatisfaction,0.010146,-0.016075,1.0,-0.049857,-0.008278,0.001212,-0.006784,-0.006259,0.0376,0.012594,...,0.007665,0.018355,0.003432,-0.019359,0.027627,0.018007,0.001458,0.016194,-0.004999,-0.002693
HourlyRate,0.024287,0.031131,-0.049857,1.0,0.042861,-0.027853,-0.071335,-0.015794,-0.015297,0.022157,...,0.00133,0.023373,0.050263,-0.008548,-0.004607,-0.024106,-0.019582,-0.026716,-0.020123,-0.002334
JobInvolvement,0.02982,0.008783,-0.008278,0.042861,1.0,-0.01263,-0.021476,-0.015271,-0.016322,0.015012,...,0.034297,0.046128,0.021523,-0.015338,-0.014617,0.008717,-0.021355,-0.024184,0.025976,-0.005533
JobLevel,0.509604,0.005303,0.001212,-0.027853,-0.01263,1.0,-0.001944,0.9503,0.039563,0.142501,...,0.021642,0.002969,0.013984,-0.018191,0.037818,0.389447,0.534739,0.353885,0.375281,0.782208
JobSatisfaction,-0.004892,-0.003669,-0.006784,-0.071335,-0.021476,-0.001944,1.0,-0.007157,0.000644,-0.055699,...,-0.012454,0.030568,0.01069,-0.005779,-0.019459,-0.002305,-0.003803,-0.018214,-0.027656,-0.020185
MonthlyIncome,0.497855,-0.017014,-0.006259,-0.015794,-0.015271,0.9503,-0.007157,1.0,0.034814,0.149515,...,0.025873,0.007709,0.005408,-0.021736,0.030683,0.363818,0.514285,0.344978,0.344079,0.772893
MonthlyRate,0.028051,0.027473,0.0376,-0.015297,-0.016322,0.039563,0.000644,0.034814,1.0,0.017521,...,-0.004085,-0.032184,-0.034323,0.001467,0.007963,-0.012815,-0.023655,0.001567,-0.036746,0.026442
NumCompaniesWorked,0.299635,-0.029251,0.012594,0.022157,0.015012,0.142501,-0.055699,0.149515,0.017521,1.0,...,0.052733,0.038141,0.030075,-0.066054,-0.008366,-0.090754,-0.118421,-0.036814,-0.110319,0.237639


In [11]:
# automatic search for strongly correlated feature pairs
# correlation threshold (on the absolute value) above which a pair is reported
THR = 0.7

for col in tab1.columns:
    print('********')
    print('Esamino:', col)
    # Exclude the column itself: its self-correlation is always 1 and would be
    # reported for every column. Compare on |corr| so that strong negative
    # correlations are reported as well.
    others = tab1[col].drop(labels=col, errors='ignore')
    cond = (others.abs() > THR)

    print(others[cond])
    print()

********
Esamino: Age


NameError: name 'THR' is not defined

In [None]:
# Second removal pass: additional columns to drop.
# NOTE(review): presumably chosen from the correlation scan above — the pair that
# motivates dropping PercentSalaryHike is not visible in the saved output; confirm.
correlated = ['JobLevel', 'YearsinIndustry','YearsOnJob','YearsWithCurrManager', 'PercentSalaryHike']

# Extend without duplicating entries, so that re-running this cell is idempotent
# (a plain `to_remove = to_remove + [...]` grows the list on every execution).
to_remove = to_remove + [col for col in correlated if col not in to_remove]

# a bare expression in the middle of a cell is not displayed, so print explicitly
print(to_remove)

# remaining column names, alphabetically sorted
columns2 = sorted(set(data_orig.columns) - set(to_remove))

print('Rimangono:', len(columns2))

In [None]:
data2 = data_orig[columns2]

# Correlation matrix on the reduced column set. Numeric (and boolean) columns are
# selected explicitly because pandas >= 2.0 raises on object columns in
# DataFrame.corr() instead of silently dropping them.
tab2 = data2.select_dtypes(include=['number', 'bool']).corr()

# repeat the scan (THR is defined in the previous scan cell)
for col in tab2.columns:
    print('********')
    print('Esamino:', col)
    # Exclude the column itself (self-correlation is always 1) and compare on
    # |corr| so that strong negative correlations are reported as well.
    others = tab2[col].drop(labels=col, errors='ignore')
    cond = (others.abs() > THR)

    print(others[cond])
    print()

In [13]:
# working frame used by the cells below: the raw data restricted to columns2
data = data_orig[columns2]

NameError: name 'columns2' is not defined

In [None]:
# number of rows (records) in the whole dataset
print("Numero di record nel dataset intero:", data.shape[0])

In [12]:
# Distribution of the target column (Attrition), drawn on an explicit Axes
FIGSIZE = (9, 6)
fig, ax = plt.subplots(figsize=FIGSIZE)
ax.set_title('Distribution of target values')
ax.grid(True)
sns.histplot(data['Attrition'], ax=ax)
plt.show()

NameError: name 'data' is not defined

#### il dataset è sbilanciato, la classe positiva è minoritaria

In [None]:
# negatives/positives ratio of the target, to be used for class weights
condition = data['Attrition'] == 'Yes'

# summing the boolean mask counts the rows where Attrition is 'Yes'
n_pos = int(condition.sum())
n_neg = len(data) - n_pos
ratio = n_neg / n_pos

print('Il rapporto negativi/positivi è:', round(ratio, 2))

In [None]:
# count the number of distinct values of each column
# to decide which ones to treat as categorical and which ones as numeric
count_df = compute_distinct(data, columns2)

count_df

In [None]:
# Horizontal bar chart of the distinct-value count of each feature.
# Both axes are read from count_df, so labels and counts cannot get out of sync
# (previously the labels came from the separate columns2 list).
sns.barplot(data=count_df, y='col', x='count')
plt.show()

In [None]:
FIGSIZE = (14, 20)
plt.figure(figsize=FIGSIZE)

# Grid layout: 4 plots per row, with as many rows as needed (ceiling division).
# A fixed 7x4 grid raises as soon as more than 28 columns are kept; for the
# current selection this still yields the same 7 rows.
n_grid_cols = 4
n_grid_rows = -(-len(columns2) // n_grid_cols)

# iterate over all the columns: one count plot per feature
for i, col in enumerate(columns2):
    plt.subplot(n_grid_rows, n_grid_cols, i+1)
    plt.grid(True)
    cp = sns.countplot(x=data[col])
    # small fonts so labels fit in the dense grid
    cp.set_xlabel(col,fontsize=8)
    cp.set_ylabel('count',fontsize=8)
    cp.tick_params(labelsize=6)
plt.show()

#### Dalle analisi sopra si giustifica di considerare **MonthlyIncome**, **MonthlyRate** e **SalaryLevel** come numeriche continue e tutte le altre come categoriche.

In [None]:
# display the names of the retained columns
columns2