# Exploratory Data Analysis - Univariate Analysis

In [1]:
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets
from scipy.stats import shapiro
import statsmodels.api as sm

In [2]:
# Read the cleaned dataset into a dataframe; the path is relative to this notebook
cleaned_data_path = './../../../data/cleaned_data.csv'
df = pd.read_csv(cleaned_data_path)

In [3]:
# Load the lists of numerical and categorical column names from the statics file.
# The encoding is passed explicitly because open() otherwise falls back to a
# platform-dependent default, while JSON text is conventionally UTF-8.
with open('./../../../data/statics.json', encoding='utf-8') as f:
    statics = json.load(f)
categorical_columns = statics['categorical_columns']
numerical_columns = statics['numerical_columns']


In [4]:
# Separate the dataframe into a numerical dataframe and a categorical dataframe,
# using the column lists loaded from the statics file
num_df = df[numerical_columns]
cat_df = df[categorical_columns]

## Numerical Columns

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numerical variables
num_df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EmployeeNumber,HourlyRate,JobLevel,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,1024.865306,65.891156,2.063946,6502.931293,14313.103401,2.693197,15.209524,0.793878,11.279592,2.79932,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,602.024335,20.329428,1.10694,4707.956783,7117.786044,2.498009,3.659938,0.852077,7.780782,1.289271,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,30.0,1.0,1009.0,2094.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,491.25,48.0,1.0,2911.0,8047.0,1.0,12.0,0.0,6.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,1020.5,66.0,2.0,4919.0,14235.5,2.0,14.0,1.0,10.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,1555.75,83.75,3.0,8379.0,20461.5,4.0,18.0,1.0,15.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,2068.0,100.0,5.0,19999.0,26999.0,9.0,25.0,3.0,40.0,6.0,40.0,18.0,15.0,17.0


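From the above table, it can be observed that some columns are highly right-skewed, notably $MonthlyIncome$, $YearsAtCompany$, and $YearsSinceLastPromotion$: for each of them the mean is well above the median and the maximum lies far beyond the 75th percentile (e.g. $MonthlyIncome$ has a mean of about 6503 against a median of 4919). More information can be obtained by observing the distribution of each variable.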

In [6]:
# Create interactive plots: a dropdown selects a numerical column, and a
# histogram and a box plot are redrawn for the selected column.

# Create a widget for selecting the column. The label is longer than the default
# description area, so the description width is set to 'initial' to let it size
# itself instead of being truncated.
numcols = widgets.Dropdown(options = numerical_columns, value = numerical_columns[0],
                           description="Numerical columns",
                           style={'description_width': 'initial'})

# Create plotly trace of histogram (bar heights normalised to probabilities)
num_trace1 = go.Histogram(x=num_df[numcols.value],
                         histnorm='probability',
                         name = 'Distribution')

# Create plotly trace of box plot (only outlying points are drawn individually)
num_trace2 = go.Box(x=num_df[numcols.value],
                   boxpoints='outliers', name = 'Quartiles representation')

# Create a widget for histogram
ng1 = go.FigureWidget(data=[num_trace1],
                     layout = go.Layout(
                         title = dict(text='Distribution of features')
                     ))

# Create a widget for box plot
ng2 = go.FigureWidget(data=[num_trace2],
                     layout = go.Layout(
                         title = dict(text='Quartiles representation of features')
                     ))


def _show_numerical_column(column):
    """
    Redraw the histogram and the box plot for the given numerical column.

    Parameters
    ----------
    column : str
        Name of the column of ``num_df`` to plot.
    """
    # batch_update groups the data and layout changes of each figure into a
    # single update of that figure.
    with ng1.batch_update():
        ng1.data[0].x = num_df[column]
        ng1.layout.xaxis.title.text = 'Distribution of ' + str(column) + ' variable'

    with ng2.batch_update():
        ng2.data[0].x = num_df[column]
        ng2.layout.xaxis.title.text = str(column)


# Create a function for observing the change in the selection
def num_response(change):
    """
    Function to update the values in the graph based on the selected column.

    Parameters
    ----------
    change : object
        Change notification passed by the dropdown observer. It is not
        inspected; the current dropdown value is read directly instead.
    """
    _show_numerical_column(numcols.value)


numcols.observe(num_response, names='value')

# Sync the figures once so the initial view already carries the same axis
# titles that a later dropdown change would set.
_show_numerical_column(numcols.value)

num_container = widgets.VBox([numcols, ng1, ng2])

In [7]:
# Render the dropdown together with the histogram and box plot widgets
display(num_container)

VBox(children=(Dropdown(description='Numerial columns', options=('Age', 'DailyRate', 'DistanceFromHome', 'Empl…

From the above distributions, the following observations can be made:
- The average age of the participants is about 37 years, while the median age is 36 years. Almost the entire working-age population is represented, from the age of 18 to the age of 60. There are no outliers in the dataset as far as age is concerned.
- The variables that approximately follow a uniform distribution are daily rate, hourly rate (with the exception of the very top of its range, at its maximum value of 100), and monthly rate.
- The positively skewed variables include distance from home, monthly income, number of companies worked, percentage salary hike, total working years, and years at the company.
- Two variables have double peaks (bimodal distributions): years in current role and years since last promotion.
- Only one variable, the number of trainings in the last year, appears to approximately follow a normal distribution.
- Outliers are present in variables such as monthly income, number of companies worked, total working years, number of trainings in the last year, years at company, years in current role, years since last promotion, and years with current manager. To decide whether to keep or remove these outliers, a closer look at these variables is required.