In [85]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Read data/raw/train.csv into `df`, the frame the following cells work on.
df = pd.read_csv(filepath_or_buffer='data/raw/train.csv')

In [86]:
# Select appropriate columns
# ('is_promoted' is the label; every other entry is a feature)

columns = ['education', 'gender', 'recruitment_channel', 'no_of_trainings','age','previous_year_rating','length_of_service','KPIs_met >80%','awards_won?','avg_training_score', 'is_promoted']
# Take an explicit copy: the assignments below then modify an independent
# frame rather than a selection of the loaded one (no SettingWithCopyWarning).
df = df[columns].copy()

# Encode the categorical ones

def _encode_categories(series, codes):
    """Return `series` with each label found in `codes` swapped for its code.

    Values that are not keys of `codes` (NaN, or values that are already
    numeric) pass through unchanged, as they would with `Series.replace`.
    The result goes through `pd.to_numeric`, so the column gets a numeric
    dtype and an unexpected string label raises a ValueError right here.
    """
    return pd.to_numeric(series.map(lambda value: codes.get(value, value)))

category_codes = {
    'education': {"Below Secondary": 0, "Bachelor's": 1, "Master's & above": 2},
    'gender': {"m": 0, "f": 1},
    # "other" gets no numeric code: it is turned into NaN and mean-imputed below.
    # NOTE(review): confirm that treating "other" as missing is intended.
    'recruitment_channel': {"sourcing": 0, "referred": 1, "other": np.nan},
}
for col, codes in category_codes.items():
    df[col] = _encode_categories(df[col], codes)

# Standardize the numerical ones (zero mean, unit sample standard deviation;
# NaNs are skipped by mean()/std() and stay NaN until the imputation step)

numeric_cols = ['no_of_trainings','age','previous_year_rating','length_of_service','KPIs_met >80%','awards_won?','avg_training_score']
df[numeric_cols] = (df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std()

# Fill in missing values using the mean
# Only the features are imputed: a missing 'is_promoted' label is left as NaN
# instead of being replaced by a fractional mean value.

feature_cols = [col for col in columns if col != 'is_promoted']
df[feature_cols] = df[feature_cols].fillna(df[feature_cols].mean())

In [77]:
# Make a PCA analysis

from sklearn.decomposition import PCA

# Fit on the features only: 'is_promoted' is the label used to colour the
# scatter, so it must not take part in building the components.
feature_cols = [col for col in columns if col != 'is_promoted']
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df[feature_cols])

# One axes per plot; drawing both on the implicit current axes would put the
# bar chart on top of the scatter.
fig, (ax_scatter, ax_weights) = plt.subplots(1, 2, figsize=(14, 5))

sns.scatterplot(x=df_pca[:,0], y=df_pca[:,1], hue=df['is_promoted'], ax=ax_scatter)
ax_scatter.set_xlabel('PC1')
ax_scatter.set_ylabel('PC2')

# Plot a barplot with the weights of the first component and its labels

ax_weights.barh(y=feature_cols, width=pca.components_[0])
ax_weights.set_xlabel('PC1 weight')
fig.tight_layout();

array([1.72047052, 1.36468391])

In [87]:
# Write the current `df` to data/processed/train.csv, leaving out the row index.
df.to_csv(path_or_buf='data/processed/train.csv', index=False)