<a href="https://colab.research.google.com/github/nivyakoshy/DiabetesDataset/blob/main/assesment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


In [3]:
# Load the dataset.
# DATA_DIR is the single location-specific value in this notebook; its default
# reproduces the original hard-coded "/content/..." paths. Change it here to
# read the two CSV files from another directory.
DATA_DIR = "/content"
train_data = pd.read_csv(f"{DATA_DIR}/train_LZdllcl.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_2umaH9m.csv")

In [4]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets Jupyter render it as a table; print() falls back to plain text, which
# wraps the 14 columns into several backslash-continued chunks.
train_data.head()

   employee_id         department     region         education gender  \
0        65438  Sales & Marketing   region_7  Master's & above      f   
1        65141         Operations  region_22        Bachelor's      m   
2         7513  Sales & Marketing  region_19        Bachelor's      m   
3         2542  Sales & Marketing  region_23        Bachelor's      m   
4        48945         Technology  region_26        Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   35                   5.0   
1               other                1   30                   5.0   
2            sourcing                1   34                   3.0   
3               other                2   39                   1.0   
4               other                1   45                   3.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  \
0                  8              1            0                  49   
1 

In [5]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line after the report.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB
None


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# columns. Shown as the cell's last expression so Jupyter renders one table
# instead of print()'s wrapped plain text.
train_data.describe()

        employee_id  no_of_trainings           age  previous_year_rating  \
count  54808.000000     54808.000000  54808.000000          50684.000000   
mean   39195.830627         1.253011     34.803915              3.329256   
std    22586.581449         0.609264      7.660169              1.259993   
min        1.000000         1.000000     20.000000              1.000000   
25%    19669.750000         1.000000     29.000000              3.000000   
50%    39225.500000         1.000000     33.000000              3.000000   
75%    58730.500000         1.000000     39.000000              4.000000   
max    78298.000000        10.000000     60.000000              5.000000   

       length_of_service  KPIs_met >80%   awards_won?  avg_training_score  \
count       54808.000000   54808.000000  54808.000000        54808.000000   
mean            5.865512       0.351974      0.023172           63.386750   
std             4.265094       0.477590      0.150450           13.371559   
min    

In [7]:
# Number of missing entries in each column of the training frame
print(train_data.isna().sum())

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64


In [8]:
# Columns of the training frame that contain gaps; every missing entry is
# replaced by the first mode reported for its own column.
cf = ['education', 'previous_year_rating']
for col in cf:
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])

# Confirm that no column reports missing entries any more
print(train_data.isna().sum())

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64


In [9]:
# Forward-fill any gap still present in the training frame (the null count
# printed by the previous cell is zero for every column, so this is expected
# to change nothing).
# DataFrame.ffill() is used because fillna(method=...) is deprecated since
# pandas 2.1; the result is rebound instead of mutating the frame in place.
train_data = train_data.ffill()

In [10]:
# Integer-encode the categorical columns, keeping one fitted encoder per
# column so the same mapping can be applied to other frames later.
categorical_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for col in categorical_cols:
    train_data[col] = label_encoders[col].fit_transform(train_data[col])


In [11]:
# Target is the promotion flag; the features are every other column except
# the employee identifier.
y = train_data['is_promoted']
X = train_data.drop(columns=['employee_id', 'is_promoted'])

In [12]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Random forest of 100 trees with a fixed seed, fitted on the training split
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_classifier.fit(X=X_train, y=y_train)

In [14]:
# Predicted labels for the held-out validation rows
y_pred = rf_classifier.predict(X=X_val)

In [15]:
# F1 score of the validation predictions against the true labels
f1 = f1_score(y_true=y_val, y_pred=y_pred)
print("F1 Score:", f1)

F1 Score: 0.417277913610432


In [16]:
# Forward-fill gaps in the test frame with the value from the row above.
# DataFrame.ffill() is used because fillna(method=...) is deprecated since
# pandas 2.1; the result is rebound instead of mutating the frame in place.
# A gap with no valid value above it cannot be filled this way and stays NaN
# (the later null check reports one such value in previous_year_rating).
# NOTE(review): forward-fill makes the imputed value depend on row order,
# whereas the training frame was imputed with column modes -- confirm this
# difference is intended.
test_data = test_data.ffill()

In [17]:
# Map the categorical test columns to integers with the already-fitted
# per-column encoders held in label_encoders.
for col in categorical_cols:
    fitted = label_encoders[col]
    test_data[col] = fitted.transform(test_data[col])

In [19]:
# Report how many missing entries each test column still has
missing_values = test_data.isna().sum()
print("Missing values in test dataset:", missing_values, sep="\n")

Missing values in test dataset:
employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    1
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64


In [20]:
# Fill what is still missing in the test frame with the most frequent value of
# the corresponding TRAINING column. Imputation statistics are taken from the
# training data only, so the test set is filled with a value the model's
# training data was built from rather than one derived from the test set.
ef = ['previous_year_rating']
for i in ef:
    mode_value = train_data[i].mode()[0]
    test_data[i] = test_data[i].fillna(mode_value)

# Check if missing values are filled
print(test_data.isnull().sum())

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64


In [22]:
# Ensure the categorical columns of both frames are integer-encoded.
# The encoders already stored in label_encoders (fitted on the raw training
# strings) are reused. Re-fitting here would fit on columns that are already
# integers and overwrite label_encoders with identity mappings, losing the
# original string <-> code correspondence.
categorical_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']
for col in categorical_cols:
    for frame in (train_data, test_data):
        # Only transform columns that are not numeric yet, so re-running this
        # cell (or running it after the earlier encoding cells) is a no-op.
        if not pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = label_encoders[col].transform(frame[col])

In [23]:
# Test features are every column except the employee identifier; the fitted
# forest then predicts one label per test row.
test_X = test_data.drop(columns=['employee_id'])
test_predictions = rf_classifier.predict(X=test_X)

In [29]:
# Pair each employee id with its predicted label and write the result to CSV
# without the index column.
submission_df = test_data[['employee_id']].assign(is_promoted=test_predictions)
submission_df.to_csv('assesment.csv', index=False)