In [None]:
import pandas as pd

# Read the train, test and sample-submission CSV files (paths are relative
# to the current working directory), in that order.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Preview the top rows of each frame to see which columns and values they hold
(train_df.head(), test_df.head(), sample_submission.head())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Tag every row with its origin (0 = train, 1 = test) so the two sets can be
# separated again after shared preprocessing. The test frame also receives an
# all-NaN `is_promoted` column as a placeholder for column alignment.
train_df = train_df.assign(is_test=0)
test_df = test_df.assign(is_test=1, is_promoted=np.nan)

# Stack train on top of test under a fresh 0..n-1 index and preview the result
combined_df = pd.concat((train_df, test_df), ignore_index=True)
combined_df.head()


In [None]:
# Integer-encode the categorical columns of the combined frame.
# One LabelEncoder per column is kept in `label_encoders` (keyed by column
# name) so the integer codes can be mapped back to the original labels later.
categorical_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, encoder in label_encoders.items():
    # Values are cast to str before encoding, so a missing value is
    # stringified too (e.g. NaN -> 'nan') and is encoded like any other label.
    as_text = combined_df[col].astype(str)
    combined_df[col] = encoder.fit_transform(as_text)


In [None]:
# Handle missing values
# Fill gaps in `previous_year_rating` with the most frequent rating.
# The fill value is learned from the training rows only (is_test == 0) so that
# test-set values cannot influence preprocessing; it is then applied to every
# row of the combined frame.
# NOTE(review): the validation split happens after this step, so rows that end
# up in the validation set still contribute to the fill value.
rating_col = ['previous_year_rating']
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(combined_df.loc[combined_df['is_test'] == 0, rating_col])
combined_df[rating_col] = imputer.transform(combined_df[rating_col])


In [None]:
# Undo the concatenation: use the `is_test` flag to recover the two sets,
# then drop the helper column. The test part also loses `is_promoted`.
train_rows = combined_df['is_test'].eq(0)
test_rows = combined_df['is_test'].eq(1)

train_processed = combined_df.loc[train_rows].drop(columns=['is_test'])
test_processed = combined_df.loc[test_rows].drop(columns=['is_test', 'is_promoted'])


In [None]:
# Features and target
# The identifier column is not a predictor, so it is removed from both
# feature matrices; the target is additionally removed from the training one.
X = train_processed.drop(columns=['employee_id', 'is_promoted'])
# `is_promoted` was upcast to float when the test rows received a NaN
# placeholder; cast it back to int so the class labels are 0/1, not 0.0/1.0.
y = train_processed['is_promoted'].astype(int)
X_test_final = test_processed.drop(columns=['employee_id'])

In [None]:
# Train/validation split for model evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the rows for validation
    stratify=y,       # keep the class proportions equal in both parts
    random_state=42,  # fixed seed so the split is reproducible
)


In [None]:
# Fit a 100-tree random forest on the training part of the split.
# class_weight='balanced' reweights the classes by their inverse frequency;
# the fixed random_state makes the fit repeatable.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Evaluate on validation set
# Predict the held-out rows and summarise precision / recall / F1 per class.
y_val_pred = model.predict(X_val)
classification_report_output = classification_report(y_val, y_val_pred)
# The report is a pre-formatted multi-line string: print it so the table is
# rendered, instead of echoing its repr with literal "\n" escapes.
print(classification_report_output)

In [None]:
# Apply the fitted model to the preprocessed test features; the result holds
# one predicted class label per test row.
test_predictions = model.predict(X_test_final)


In [None]:
# Build the submission table: the employee ids from the test frame next to
# the predicted labels cast to int. The predictions are a plain array, so
# they are attached by position.
submission = test_df[['employee_id']].assign(
    is_promoted=test_predictions.astype(int)
)


In [None]:
# Write the submission to CSV without the index column
submission_path = 'promotion_submission.csv'
submission.to_csv(path_or_buf=submission_path, index=False)

# Show the first rows together with the path that was written
(submission.head(), submission_path)