<a href="https://colab.research.google.com/github/nipunnirmal21/ai_job_data/blob/main/ai_job_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import io

In [3]:
df = pd.read_csv('/content/sample_data/ai_job_dataset.csv')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
print("--- First 5 Rows of the Dataset: ---")
print(df.head())

print("\n--- Basic Information about the Dataset: ---")
df.info()

--- First 5 Rows of the Dataset: ---
    job_id              job_title  salary_usd salary_currency  \
0  AI00001  AI Research Scientist       90376             USD   
1  AI00002   AI Software Engineer       61895             USD   
2  AI00003          AI Specialist      152626             USD   
3  AI00004           NLP Engineer       80215             USD   
4  AI00005          AI Consultant       54624             EUR   

  experience_level employment_type company_location company_size  \
0               SE              CT            China            M   
1               EN              CT           Canada            M   
2               MI              FL      Switzerland            L   
3               SE              FL            India            M   
4               EN              PT           France            S   

  employee_residence  remote_ratio  \
0              China            50   
1            Ireland           100   
2        South Korea             0   
3          

In [6]:
features = ['job_title', 'company_size', 'employment_type', 'industry']
target = 'experience_level'

df_clean = df[features + [target]].dropna()

In [7]:
X = df_clean[features]
y = df_clean[target]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42, stratify=y_encoded)

In [9]:
print(f"\nTraining set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")


Training set has 11250 samples.
Testing set has 3750 samples.


In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), features)
    ])

In [11]:
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

In [12]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

In [13]:
print("\n--- Training the AI model... ---")
pipeline.fit(X_train, y_train)
print("Training complete!")


--- Training the AI model... ---
Training complete!


In [14]:
print("\n--- Evaluating the model's performance on the test set: ---")
y_pred = pipeline.predict(X_test)


--- Evaluating the model's performance on the test set: ---


In [15]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")


Model Accuracy: 0.2456


In [16]:
print("\n--- Classification Report: ---")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


--- Classification Report: ---
              precision    recall  f1-score   support

          EN       0.24      0.24      0.24       930
          EX       0.25      0.24      0.25       940
          MI       0.25      0.26      0.26       945
          SE       0.24      0.24      0.24       935

    accuracy                           0.25      3750
   macro avg       0.25      0.25      0.25      3750
weighted avg       0.25      0.25      0.25      3750



In [17]:
new_job_data = pd.DataFrame({
    'job_title': ['Data Scientist'],
    'company_size': ['L'],
    'employment_type': ['FT'],
    'industry': ['Finance']
})

predicted_level_encoded = pipeline.predict(new_job_data)
predicted_level = label_encoder.inverse_transform(predicted_level_encoded)
print(f"\n--> Predicted Experience Level: {predicted_level[0]}")


--> Predicted Experience Level: EN
