### Data Preprocessing - Combined Dataset

In [113]:
# Importing necessary libraries
import pandas as pd
import numpy as np

In [114]:
# Reading the dataset
data = pd.read_csv('data/dataset.csv')
data.head()

Unnamed: 0,domain,designation,age,degree,cs_bg,experience,avg_time,holidays,week_back,city,...,linux,sas,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary
0,web development using python django,software engineer,22.0,No Degree,No,No,9.0,No,0.0,bangalore,...,0,0,0,0,0,0,0,0,0,3.0
1,web development using node.js express,software engineer,21.0,Degree,No,No,12.0,No,2.0,wfh,...,0,0,0,0,0,0,0,0,0,2.16
2,web development using python django,nodejs developer,22.0,Degree,Yes,No,9.0,Yes,4.0,bangalore,...,0,0,0,0,0,0,0,0,0,2.4
3,mern stack,software developer,23.0,No Degree,No,No,11.0,Yes,7.0,chennai,...,0,0,0,0,0,0,0,0,0,3.55
4,web development using python django,software engineer,22.0,No Degree,No,No,10.0,No,4.0,bangalore,...,0,0,0,0,0,0,0,0,0,2.4


In [115]:
data.columns

Index(['domain', 'designation', 'age', 'degree', 'cs_bg', 'experience',
       'avg_time', 'holidays', 'week_back', 'city', 'graduated', 'flutter',
       'angular', 'python', 'express', 'firebase', 'aws', 'sql', 'javascript',
       'jwt', 'node', 'html', 'ajax', 'hasura', 'communication', 'git', 'jest',
       'mysql', 'react', 'graphql', 'mongodb', 'cnn', 'postman', 'nginx',
       'dart', 'vim', 'django', 'mui', 'typescript', 'opencv', 'jquery',
       'next', 'linux', 'sas', 'vercel', 'redux', 'api', 'bootstrap',
       'mongoose', 'tailwind', 'docker', 'salary'],
      dtype='object')

In [116]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 52 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   domain         669 non-null    object 
 1   designation    669 non-null    object 
 2   age            669 non-null    float64
 3   degree         669 non-null    object 
 4   cs_bg          669 non-null    object 
 5   experience     669 non-null    object 
 6   avg_time       669 non-null    float64
 7   holidays       669 non-null    object 
 8   week_back      669 non-null    float64
 9   city           669 non-null    object 
 10  graduated      669 non-null    object 
 11  flutter        669 non-null    int64  
 12  angular        669 non-null    int64  
 13  python         669 non-null    int64  
 14  express        669 non-null    int64  
 15  firebase       669 non-null    int64  
 16  aws            669 non-null    int64  
 17  sql            669 non-null    int64  
 18  javascript

We can drop the column `city` as it has misentries from imputation.

In [117]:
# Dropping the column city
data = data.drop('city', axis=1)
data.head()

Unnamed: 0,domain,designation,age,degree,cs_bg,experience,avg_time,holidays,week_back,graduated,...,linux,sas,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary
0,web development using python django,software engineer,22.0,No Degree,No,No,9.0,No,0.0,Yes,...,0,0,0,0,0,0,0,0,0,3.0
1,web development using node.js express,software engineer,21.0,Degree,No,No,12.0,No,2.0,Yes,...,0,0,0,0,0,0,0,0,0,2.16
2,web development using python django,nodejs developer,22.0,Degree,Yes,No,9.0,Yes,4.0,Yes,...,0,0,0,0,0,0,0,0,0,2.4
3,mern stack,software developer,23.0,No Degree,No,No,11.0,Yes,7.0,Yes,...,0,0,0,0,0,0,0,0,0,3.55
4,web development using python django,software engineer,22.0,No Degree,No,No,10.0,No,4.0,Yes,...,0,0,0,0,0,0,0,0,0,2.4


In [118]:
data.shape

(669, 51)

We have 669 rows and 51 columns

### Encoding Categorical Columns

1. Using **Label encoding** for `degree` column because it has a meaningful ordinal relationship

In [119]:
# Importing label encoder from sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

In [120]:
pre_data = data.copy()

pre_data['degree_encoded'] = label_encoder.fit_transform(pre_data['degree'])
pre_data.drop('degree', axis=1, inplace=True)
pre_data.head()

Unnamed: 0,domain,designation,age,cs_bg,experience,avg_time,holidays,week_back,graduated,flutter,...,sas,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary,degree_encoded
0,web development using python django,software engineer,22.0,No,No,9.0,No,0.0,Yes,0,...,0,0,0,0,0,0,0,0,3.0,2
1,web development using node.js express,software engineer,21.0,No,No,12.0,No,2.0,Yes,0,...,0,0,0,0,0,0,0,0,2.16,0
2,web development using python django,nodejs developer,22.0,Yes,No,9.0,Yes,4.0,Yes,0,...,0,0,0,0,0,0,0,0,2.4,0
3,mern stack,software developer,23.0,No,No,11.0,Yes,7.0,Yes,0,...,0,0,0,0,0,0,0,0,3.55,2
4,web development using python django,software engineer,22.0,No,No,10.0,No,4.0,Yes,0,...,0,0,0,0,0,0,0,0,2.4,2


In [121]:
pre_data['degree_encoded'].value_counts()

2    338
0    328
1      3
Name: degree_encoded, dtype: int64

  **Legend - degree_encoded**
  
    2 - No degree

    0 - Degree

    1 - Diploma

2. **One Hot Encoding** for `domain` column as the values has no meaningful order and each category is distinct 

In [122]:
from sklearn.compose import ColumnTransformer

columns_to_encode = ['domain', 'cs_bg', 'experience', 'holidays', 'graduated']

# Apply one-hot encoding to the categorical columns
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), columns_to_encode)],
    remainder='passthrough'
)
encoded_features = column_transformer.fit_transform(pre_data)

# Get the feature names after one-hot encoding
feature_names = column_transformer.named_transformers_['encoder'].get_feature_names_out(columns_to_encode)

# Create a new DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=list(feature_names) + list(pre_data.columns[len(columns_to_encode):]))
pre_data = pd.concat([pre_data, encoded_df], axis=1)

pre_data.head()

Unnamed: 0,domain,designation,age,cs_bg,experience,avg_time,holidays,week_back,graduated,flutter,...,sas,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary,degree_encoded
0,web development using python django,software engineer,22.0,No,No,9.0,No,0.0,Yes,0,...,0,0,0,0,0,0,0,0,3.0,2
1,web development using node.js express,software engineer,21.0,No,No,12.0,No,2.0,Yes,0,...,0,0,0,0,0,0,0,0,2.16,0
2,web development using python django,nodejs developer,22.0,Yes,No,9.0,Yes,4.0,Yes,0,...,0,0,0,0,0,0,0,0,2.4,0
3,mern stack,software developer,23.0,No,No,11.0,Yes,7.0,Yes,0,...,0,0,0,0,0,0,0,0,3.55,2
4,web development using python django,software engineer,22.0,No,No,10.0,No,4.0,Yes,0,...,0,0,0,0,0,0,0,0,2.4,2


In [123]:
print(pre_data.columns.values)

['domain' 'designation' 'age' 'cs_bg' 'experience' 'avg_time' 'holidays'
 'week_back' 'graduated' 'flutter' 'angular' 'python' 'express' 'firebase'
 'aws' 'sql' 'javascript' 'jwt' 'node' 'html' 'ajax' 'hasura'
 'communication' 'git' 'jest' 'mysql' 'react' 'graphql' 'mongodb' 'cnn'
 'postman' 'nginx' 'dart' 'vim' 'django' 'mui' 'typescript' 'opencv'
 'jquery' 'next' 'linux' 'sas' 'vercel' 'redux' 'api' 'bootstrap'
 'mongoose' 'tailwind' 'docker' 'salary' 'degree_encoded'
 'domain_blockchain' 'domain_cybersecurity' 'domain_data science'
 'domain_devops' 'domain_frontend development using react'
 'domain_game development using unity' 'domain_golang'
 'domain_machine learning' 'domain_mean stack' 'domain_mern stack'
 'domain_mobile development using android kotlin'
 'domain_mobile development using flutter'
 'domain_mobile development using swift'
 'domain_web development using node.js express'
 'domain_web development using python django'
 'domain_web development using python django + ang

In [124]:
# Renaming the columns
cols = {'domain_data science':'domain_data_science', 'domain_frontend development using react':'domain_frontend_react', 'domain_game development using unity': 'game_developemnt',
       'domain_machine learning': 'domain_machine_learning', 'domain_mean stack':'domain_mean', 'domain_mern stack': 'domain_mern', 'domain_mobile development using android kotlin':'domain_mob_dev_kotlin',
       'domain_mobile development using flutter':'domain_mobile_development_using_flutter', 'domain_mobile development using swift': 'domain_mob_dev_swift',
       'domain_web development using node.js express':'domain_webdev_nodejs', 'domain_web development using python django':'domain_webdev_django',
       'domain_web development using python django + angular':'domain_webdev_django_angular', 'domain_web development using python django + react':'domain_webdev_django_react'}

pre_data.rename(columns=cols, inplace=True)
pre_data.columns.values

array(['domain', 'designation', 'age', 'cs_bg', 'experience', 'avg_time',
       'holidays', 'week_back', 'graduated', 'flutter', 'angular',
       'python', 'express', 'firebase', 'aws', 'sql', 'javascript', 'jwt',
       'node', 'html', 'ajax', 'hasura', 'communication', 'git', 'jest',
       'mysql', 'react', 'graphql', 'mongodb', 'cnn', 'postman', 'nginx',
       'dart', 'vim', 'django', 'mui', 'typescript', 'opencv', 'jquery',
       'next', 'linux', 'sas', 'vercel', 'redux', 'api', 'bootstrap',
       'mongoose', 'tailwind', 'docker', 'salary', 'degree_encoded',
       'domain_blockchain', 'domain_cybersecurity', 'domain_data_science',
       'domain_devops', 'domain_frontend_react', 'game_developemnt',
       'domain_golang', 'domain_machine_learning', 'domain_mean',
       'domain_mern', 'domain_mob_dev_kotlin',
       'domain_mobile_development_using_flutter', 'domain_mob_dev_swift',
       'domain_webdev_nodejs', 'domain_webdev_django',
       'domain_webdev_django_angular', 

In [125]:
# Dropping the domain column
pre_data = pre_data.drop('domain', axis=1)

3. Applying **Label Encoding** to `designation` as it has 137 unique values

In [126]:
pre_data.designation.nunique()

137

In [127]:
# Apply label encoding to 'designation' column
pre_data['designation_encoded'] = label_encoder.fit_transform(pre_data['designation'])
pre_data.head()

Unnamed: 0,designation,age,cs_bg,experience,avg_time,holidays,week_back,graduated,flutter,angular,...,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary,degree_encoded,designation_encoded
0,software engineer,22.0,No,No,9.0,No,0.0,Yes,0,1,...,0,0,0,0,0,0,0,3.0,2,124
1,software engineer,21.0,No,No,12.0,No,2.0,Yes,0,1,...,0,0,0,0,0,0,0,2.16,0,124
2,nodejs developer,22.0,Yes,No,9.0,Yes,4.0,Yes,0,1,...,0,0,0,0,0,0,0,2.4,0,95
3,software developer,23.0,No,No,11.0,Yes,7.0,Yes,0,1,...,0,0,0,0,0,0,0,3.55,2,116
4,software engineer,22.0,No,No,10.0,No,4.0,Yes,0,1,...,0,0,0,0,0,0,0,2.4,2,124


In [128]:
print(pre_data.columns.values)

['designation' 'age' 'cs_bg' 'experience' 'avg_time' 'holidays'
 'week_back' 'graduated' 'flutter' 'angular' 'python' 'express' 'firebase'
 'aws' 'sql' 'javascript' 'jwt' 'node' 'html' 'ajax' 'hasura'
 'communication' 'git' 'jest' 'mysql' 'react' 'graphql' 'mongodb' 'cnn'
 'postman' 'nginx' 'dart' 'vim' 'django' 'mui' 'typescript' 'opencv'
 'jquery' 'next' 'linux' 'sas' 'vercel' 'redux' 'api' 'bootstrap'
 'mongoose' 'tailwind' 'docker' 'salary' 'degree_encoded'
 'domain_blockchain' 'domain_cybersecurity' 'domain_data_science'
 'domain_devops' 'domain_frontend_react' 'game_developemnt'
 'domain_golang' 'domain_machine_learning' 'domain_mean' 'domain_mern'
 'domain_mob_dev_kotlin' 'domain_mobile_development_using_flutter'
 'domain_mob_dev_swift' 'domain_webdev_nodejs' 'domain_webdev_django'
 'domain_webdev_django_angular' 'domain_webdev_django_react' 'cs_bg_No'
 'cs_bg_Yes' 'experience_Bca dropout ' 'experience_No'
 'experience_Yes [ Non IT industry ]' 'holidays_No' 'holidays_Yes'
 'grad

##### Legend for designaton_encoded column

In [129]:
# Creating a dict for legend for designation_encoded column
designation_mapping = dict(pre_data[['designation_encoded', 'designation']].drop_duplicates().values)

for i in designation_mapping:
    print(f'Numerical: {i} ---- Designation: {designation_mapping[i]}')

Numerical: 124 ---- Designation: software engineer
Numerical: 95 ---- Designation: nodejs developer
Numerical: 116 ---- Designation: software developer
Numerical: 34 ---- Designation: django developer
Numerical: 6 ---- Designation: assistant manager (it)
Numerical: 1 ---- Designation: android developer
Numerical: 80 ---- Designation: junior software engineer
Numerical: 47 ---- Designation: full stack developer
Numerical: 74 ---- Designation: junior nodejs developer
Numerical: 103 ---- Designation: python developer
Numerical: 50 ---- Designation: fullstack developer
Numerical: 58 ---- Designation: junior android developer
Numerical: 94 ---- Designation: node js trainee
Numerical: 69 ---- Designation: junior fullstack developer
Numerical: 98 ---- Designation: product engineer -ai/ml
Numerical: 68 ---- Designation: junior full stack developer
Numerical: 86 ---- Designation: mern stack developer
Numerical: 97 ---- Designation: product engineer
Numerical: 104 ---- Designation: python develo

In [130]:
# Dropping the designation column
pre_data = pre_data.drop('designation', axis=1)
pre_data.head()

Unnamed: 0,age,cs_bg,experience,avg_time,holidays,week_back,graduated,flutter,angular,python,...,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary,degree_encoded,designation_encoded
0,22.0,No,No,9.0,No,0.0,Yes,0,1,0,...,0,0,0,0,0,0,0,3.0,2,124
1,21.0,No,No,12.0,No,2.0,Yes,0,1,0,...,0,0,0,0,0,0,0,2.16,0,124
2,22.0,Yes,No,9.0,Yes,4.0,Yes,0,1,0,...,0,0,0,0,0,0,0,2.4,0,95
3,23.0,No,No,11.0,Yes,7.0,Yes,0,1,0,...,0,0,0,0,0,0,0,3.55,2,116
4,22.0,No,No,10.0,No,4.0,Yes,0,1,0,...,0,0,0,0,0,0,0,2.4,2,124


In [131]:
# Removing extra columns avoid dummy variable trap
for idx, col in enumerate(pre_data.columns):
    print(f'Index: {idx}, Column: {col}')

Index: 0, Column: age
Index: 1, Column: cs_bg
Index: 2, Column: experience
Index: 3, Column: avg_time
Index: 4, Column: holidays
Index: 5, Column: week_back
Index: 6, Column: graduated
Index: 7, Column: flutter
Index: 8, Column: angular
Index: 9, Column: python
Index: 10, Column: express
Index: 11, Column: firebase
Index: 12, Column: aws
Index: 13, Column: sql
Index: 14, Column: javascript
Index: 15, Column: jwt
Index: 16, Column: node
Index: 17, Column: html
Index: 18, Column: ajax
Index: 19, Column: hasura
Index: 20, Column: communication
Index: 21, Column: git
Index: 22, Column: jest
Index: 23, Column: mysql
Index: 24, Column: react
Index: 25, Column: graphql
Index: 26, Column: mongodb
Index: 27, Column: cnn
Index: 28, Column: postman
Index: 29, Column: nginx
Index: 30, Column: dart
Index: 31, Column: vim
Index: 32, Column: django
Index: 33, Column: mui
Index: 34, Column: typescript
Index: 35, Column: opencv
Index: 36, Column: jquery
Index: 37, Column: next
Index: 38, Column: linux


In [132]:
# Observing the one hot encoded columns
pre_data.iloc[20:50, 66:75]

Unnamed: 0,cs_bg_No,cs_bg_Yes,experience_Bca dropout,experience_No,experience_Yes [ Non IT industry ],holidays_No,holidays_Yes,graduated_No,graduated_Yes
20,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
21,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
22,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
23,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
24,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
25,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
26,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
27,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
28,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
29,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [133]:
pre_data.iloc[20:50, :7]

Unnamed: 0,age,cs_bg,experience,avg_time,holidays,week_back,graduated
20,22.0,No,Yes [ Non IT industry ],16.0,Yes,4.0,Yes
21,22.0,No,No,15.0,Yes,7.0,Yes
22,25.0,No,Yes [ Non IT industry ],13.0,Yes,0.0,No
23,22.0,Yes,No,12.0,Yes,0.0,Yes
24,22.0,Yes,No,12.0,Yes,4.0,Yes
25,22.0,No,No,12.0,No,7.0,Yes
26,21.0,No,No,12.0,No,4.0,Yes
27,24.0,No,No,12.0,No,3.0,Yes
28,23.0,No,No,12.0,No,1.0,No
29,21.0,No,No,10.0,No,5.0,No


In [134]:
pre_data[pre_data['experience'] == 'Bca dropout ']

Unnamed: 0,age,cs_bg,experience,avg_time,holidays,week_back,graduated,flutter,angular,python,...,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary,degree_encoded,designation_encoded
467,19.0,No,Bca dropout,9.0,No,0.0,Yes,0,0,0,...,0,0,1,0,0,0,0,2.4,2,50
600,19.0,No,Bca dropout,12.0,No,0.0,No,0,0,0,...,0,0,0,0,0,1,0,3.0,2,132


In [135]:
pre_data.iloc[600:601, 66:75]

Unnamed: 0,cs_bg_No,cs_bg_Yes,experience_Bca dropout,experience_No,experience_Yes [ Non IT industry ],holidays_No,holidays_Yes,graduated_No,graduated_Yes
600,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


### Legends
    1. cs_bg
        1 - Yes
        0 - No
        
    2. experience
          0 0 1 - Yes [ Non IT industry ]
          0 1 0 - No
          1 0 0 - Bca dropout
          
    3. holidays
          1 - Yes
          0 - No
          
     4. graduated
        1 - Yes
        0 - No

In [136]:
# Dropping the unnecessary columns created for avoiding dummy variable trap
cols_to_drop = ['cs_bg', 'experience', 'holidays', 'graduated', 'cs_bg_No', 'experience_Bca dropout ', 'holidays_No', 'graduated_No']
pre_data = pre_data.drop(cols_to_drop, axis=1)
pre_data.head()

Unnamed: 0,age,avg_time,week_back,flutter,angular,python,express,firebase,aws,sql,...,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary,degree_encoded,designation_encoded
0,22.0,9.0,0.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,3.0,2,124
1,21.0,12.0,2.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,2.16,0,124
2,22.0,9.0,4.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,2.4,0,95
3,23.0,11.0,7.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,3.55,2,116
4,22.0,10.0,4.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,2.4,2,124


##### Moving the salary column to the last

In [151]:
# Moving the `salary` column to last
# column_to_move = pre_data.pop("salary")

# column_to_move = column_to_move.ilco[:, 1:]

# insert column with insert(location, column_name, column_value)

pre_data.insert(110, "salary", column_to_move)

pre_data.head()

Unnamed: 0,age,avg_time,week_back,flutter,angular,python,express,firebase,aws,sql,...,vercel,redux,api,bootstrap,mongoose,tailwind,docker,degree_encoded,designation_encoded,salary
0,22.0,9.0,0.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,2,124,3.0
1,21.0,12.0,2.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,124,2.16
2,22.0,9.0,4.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,95,2.4
3,23.0,11.0,7.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,2,116,3.55
4,22.0,10.0,4.0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,2,124,2.4


### Saving the preprocessed dataset

pre_data.to_csv('encoded_dataset.csv')