### Data Preprocessing - Combined Dataset

In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np

In [13]:
# Reading the dataset
data = pd.read_csv('data/dataset.csv')
data.head()

Unnamed: 0,domain,designation,age,degree,cs_bg,experience,avg_time,holidays,week_back,city,...,linux,sas,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary
0,web development using python django,software engineer,22.0,No Degree,No,No,9.0,No,0.0,bangalore,...,0,0,0,0,0,0,0,0,0,3.0
1,web development using node.js express,software engineer,21.0,Degree,No,No,12.0,No,2.0,wfh,...,0,0,0,0,0,0,0,0,0,2.16
2,web development using python django,nodejs developer,22.0,Degree,Yes,No,9.0,Yes,4.0,bangalore,...,0,0,0,0,0,0,0,0,0,2.4
3,mern stack,software developer,23.0,No Degree,No,No,11.0,Yes,7.0,chennai,...,0,0,0,0,0,0,0,0,0,3.55
4,web development using python django,software engineer,22.0,No Degree,No,No,10.0,No,4.0,bangalore,...,0,0,0,0,0,0,0,0,0,2.4


In [8]:
data.columns

Index(['domain', 'designation', 'age', 'degree', 'cs_bg', 'experience',
       'avg_time', 'holidays', 'week_back', 'city', 'graduated', 'flutter',
       'angular', 'python', 'express', 'firebase', 'aws', 'sql', 'javascript',
       'jwt', 'node', 'html', 'ajax', 'hasura', 'communication', 'git', 'jest',
       'mysql', 'react', 'graphql', 'mongodb', 'cnn', 'postman', 'nginx',
       'dart', 'vim', 'django', 'mui', 'typescript', 'opencv', 'jquery',
       'next', 'linux', 'sas', 'vercel', 'redux', 'api', 'bootstrap',
       'mongoose', 'tailwind', 'docker', 'salary'],
      dtype='object')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 52 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   domain         669 non-null    object 
 1   designation    669 non-null    object 
 2   age            669 non-null    float64
 3   degree         669 non-null    object 
 4   cs_bg          669 non-null    object 
 5   experience     669 non-null    object 
 6   avg_time       669 non-null    float64
 7   holidays       669 non-null    object 
 8   week_back      669 non-null    float64
 9   city           669 non-null    object 
 10  graduated      669 non-null    object 
 11  flutter        669 non-null    int64  
 12  angular        669 non-null    int64  
 13  python         669 non-null    int64  
 14  express        669 non-null    int64  
 15  firebase       669 non-null    int64  
 16  aws            669 non-null    int64  
 17  sql            669 non-null    int64  
 18  javascript

We can drop the column `city` as it has misentries from imputation.

In [14]:
# Dropping the column city
data = data.drop('city', axis=1)
data.head()

Unnamed: 0,domain,designation,age,degree,cs_bg,experience,avg_time,holidays,week_back,graduated,...,linux,sas,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary
0,web development using python django,software engineer,22.0,No Degree,No,No,9.0,No,0.0,Yes,...,0,0,0,0,0,0,0,0,0,3.0
1,web development using node.js express,software engineer,21.0,Degree,No,No,12.0,No,2.0,Yes,...,0,0,0,0,0,0,0,0,0,2.16
2,web development using python django,nodejs developer,22.0,Degree,Yes,No,9.0,Yes,4.0,Yes,...,0,0,0,0,0,0,0,0,0,2.4
3,mern stack,software developer,23.0,No Degree,No,No,11.0,Yes,7.0,Yes,...,0,0,0,0,0,0,0,0,0,3.55
4,web development using python django,software engineer,22.0,No Degree,No,No,10.0,No,4.0,Yes,...,0,0,0,0,0,0,0,0,0,2.4


In [15]:
data.shape

(669, 51)

We have 669 rows and 51 columns

### Encoding Categorical Columns

1. Using **Label encoding** for `degree` column because it has a meaningful ordinal relationship

In [17]:
# Importing label encoder from sklearn
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

In [20]:
pre_data = data.copy()

pre_data['degree_encoded'] = label_encoder.fit_transform(pre_data['degree'])
pre_data.drop('degree', axis=1, inplace=True)
pre_data.head()

Unnamed: 0,domain,designation,age,cs_bg,experience,avg_time,holidays,week_back,graduated,flutter,...,sas,vercel,redux,api,bootstrap,mongoose,tailwind,docker,salary,degree_encoded
0,web development using python django,software engineer,22.0,No,No,9.0,No,0.0,Yes,0,...,0,0,0,0,0,0,0,0,3.0,2
1,web development using node.js express,software engineer,21.0,No,No,12.0,No,2.0,Yes,0,...,0,0,0,0,0,0,0,0,2.16,0
2,web development using python django,nodejs developer,22.0,Yes,No,9.0,Yes,4.0,Yes,0,...,0,0,0,0,0,0,0,0,2.4,0
3,mern stack,software developer,23.0,No,No,11.0,Yes,7.0,Yes,0,...,0,0,0,0,0,0,0,0,3.55,2
4,web development using python django,software engineer,22.0,No,No,10.0,No,4.0,Yes,0,...,0,0,0,0,0,0,0,0,2.4,2


In [21]:
pre_data['degree_encoded'].value_counts()

2    338
0    328
1      3
Name: degree_encoded, dtype: int64

  **Legend - degree_encoded**
  
    2 - No degree

    0 - Degree

    1 - Diploma

2. **One Hot Encoding** for `domain` column as the values has no meaningful order and each category is distinct 

In [23]:
# Apply one-hot encoding to the 'domain' column
df_encoded  = pd.get_dummies(pre_data['domain'])


# Concatenate the encoded columns with the original DataFrame
pre_data = pd.concat([pre_data, df_encoded], axis=1)

# Drop the original 'domain' column
pre_data.drop('domain', axis=1, inplace=True)

pre_data.head()

Unnamed: 0,designation,age,cs_bg,experience,avg_time,holidays,week_back,graduated,flutter,angular,...,machine learning,mean stack,mern stack,mobile development using android kotlin,mobile development using flutter,mobile development using swift,web development using node.js express,web development using python django,web development using python django + angular,web development using python django + react
0,software engineer,22.0,No,No,9.0,No,0.0,Yes,0,1,...,0,0,0,0,0,0,0,1,0,0
1,software engineer,21.0,No,No,12.0,No,2.0,Yes,0,1,...,0,0,0,0,0,0,1,0,0,0
2,nodejs developer,22.0,Yes,No,9.0,Yes,4.0,Yes,0,1,...,0,0,0,0,0,0,0,1,0,0
3,software developer,23.0,No,No,11.0,Yes,7.0,Yes,0,1,...,0,0,1,0,0,0,0,0,0,0
4,software engineer,22.0,No,No,10.0,No,4.0,Yes,0,1,...,0,0,0,0,0,0,0,1,0,0


In [24]:
pre_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 67 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   designation                                    669 non-null    object 
 1   age                                            669 non-null    float64
 2   cs_bg                                          669 non-null    object 
 3   experience                                     669 non-null    object 
 4   avg_time                                       669 non-null    float64
 5   holidays                                       669 non-null    object 
 6   week_back                                      669 non-null    float64
 7   graduated                                      669 non-null    object 
 8   flutter                                        669 non-null    int64  
 9   angular                                        669 non

In [26]:
# Renaming the columns
cols = {'data science':'data_science', 'frontend development using react':'frontend_react', 'game development using unity': 'game_developemnt',
       'machine learning': 'machine_learning', 'mean stack':'mean', 'mern stack': 'mern', 'mobile development using android kotlin':'mob_dev_kotlin',
       'mobile development using android kotlin':'mob_dev_flutter', 'mobile development using swift': 'mob_dev_swift',
       'web development using node.js express':'webdev_nodejs', 'web development using python django':'webdev_django',
       'web development using python django + angular':'webdev_django_angular', 'web development using python django + react':'webdev_django_react'}

pre_data.rename(columns=cols, inplace=True)
pre_data.columns

Index(['designation', 'age', 'cs_bg', 'experience', 'avg_time', 'holidays',
       'week_back', 'graduated', 'flutter', 'angular', 'python', 'express',
       'firebase', 'aws', 'sql', 'javascript', 'jwt', 'node', 'html', 'ajax',
       'hasura', 'communication', 'git', 'jest', 'mysql', 'react', 'graphql',
       'mongodb', 'cnn', 'postman', 'nginx', 'dart', 'vim', 'django', 'mui',
       'typescript', 'opencv', 'jquery', 'next', 'linux', 'sas', 'vercel',
       'redux', 'api', 'bootstrap', 'mongoose', 'tailwind', 'docker', 'salary',
       'degree_encoded', 'blockchain', 'cybersecurity', 'data_science',
       'devops', 'frontend_react', 'game_developemnt', 'golang',
       'machine_learning', 'mean', 'mern', 'mob_dev_flutter',
       'mobile development using flutter', 'mob_dev_swift', 'webdev_nodejs',
       'webdev_django', 'webdev_django_angular', 'webdev_django_react'],
      dtype='object')