# Data Wrangling

In [1]:
# Importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Reading the dataset
path = 'data/combined_data/raw_data.csv'

data = pd.read_csv(path)
data.head()

Unnamed: 0,Name,Domain,Batch,Placed Company,Designation,CTC (LPA),age,domain,current_job,degree,cs_bg,experience,avg_time,holidays,week_back,city,skills,graduated,toi,salary
0,Fais M,Web Development Using Python Django,BCK01,VNC Digital Services,Software Engineer,3.00,,,,,,,,,,,,,,
1,Hashim Rasheed,Web Development Using Node.js Express,BCK01,Emstell Technology Consulting,Software Engineer,2.16,,,,,,,,,,,,,,
2,Mohammed Arshad,Web Development Using Python Django,BCK01,Creative Panda,Node JS Developer,2.40,,,,,,,,,,,,,,
3,Muhammed Musthafa P,MERN Stack,BCK01,-,Software Developer,-,,,,,,,,,,,,,,
4,Muhammed Shafeerali,Web Development Using Python Django,BCK01,ActionFi,Software Engineer,2.40,,,,,,,,,,,,,,


In [4]:
data.shape

(670, 20)

#### Columns

In [5]:
print(list(data.columns))

['Name', 'Domain', 'Batch', 'Placed Company', 'Designation', 'CTC (LPA)', 'age', 'domain', 'current_job', 'degree', 'cs_bg', 'experience', 'avg_time', 'holidays', 'week_back', 'city', 'skills', 'graduated', 'toi', 'salary']


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 670 entries, 0 to 669
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            670 non-null    object 
 1   Domain          670 non-null    object 
 2   Batch           670 non-null    object 
 3   Placed Company  670 non-null    object 
 4   Designation     670 non-null    object 
 5   CTC (LPA)       670 non-null    object 
 6   age             26 non-null     float64
 7   domain          26 non-null     object 
 8   current_job     26 non-null     object 
 9   degree          26 non-null     object 
 10  cs_bg           26 non-null     object 
 11  experience      26 non-null     object 
 12  avg_time        26 non-null     float64
 13  holidays        26 non-null     object 
 14  week_back       26 non-null     float64
 15  city            26 non-null     object 
 16  skills          26 non-null     object 
 17  graduated       26 non-null     obj

The `Null` values are from the office_data as they doesn't have the questions from google form

In [7]:
mask = data['salary'].notna()

data.loc[mask, ['Name', 'salary', 'CTC (LPA)']]

Unnamed: 0,Name,salary,CTC (LPA)
16,Asharudheen,2.6,3.0
61,Rahul K,4.2,3.6
79,Suhail P A,4.7,4.65
81,Afhaam K,4.0,4.8
96,Sujin S R,5.5,4.8
107,Sirajudheen,9.0,4.2
115,Muhammed Aslam,4.8,4.2
238,Shibu A,2.4,4.0
248,Sudheesh M,4.2,4.8
268,Muhammed Niyas MT,6.0,3.6


In the combined records there is significant difference in salary from office_data and from the goole form data
- Seems using the salary from office data will be the better option

### Removing unecessary columns

In [8]:
data = data.drop(['Name', 'current_job', 'salary', 'toi'], axis=1)
data.head()

Unnamed: 0,Domain,Batch,Placed Company,Designation,CTC (LPA),age,domain,degree,cs_bg,experience,avg_time,holidays,week_back,city,skills,graduated
0,Web Development Using Python Django,BCK01,VNC Digital Services,Software Engineer,3.00,,,,,,,,,,,
1,Web Development Using Node.js Express,BCK01,Emstell Technology Consulting,Software Engineer,2.16,,,,,,,,,,,
2,Web Development Using Python Django,BCK01,Creative Panda,Node JS Developer,2.40,,,,,,,,,,,
3,MERN Stack,BCK01,-,Software Developer,-,,,,,,,,,,,
4,Web Development Using Python Django,BCK01,ActionFi,Software Engineer,2.40,,,,,,,,,,,


- Removed `Name` as it is not relvant to the problem statement
- `Designation` from *Office data* seems more reliable
- Chose `CTC (LPA)` from *Office data* over `salary` google form data
- Dropped `toi` columns as it only contains 2 records with non-null value

### Changing column names

In [9]:
print(list(data.columns))

['Domain', 'Batch', 'Placed Company', 'Designation', 'CTC (LPA)', 'age', 'domain', 'degree', 'cs_bg', 'experience', 'avg_time', 'holidays', 'week_back', 'city', 'skills', 'graduated']


In [10]:
cols = ['domain', 'batch', 'placed_company', 'designation', 'salary', 'age', 'domain', 'degree', 'cs_bg',
        'experience', 'avg_time', 'holidays', 'week_back', 'city', 'skills', 'graduated']

data.columns = cols
data.head()

Unnamed: 0,domain,batch,placed_company,designation,salary,age,domain.1,degree,cs_bg,experience,avg_time,holidays,week_back,city,skills,graduated
0,Web Development Using Python Django,BCK01,VNC Digital Services,Software Engineer,3.00,,,,,,,,,,,
1,Web Development Using Node.js Express,BCK01,Emstell Technology Consulting,Software Engineer,2.16,,,,,,,,,,,
2,Web Development Using Python Django,BCK01,Creative Panda,Node JS Developer,2.40,,,,,,,,,,,
3,MERN Stack,BCK01,-,Software Developer,-,,,,,,,,,,,
4,Web Development Using Python Django,BCK01,ActionFi,Software Engineer,2.40,,,,,,,,,,,


### Checking for null values

In [11]:
data.isnull().sum()

domain              0
batch               0
placed_company      0
designation         0
salary              0
age               644
domain            644
degree            644
cs_bg             644
experience        644
avg_time          644
holidays          644
week_back         644
city              644
skills            644
graduated         644
dtype: int64

Null values are in the records added from office data which doesn't have the answers to questions from google survey

### Imputing missing values

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Step 1: Prepare the data
df = pd.DataFrame({...})  # Your dataframe
categorical_cols = ['domain', 'batch', 'placed_company', 'designation', 'degree', 'cs_bg', 'experience', 'holidays', 'city', 'skills', 'graduated']
numerical_cols = ['age', 'avg_time', 'week_back']

# Step 2: Split the data
df_complete = df.dropna(subset=categorical_cols + numerical_cols)
df_missing = df[df.isnull().any(axis=1)]

# Step 3: Impute missing categorical values with Random Forest
imputer_categorical = RandomForestRegressor(n_estimators=100)
imputer_categorical.fit(df_complete.drop(categorical_cols + numerical_cols, axis=1), df_complete[categorical_cols])

imputed_categorical = imputer_categorical.predict(df_missing.drop(categorical_cols + numerical_cols, axis=1))

# Replace missing categorical values in the original dataframe with the imputed values
df.loc[df.isnull().any(axis=1), categorical_cols] = imputed_categorical

# Step 4: Impute missing numerical values with SimpleImputer (mean strategy)
imputer_numerical = SimpleImputer(strategy='mean')
imputer_numerical.fit(df_complete[numerical_cols])

imputed_numerical = imputer_numerical.transform(df_missing[numerical_cols])

# Replace missing numerical values in the original dataframe with the imputed values
df.loc[df.isnull().any(axis=1), numerical_cols] = imputed_numerical

# Print the dataframe with imputed values
print(df)