# Data Wrangling

In [1]:
# Mount Google Drive into the Colab runtime so files under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [3]:
# Reading the dataset from the mounted Drive folder
path = '/content/drive/MyDrive/Colab Notebooks/worth/data/raw_data.csv'
data = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows
data.head(5)

Unnamed: 0,Name,Domain,Batch,Placed Company,Designation,CTC (LPA),age,domain,current_job,degree,cs_bg,experience,avg_time,holidays,week_back,city,skills,graduated,toi,salary
0,Fais M,Web Development Using Python Django,BCK01,VNC Digital Services,Software Engineer,3.00,,,,,,,,,,,,,,
1,Hashim Rasheed,Web Development Using Node.js Express,BCK01,Emstell Technology Consulting,Software Engineer,2.16,,,,,,,,,,,,,,
2,Mohammed Arshad,Web Development Using Python Django,BCK01,Creative Panda,Node JS Developer,2.40,,,,,,,,,,,,,,
3,Muhammed Musthafa P,MERN Stack,BCK01,-,Software Developer,-,,,,,,,,,,,,,,
4,Muhammed Shafeerali,Web Development Using Python Django,BCK01,ActionFi,Software Engineer,2.40,,,,,,,,,,,,,,


In [4]:
# Dimensions of the raw frame as (rows, columns)
data.shape

(670, 20)

#### Columns

In [5]:
# Show the raw column labels as a plain Python list
print(data.columns.tolist())

['Name', 'Domain', 'Batch', 'Placed Company', 'Designation', 'CTC (LPA)', 'age', 'domain', 'current_job', 'degree', 'cs_bg', 'experience', 'avg_time', 'holidays', 'week_back', 'city', 'skills', 'graduated', 'toi', 'salary']


In [6]:
# Summarise dtypes and non-null counts for every raw column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 670 entries, 0 to 669
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            670 non-null    object 
 1   Domain          670 non-null    object 
 2   Batch           670 non-null    object 
 3   Placed Company  670 non-null    object 
 4   Designation     670 non-null    object 
 5   CTC (LPA)       670 non-null    object 
 6   age             26 non-null     float64
 7   domain          26 non-null     object 
 8   current_job     26 non-null     object 
 9   degree          26 non-null     object 
 10  cs_bg           26 non-null     object 
 11  experience      26 non-null     object 
 12  avg_time        26 non-null     float64
 13  holidays        26 non-null     object 
 14  week_back       26 non-null     float64
 15  city            26 non-null     object 
 16  skills          26 non-null     object 
 17  graduated       26 non-null     obj

The `Null` values come from the office_data records, as they don't contain answers to the Google form questions

In [7]:
# Rows with a `salary` value, shown next to `CTC (LPA)` for comparison
compare_cols = ['Name', 'salary', 'CTC (LPA)']
mask = ~data['salary'].isna()
data.loc[mask, compare_cols]

Unnamed: 0,Name,salary,CTC (LPA)
16,Asharudheen,2.6,3.0
61,Rahul K,4.2,3.6
79,Suhail P A,4.7,4.65
81,Afhaam K,4.0,4.8
96,Sujin S R,5.5,4.8
107,Sirajudheen,9.0,4.2
115,Muhammed Aslam,4.8,4.2
238,Shibu A,2.4,4.0
248,Sudheesh M,4.2,4.8
268,Muhammed Niyas MT,6.0,3.6


In the combined records there is a significant difference between the salary from office_data and the salary from the Google form data
- Using the salary from office data seems to be the better option

### Removing unnecessary columns

In [8]:
# Columns that are not carried forward into the cleaned dataset
cols_to_drop = ['Name', 'current_job', 'salary', 'toi', 'domain']
data = data.drop(columns=cols_to_drop)
data.head()

Unnamed: 0,Domain,Batch,Placed Company,Designation,CTC (LPA),age,degree,cs_bg,experience,avg_time,holidays,week_back,city,skills,graduated
0,Web Development Using Python Django,BCK01,VNC Digital Services,Software Engineer,3.00,,,,,,,,,,
1,Web Development Using Node.js Express,BCK01,Emstell Technology Consulting,Software Engineer,2.16,,,,,,,,,,
2,Web Development Using Python Django,BCK01,Creative Panda,Node JS Developer,2.40,,,,,,,,,,
3,MERN Stack,BCK01,-,Software Developer,-,,,,,,,,,,
4,Web Development Using Python Django,BCK01,ActionFi,Software Engineer,2.40,,,,,,,,,,


- Removed `Name` as it is not relevant to the problem statement
- Dropped `current_job`, as `Designation` from *Office data* seems more reliable
- Dropped `domain` (Google form data), as `Domain` from *Office data* is kept
- Chose `CTC (LPA)` from *Office data* over `salary` from the Google form data
- Dropped the `toi` column as it contains only 2 records with a non-null value

### Changing column names

In [9]:
# Column labels remaining after the drop
print(data.columns.tolist())

['Domain', 'Batch', 'Placed Company', 'Designation', 'CTC (LPA)', 'age', 'degree', 'cs_bg', 'experience', 'avg_time', 'holidays', 'week_back', 'city', 'skills', 'graduated']


In [10]:
# Rename only the office-data headers; the remaining columns already use
# snake_case. An explicit old -> new mapping is used instead of assigning a
# positional list to `data.columns`, so a change in column order upstream
# cannot silently attach a label to the wrong column.
rename_map = {
    'Domain': 'domain',
    'Batch': 'batch',
    'Placed Company': 'placed_company',
    'Designation': 'designation',
    'CTC (LPA)': 'salary',
}
data = data.rename(columns=rename_map)

# Expected final layout; fail loudly if the frame does not match it.
cols = ['domain', 'batch', 'placed_company', 'designation', 'salary', 'age', 'degree', 'cs_bg',
        'experience', 'avg_time', 'holidays', 'week_back', 'city', 'skills', 'graduated']
assert list(data.columns) == cols, f"unexpected columns: {list(data.columns)}"

data.head()

Unnamed: 0,domain,batch,placed_company,designation,salary,age,degree,cs_bg,experience,avg_time,holidays,week_back,city,skills,graduated
0,Web Development Using Python Django,BCK01,VNC Digital Services,Software Engineer,3.00,,,,,,,,,,
1,Web Development Using Node.js Express,BCK01,Emstell Technology Consulting,Software Engineer,2.16,,,,,,,,,,
2,Web Development Using Python Django,BCK01,Creative Panda,Node JS Developer,2.40,,,,,,,,,,
3,MERN Stack,BCK01,-,Software Developer,-,,,,,,,,,,
4,Web Development Using Python Django,BCK01,ActionFi,Software Engineer,2.40,,,,,,,,,,


### Checking for null values

In [11]:
# Number of missing values per column
data.isna().sum()

domain              0
batch               0
placed_company      0
designation         0
salary              0
age               644
degree            644
cs_bg             644
experience        644
avg_time          644
holidays          644
week_back         644
city              644
skills            644
graduated         644
dtype: int64

Null values are in the records added from office data, which don't have answers to the questions from the Google survey

### Imputing missing values

In [15]:
# Inspect dtypes and non-null counts of the renamed frame before imputing
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 670 entries, 0 to 669
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   domain          670 non-null    object 
 1   batch           670 non-null    object 
 2   placed_company  670 non-null    object 
 3   designation     670 non-null    object 
 4   salary          670 non-null    object 
 5   age             26 non-null     float64
 6   degree          26 non-null     object 
 7   cs_bg           26 non-null     object 
 8   experience      26 non-null     object 
 9   avg_time        26 non-null     float64
 10  holidays        26 non-null     object 
 11  week_back       26 non-null     float64
 12  city            26 non-null     object 
 13  skills          26 non-null     object 
 14  graduated       26 non-null     object 
dtypes: float64(3), object(12)
memory usage: 78.6+ KB


In [28]:
# `salary` is still object dtype here, so the "-" placeholder sorts to the top
data['salary'].sort_values()

3         -
25     1.80
420    1.80
418    1.80
14     1.80
       ... 
134    8.60
443    8.70
665    9.35
559    9.40
560    9.40
Name: salary, Length: 670, dtype: object

In [33]:
# Sort company names to look for placeholder values such as "-"
data['placed_company'].sort_values()

53                      Lanware Solutions
54           SciWiz Innovations Pvt. Ltd.
294          SciWiz Innovations Pvt. Ltd.
3                                       -
612    1ClickTech Global Services Pvt Ltd
                      ...                
322                iKure Techsoft Pvt Ltd
65                  iTday India Pvt. Ltd.
521                 iTday India Pvt. Ltd.
50                        tekSystems (hp)
60                        tekSystems (hp)
Name: placed_company, Length: 670, dtype: object

In [34]:
# Replacing the "-" placeholder with null values.
# `np.nan` is used rather than the `np.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
data['placed_company'] = data['placed_company'].replace('-', np.nan)
data['salary'] = data['salary'].replace('-', np.nan)

# With the placeholder gone, the salary strings can be cast to float
data['salary'] = data['salary'].astype("float")

In [35]:
# Converting datatypes of object to category for imputation.
# The cast is applied to a copy, so `data` keeps its original dtypes.
list_str_obj_cols = data.columns[data.dtypes == "object"].tolist()
df = data.copy().astype(dict.fromkeys(list_str_obj_cols, "category"))

In [36]:
# Imputation with miceforest
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top
import miceforest as mf

# Create the imputation kernel on the category-encoded copy `df`.
# `random_state` is fixed so every run starts from the same seed.
kernel = mf.ImputationKernel(
  data=df,
  save_all_iterations=True,
  random_state=1343
)

# Run the MICE algorithm for 3 iterations (the log below shows a single dataset, "Dataset 0")
kernel.mice(3,verbose=True)
#print(kernel)
# Pull the imputed result for dataset 0; inplace=False, so presumably `df` itself is left unmodified (confirm against miceforest docs)
completed_dataset = kernel.complete_data(dataset=0, inplace=False)

  warn(


Initialized logger with name mice 1-3
Dataset 0
1  | placed_company | salary | age | degree | cs_bg | experience | avg_time | holidays | week_back | city | skills | graduated
2  | placed_company | salary | age | degree | cs_bg | experience | avg_time | holidays | week_back | city | skills | graduated
3  | placed_company | salary | age | degree | cs_bg | experience | avg_time | holidays | week_back | city | skills | graduated


In [37]:
# Preview the first five rows of the imputed frame
completed_dataset.head(5)

Unnamed: 0,domain,batch,placed_company,designation,salary,age,degree,cs_bg,experience,avg_time,holidays,week_back,city,skills,graduated
0,Web Development Using Python Django,BCK01,VNC Digital Services,Software Engineer,3.0,22.0,No Degree,No,No,9.0,No,0.0,Bangalore,"Angular, Express, Node, MongoDB, MySQL, Typesc...",Yes
1,Web Development Using Node.js Express,BCK01,Emstell Technology Consulting,Software Engineer,2.16,21.0,Degree,No,No,12.0,No,2.0,WFH,"Angular, Express, Node, MongoDB, MySQL, Typesc...",Yes
2,Web Development Using Python Django,BCK01,Creative Panda,Node JS Developer,2.4,22.0,Degree,Yes,No,9.0,Yes,4.0,Bangalore,"Angular, Express, Node, MongoDB, MySQL, Typesc...",Yes
3,MERN Stack,BCK01,NORQ Technologies,Software Developer,3.55,23.0,No Degree,No,No,11.0,Yes,7.0,Chennai,"Angular, Express, Node, MongoDB, MySQL, Typesc...",Yes
4,Web Development Using Python Django,BCK01,ActionFi,Software Engineer,2.4,22.0,No Degree,No,No,10.0,No,4.0,Bangalore,"Angular, Express, Node, MongoDB, MySQL, Typesc...",Yes


In [38]:
# Missing values per column after imputation
completed_dataset.isna().sum()

domain            0
batch             0
placed_company    0
designation       0
salary            0
age               0
degree            0
cs_bg             0
experience        0
avg_time          0
holidays          0
week_back         0
city              0
skills            0
graduated         0
dtype: int64

In [39]:
# Inspect dtypes and non-null counts of the imputed frame
completed_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 670 entries, 0 to 669
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   domain          670 non-null    category
 1   batch           670 non-null    category
 2   placed_company  670 non-null    category
 3   designation     670 non-null    category
 4   salary          670 non-null    float64 
 5   age             670 non-null    float64 
 6   degree          670 non-null    category
 7   cs_bg           670 non-null    category
 8   experience      670 non-null    category
 9   avg_time        670 non-null    float64 
 10  holidays        670 non-null    category
 11  week_back       670 non-null    float64 
 12  city            670 non-null    category
 13  skills          670 non-null    category
 14  graduated       670 non-null    category
dtypes: category(11), float64(4)
memory usage: 64.5 KB


In [40]:
# Moving the `salary` column to last
column_to_move = completed_dataset.pop("salary")

# insert(location, column_name, column_value): after the pop, len(columns) is
# the position just past the final column, so this appends `salary` at the end
# without hard-coding the current column count.
completed_dataset.insert(len(completed_dataset.columns), "salary", column_to_move)

In [41]:
# Inspect the column order after moving `salary`
completed_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 670 entries, 0 to 669
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   domain          670 non-null    category
 1   batch           670 non-null    category
 2   placed_company  670 non-null    category
 3   designation     670 non-null    category
 4   age             670 non-null    float64 
 5   degree          670 non-null    category
 6   cs_bg           670 non-null    category
 7   experience      670 non-null    category
 8   avg_time        670 non-null    float64 
 9   holidays        670 non-null    category
 10  week_back       670 non-null    float64 
 11  city            670 non-null    category
 12  skills          670 non-null    category
 13  graduated       670 non-null    category
 14  salary          670 non-null    float64 
dtypes: category(11), float64(4)
memory usage: 64.5 KB


In [42]:
# Saving the dataframe to csv
# NOTE: the export is currently disabled; uncomment the next line to write the imputed data to disk
# completed_dataset.to_csv("imputed_dataset.csv", index=False)