In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot

### Import Data

In [2]:
raw = pd.read_csv('./Capmpingi_employee_reviews.csv')

### Explore Data

In [3]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26993 entries, 0 to 26992
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Title                25912 non-null  object 
 1   Place                24597 non-null  object 
 2   Job_type             11556 non-null  object 
 3   Department           22083 non-null  object 
 4   Date                 25915 non-null  object 
 5   Overall_rating       25898 non-null  float64
 6   work_life_balance    26977 non-null  float64
 7   skill_development    26976 non-null  float64
 8   salary_and_benefits  26947 non-null  float64
 9   job_security         26943 non-null  float64
 10  career_growth        26931 non-null  float64
 11  work_satisfaction    26909 non-null  float64
 12  Likes                23864 non-null  object 
 13  Dislikes             22966 non-null  object 
dtypes: float64(7), object(7)
memory usage: 2.9+ MB


In [4]:
raw.shape

(26993, 14)

In [5]:
raw.head(2)

Unnamed: 0,Title,Place,Job_type,Department,Date,Overall_rating,work_life_balance,skill_development,salary_and_benefits,job_security,career_growth,work_satisfaction,Likes,Dislikes
0,Senior Consultant,Pune,Full Time,General Insurance Department,8 Sep 2023,4.0,4.0,3.0,3.0,4.0,4.0,4.0,Deserved candidates are promoted promptly.\nUn...,With designation promotions good salary increm...
1,Senior Software Engineer,"Kolkata, West Bengal",Full Time,Software Development Department,7 Sep 2023,3.0,4.0,4.0,3.0,4.0,4.0,3.0,You got lot of learning platform and monthly l...,You will get fully not tech project.\nThere is...


In [6]:
raw.describe()

Unnamed: 0,Overall_rating,work_life_balance,skill_development,salary_and_benefits,job_security,career_growth,work_satisfaction
count,25898.0,26977.0,26976.0,26947.0,26943.0,26931.0,26909.0
mean,3.707236,3.662379,3.636084,3.037444,3.782541,3.628458,3.171801
std,1.25718,1.287039,1.270036,1.337186,1.269125,1.329579,1.349958
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,3.0,3.0,3.0,2.0,3.0,3.0,2.0
50%,4.0,4.0,4.0,3.0,4.0,4.0,3.0
75%,5.0,5.0,5.0,4.0,5.0,5.0,4.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0


### Data cleaning

In [7]:
# Drop unwanted columns
# raw.drop(['Place','Likes','Dislikes'],axis=1,inplace=True)
# raw.head(2)

### Data wrangling

#### Rename columns

In [8]:
raw.rename(columns=str.lower,inplace=True)

#### Replace nans with not responded

In [9]:
raw.isnull().count()

title                  26993
place                  26993
job_type               26993
department             26993
date                   26993
overall_rating         26993
work_life_balance      26993
skill_development      26993
salary_and_benefits    26993
job_security           26993
career_growth          26993
work_satisfaction      26993
likes                  26993
dislikes               26993
dtype: int64

In [10]:
raw.replace(np.nan,'Unknown',inplace=True)

# Remove clolumns that has both title and departement nan values
to_be_deleted = (raw['department'] == 'Unknown') & (raw['title'] == 'Unknown')
indexes_to_be_deleted = raw.loc[to_be_deleted].index

raw.drop(axis=0,index=indexes_to_be_deleted,inplace=True)

raw.loc[to_be_deleted]

Unnamed: 0,title,place,job_type,department,date,overall_rating,work_life_balance,skill_development,salary_and_benefits,job_security,career_growth,work_satisfaction,likes,dislikes


#### Convert rating to text in place of number rating

In [11]:
raw['overall_rating'].unique()

array([4.0, 3.0, 1.0, 5.0, 2.0, 'Unknown'], dtype=object)

In [12]:
rating = {1:'1-Too Bad',2:'2-Bad',3:'3-Normal',4:'4-Good',5:'5-Very Good'}

In [13]:
raw.replace(rating,inplace=True)
raw.head(2)

Unnamed: 0,title,place,job_type,department,date,overall_rating,work_life_balance,skill_development,salary_and_benefits,job_security,career_growth,work_satisfaction,likes,dislikes
0,Senior Consultant,Pune,Full Time,General Insurance Department,8 Sep 2023,4-Good,4-Good,3-Normal,3-Normal,4-Good,4-Good,4-Good,Deserved candidates are promoted promptly.\nUn...,With designation promotions good salary increm...
1,Senior Software Engineer,"Kolkata, West Bengal",Full Time,Software Development Department,7 Sep 2023,3-Normal,4-Good,4-Good,3-Normal,4-Good,4-Good,3-Normal,You got lot of learning platform and monthly l...,You will get fully not tech project.\nThere is...


### Data analysis

In [14]:
grouping = raw.groupby(by=['title'])
# grouping.get_group('title')
grouping['job_type'].value_counts()

title                          job_type 
'sr. Scrum Master'             Unknown      1
(google Maping                 Full Time    1
-Senior Consultant             Full Time    1
.net Architect                 Full Time    1
.net Developer                 Full Time    1
                                           ..
Worksoft Consultant            Unknown      1
Wso2 Developer                 Full Time    1
Xamarin Architect , Developer  Unknown      1
Xyz                            Unknown      1
ZENDESK ADMINISTRATOR          Unknown      1
Name: count, Length: 5009, dtype: int64

In [15]:
# pyplot.bar(raw['department'],raw['department'])
# pyplot.show()

### Save the output

In [16]:
# raw.to_excel('Data_salaries_output.xlxs',sheet_name="salaries")
# # 
# with pd.ExcelWriter('Data_salaries_output.xlxs',engine='xlsxwriter') as excel_writer:
#     raw.to_excel(excel_writer, sheet_name='salaries', index=False)
raw.to_csv('Capmpingi_employee_reviews_output.csv')