In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which also hides deprecation notices raised by the libraries imported above;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the salary dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('salary.csv')

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Preview the last rows of the raw frame.
data.tail()

In [None]:
# (row count, column count) of the raw frame.
data.shape

In [None]:
# Column labels of the raw frame.
data.columns

In [None]:
# Number of rows flagged by duplicated(), i.e. repeats of an earlier row.
data.duplicated().sum()

In [None]:
# Missing-value count for every column of the raw frame.
data.isnull().sum()

In [None]:
# Print the info() summary of the raw frame (dtypes and non-null counts).
data.info()

In [None]:
# Descriptive statistics as produced by describe() with its default settings.
data.describe()

In [None]:
# Number of distinct values in every column.
data.nunique()

In [None]:
# Columns explored as categorical variables in the cells that use `data_cat`.
categorical_columns = [
    'title',
    'gender',
    'Masters_Degree',
    'Bachelors_Degree',
    'Doctorate_Degree',
    'Highschool',
    'Some_College',
    'Race_Asian',
    'Race_White',
    'Race_Two_Or_More',
    'Race_Black',
    'Race_Hispanic',
    'Race',
    'Education',
]
data_cat = data[categorical_columns]

In [None]:
# Print the distinct values of each categorical column, one array per column.
for column_name in data_cat:
    distinct_values = data_cat[column_name].unique()
    print(distinct_values)

In [None]:
# Print the frequency table of each categorical column.
for column_name in data_cat:
    frequencies = data_cat[column_name].value_counts()
    print(frequencies)

In [None]:
# Draw one frequency bar chart per categorical column.
for i in data_cat.columns:
    plt.figure(figsize=(15, 6))
    # Name the column through `x=` instead of passing the Series positionally:
    # recent seaborn releases take `data` as the first positional parameter, so a
    # positional Series combined with `data=` raises
    # "TypeError: got multiple values for argument 'data'".
    sns.countplot(x=i, data=data_cat, palette='hls')
    plt.xticks(rotation=90)
    plt.show()



In [None]:
# Draw one pie chart of category shares per categorical column.
for i in data_cat.columns:
    plt.figure(figsize=(15, 6))
    data_cat[i].value_counts().plot(kind='pie', autopct='%1.1f%%')
    # Label each chart with its column; otherwise the charts in this loop cannot be
    # told apart. (Tick rotation was dropped: it has no visible effect on a pie.)
    plt.title(i)
    plt.show()

In [None]:
# Columns kept for the compensation analysis.
salary_columns = [
    'company',
    'title',
    'totalyearlycompensation',
    'location',
    'yearsofexperience',
    'yearsatcompany',
    'gender',
    'Race',
    'Education',
]
data_salaries = data[salary_columns]

In [None]:
# `data_salaries` is already a DataFrame, so re-wrapping it in pd.DataFrame() changed
# nothing. Take an explicit copy instead so later edits to `data_salaries` are
# guaranteed not to touch `data`.
data_salaries = data_salaries.copy()

In [None]:
# Preview the first rows of the salary subset.
data_salaries.head()

In [None]:
# Preview the last rows of the salary subset.
data_salaries.tail()

In [None]:
# Row count per job title in the salary subset.
data_salaries['title'].value_counts()

In [None]:
# Job titles excluded from the rest of the analysis.
undesired_titles = ['Marketing', 'Mechanical Engineer', 'Sales', 'Recruiter', 'Human Resources']

# Keep the unfiltered frame around under a separate name.
data_salaries_original = data_salaries.copy()

# Vectorised membership test instead of a row-by-row lambda. The trailing .copy()
# makes the result an independent frame, so later writes to `data_salaries` do not
# operate on a slice of the unfiltered data.
data_salaries = data_salaries[~data_salaries['title'].isin(undesired_titles)].copy()

# Show the remaining titles and their row counts.
data_salaries['title'].value_counts()

In [None]:
# (row count, column count) of the salary subset at this point in the notebook.
data_salaries.shape

In [None]:
# Print the info() summary of the salary subset (dtypes and non-null counts).
data_salaries.info()

In [None]:
# Missing-value count for every column of the salary subset.
data_salaries.isnull().sum()

In [None]:
# Tabulate the per-column null counts, then display only the columns that
# actually contain nulls, largest count first.
null_counts = data_salaries.isnull().sum()
null_data = null_counts.to_frame(name='Count of Nulls').rename_axis('Column Name')
has_nulls = null_data['Count of Nulls'].gt(0)
null_data.loc[has_nulls].sort_values(by='Count of Nulls', ascending=False)

In [None]:
# Replace missing values in the listed text columns with the placeholder string 'NA'.
# Reassigning the result (rather than inplace=True) avoids mutating a frame that may
# be a slice of another one and keeps the cell safe to re-run.
data_salaries = data_salaries.fillna(
    {'company': 'NA', 'gender': 'NA', 'Race': 'NA', 'Education': 'NA'}
)

In [None]:
# Missing-value count for every column of the salary subset at this point.
data_salaries.isnull().sum()

In [None]:
plt.style.use('fivethirtyeight')

# Mean total yearly compensation per job title, highest first.
# Only the grouping key and the numeric column are selected: averaging the text
# column 'company' raises TypeError on pandas >= 2.0.
job_titles = (
    data_salaries[['title', 'totalyearlycompensation']]
    .groupby('title')
    .mean()
    .round(2)
    .sort_values('totalyearlycompensation', ascending=False)
)

# DataFrame.plot opens its own figure unless given an Axes, which previously left the
# plt.figure() canvas empty and ignored its size. Draw on an explicit Axes instead.
fig, ax = plt.subplots(figsize=(15, 6))
job_titles.plot.bar(ax=ax)
ax.set_title('Highest Average Annual Compensation by Job Title', size=17)
# Vertical bars: categories run along x and values along y (labels were swapped).
ax.set_xlabel('Job Title', size=15)
ax.set_ylabel('Average Annual Compensation ($)', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# Number of rows recorded for each job title.
top_jobs = data_salaries['title'].value_counts()

fig, ax = plt.subplots(figsize=(15, 6))
top_jobs.plot.bar(ax=ax)
ax.set_title("Number of Workers in Each Job Title", size=17)
# Vertical bars: titles run along x and counts along y (labels were swapped).
ax.set_xlabel('Job Title', size=15)
ax.set_ylabel('Number of Workers', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(15, 6))
# value_counts() sorts descending, so the first ten entries are the ten most frequent
# companies. The name stays bound to the Axes returned by the plot call, as before.
companies_with_most_tech_workers = (
    data_salaries['company'].value_counts().head(10).plot.bar(ax=ax)
)
ax.set_title('Top 10 companies with the highest number of tech workers', size=17)
# Vertical bars: companies run along x and counts along y (labels were swapped).
ax.set_xlabel('Company', size=15)
ax.set_ylabel('Number of Workers', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# Draw on an explicit Axes: DataFrame.plot otherwise opens a second figure and leaves
# the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(15, 6))

# Highest single compensation recorded per company, top 20 companies.
# 'title' is left out of the aggregation: its per-company maximum is an alphabetical
# maximum unrelated to the top-paid row, and it was never plotted.
# The name stays bound to the Axes returned by the plot call, as before.
highest_paying_companies = (
    data_salaries[['company', 'totalyearlycompensation']]
    .groupby('company')
    .max()
    .sort_values('totalyearlycompensation', ascending=False)
    .head(20)
    .plot.bar(ax=ax)
)
# The chart shows 20 companies, so the title says 20 (it previously said "Top 10");
# the title size now matches the neighbouring charts.
ax.set_title('Top 20 Companies that paid the highest compensation', size=17)
# Vertical bars: companies run along x and compensation along y (labels were swapped).
ax.set_xlabel('Company', size=15)
ax.set_ylabel('Compensation in Millions ($)', size=15)
plt.show()

In [None]:
# Map the stray gender value 'Title: Senior Software Engineer' to the 'NA' placeholder.
# Assign the result back to the column: calling replace(inplace=True) on the selected
# column is chained assignment, which does not update the frame under pandas
# copy-on-write.
data_salaries['gender'] = data_salaries['gender'].replace('Title: Senior Software Engineer', 'NA')

In [None]:
plt.style.use('fivethirtyeight')

# Mean total yearly compensation per gender value.
pay_by_gender = (
    data_salaries[['totalyearlycompensation', 'gender']]
    .groupby('gender')
    .mean()
    .round(2)
)

# Draw on an explicit Axes: DataFrame.plot otherwise opens a second figure and leaves
# the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(15, 6))
(
    pay_by_gender
    .sort_values('totalyearlycompensation', ascending=False)
    .head(10)
    .plot.bar(ax=ax, legend=False)
)
ax.set_title('Compensation by Gender', size=20)
# Vertical bars: gender runs along x and compensation along y (labels were swapped).
ax.set_xlabel('Gender', size=15)
ax.set_ylabel('Compensation ($)', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# Rows whose gender is 'Male', then the four with the largest total compensation.
male_salaries = data_salaries[data_salaries.gender == 'Male'].copy()
top4_male_salaries = male_salaries.nlargest(4, 'totalyearlycompensation')

# Draw on an explicit Axes: DataFrame.plot otherwise opens a second figure and leaves
# the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(15, 6))
top4_male_salaries.plot.bar(x='title', y='totalyearlycompensation', ax=ax, legend=False)
ax.set_title('Top 4 Male Salaries', size=20)
# Vertical bars: job titles run along x and compensation along y (labels were swapped).
ax.set_xlabel('Title', size=15)
ax.set_ylabel('Compensation in Millions ($)', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# Rows whose gender is 'Female', then the four with the largest total compensation.
female_salaries = data_salaries[data_salaries.gender == 'Female'].copy()
top4_female_salaries = female_salaries.nlargest(4, 'totalyearlycompensation')

# Draw on an explicit Axes: DataFrame.plot otherwise opens a second figure and leaves
# the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(15, 6))
top4_female_salaries.plot.bar(x='title', y='totalyearlycompensation', ax=ax, legend=False)
ax.set_title('Top 4 Female Salaries', size=20)
# Vertical bars: job titles run along x and compensation along y (labels were swapped).
ax.set_xlabel('Title', size=15)
ax.set_ylabel('Compensation in Millions ($)', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# The cell used to create a 15x6 figure and then resize it to 12x8 through the plot
# call; the figure is now created once at the size that was actually rendered.
fig, ax = plt.subplots(figsize=(12, 8))
# The name stays bound to the Axes returned by the plot call, as before.
gender_distribution = data_salaries['gender'].value_counts().plot.bar(ax=ax)
ax.set_title('Gender Distribution', size=20)
# Vertical bars: gender runs along x and counts along y (labels were swapped).
ax.set_xlabel('Gender', size=15)
ax.set_ylabel('Count', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# Draw on an explicit Axes: DataFrame.plot otherwise opens a second figure and leaves
# the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(15, 6))

# Mean total yearly compensation per Race value, highest first.
# The name stays bound to the Axes returned by the plot call, as before.
pay_by_race = (
    data_salaries[['totalyearlycompensation', 'Race']]
    .groupby('Race')
    .mean()
    .round(2)
    .sort_values('totalyearlycompensation', ascending=False)
    .plot.bar(ax=ax)
)
ax.set_title('Compensation By Race', size=20)
# Vertical bars: race runs along x and compensation along y (labels were swapped).
ax.set_xlabel('Race', size=15)
ax.set_ylabel('Compensation ($)', size=15)
plt.show()

In [None]:
# Pie chart of how often each Race value occurs in the salary subset.
plt.figure(figsize=(15, 6))
plt.style.use('fivethirtyeight')
race_counts = data_salaries['Race'].value_counts()
# The name is bound to the Axes returned by the plot call.
race_distribution = race_counts.plot(kind='pie')
plt.title('Race Distribution', size=20)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# Draw on an explicit Axes: DataFrame.plot otherwise opens a second figure and leaves
# the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(15, 6))

# Mean total yearly compensation per Education value, rounded to whole units,
# highest first. The name stays bound to the Axes returned by the plot call, as before.
pay_by_education = (
    data_salaries[['totalyearlycompensation', 'Education']]
    .groupby('Education')
    .mean()
    .round()
    .sort_values('totalyearlycompensation', ascending=False)
    .plot.bar(ax=ax)
)
ax.set_title('Compensation By Education', size=20)
# Vertical bars: education runs along x and compensation along y (labels were swapped).
ax.set_xlabel('Education', size=15)
ax.set_ylabel('Compensation ($)', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(15, 6))
# Row count per Education value. The name stays bound to the Axes returned by the
# plot call, as before.
education_distribution = data_salaries['Education'].value_counts().plot.bar(ax=ax)
ax.set_title('Distribution of Education', size=20)
# Vertical bars: education runs along x and counts along y (labels were swapped).
ax.set_xlabel('Education', size=15)
ax.set_ylabel('Count', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

# Draw on an explicit Axes: DataFrame.plot otherwise opens a second figure and leaves
# the one created by plt.figure() empty.
fig, ax = plt.subplots(figsize=(15, 6))

# Mean total yearly compensation per years-of-experience value; the 20 values with the
# highest mean are shown, ordered by that mean.
# The text column 'title' is not selected: averaging it raises TypeError on
# pandas >= 2.0. The name stays bound to the Axes returned by the plot call, as before.
years_of_experience = (
    data_salaries[['totalyearlycompensation', 'yearsofexperience']]
    .groupby('yearsofexperience')
    .mean()
    .round(2)
    .sort_values('totalyearlycompensation', ascending=False)
    .head(20)
    .plot.bar(ax=ax)
)
ax.set_title('Years of Experience VS Compensation', size=17)
# Vertical bars: experience runs along x and compensation along y (labels were swapped).
ax.set_xlabel('Years of Experience', size=15)
ax.set_ylabel('Compensation ($)', size=15)
plt.show()

In [None]:
plt.style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(15, 6))
# value_counts() sorts descending, so the first 20 entries are the 20 most frequent
# locations. The name stays bound to the Axes returned by the plot call, as before.
location = data_salaries['location'].value_counts().iloc[:20].plot.bar(ax=ax)
ax.set_title('Top 20 locations of tech jobs', size=17)
# Vertical bars: locations run along x and counts along y. The labels were swapped,
# and the category axis was labelled 'Company' although it shows locations.
ax.set_xlabel('Location', size=15)
ax.set_ylabel('Number of workers', size=15)
plt.show()