In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as pt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as po
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import random
import plotly.figure_factory as ff

%matplotlib inline

In [2]:
df_cleaned = pd.read_csv("../input/software-enginners-data-cleaned/software_enginners_data_cleaned.csv",engine='python')
print(df_cleaned.head())
print(df_cleaned.info())

                        Job Title                   Salary Estimate  \
0    Skokie JavaScript Tutor Jobs  $11-$28 Per Hour(Glassdoor est.)   
1  Evanston JavaScript Tutor Jobs  $11-$28 Per Hour(Glassdoor est.)   
2         QuickBase App Developer  $11-$28 Per Hour(Glassdoor est.)   
3       Senior Software Developer  $11-$28 Per Hour(Glassdoor est.)   
4   Mid Level Full Stack Engineer  $11-$28 Per Hour(Glassdoor est.)   

                                     Job Description  Rating  \
0  #Skokie JavaScript Tutor Jobs\n\nVarsity Tutor...     4.1   
1  #Evanston JavaScript Tutor Jobs\n\nVarsity Tut...     4.1   
2  ZTERS is a site services management company ba...     NaN   
3  Join us as we revolutionize online school mana...     5.0   
4  PLEASE APPLY TO THE POSITION HERE: https://www...     5.0   

          Company Name              Location  Headquarters  \
0  Varsity Tutors\n4.1            Skokie, IL           NaN   
1  Varsity Tutors\n4.1          Evanston, IL           NaN   
2 

In [3]:
def numeric_col(df):
    col_num = []
    for col in df.columns:
        if df[col].dtypes == 'int64' or "float64":
            col_num.append(col)
    return col_num

def object_col(df):
    col_obj = []
    for col in df.columns:
        if df[col].dtypes == 'object':
            col_obj.append(col)
    return col_obj

In [None]:
#def job_title():
    #"""
    #"""
    #values = df_cleaned["Job Title"].value_counts().index
    #job_list = list(values)
    
    #return job_list
#print(job_list)

In [4]:
def popular_language(column):
    """
    Take a column name 
    and return a dictionary of 
    the most popular programming languages
    """
    dic_of_languages = dict({ "javascript": 0,"python": 0,"react": 0,"angular": 0, "node": 0,".net": 0 })
    num_item = df_cleaned.shape[0]
    
    for desc in df_cleaned[column]:
        
        if "javascript" in desc.lower():
            dic_of_languages["javascript"] += 1
            
        elif "python" in desc.lower():
            dic_of_languages["python"] += 1
            
        elif ("react" or "react.js") in desc.lower():
            dic_of_languages["react"] += 1
            
        elif "angular" in desc.lower():
            dic_of_languages["angular"] += 1 
        
        elif ("node" or "node.js") in desc.lower():
            dic_of_languages["node"] += 1
        
        elif (".net" or "net") in desc.lower():
            dic_of_languages[".net"] += 1
            
    return dic_of_languages

dic1 = popular_language("Job Title")
dic2 = popular_language("Job Description")

print(dic1)
print(dic2)

{'javascript': 30, 'python': 5, 'react': 16, 'angular': 16, 'node': 11, '.net': 37}
{'javascript': 854, 'python': 0, 'react': 1, 'angular': 0, 'node': 1, '.net': 0}


In [5]:
def seniority(title):
    
    if 'sr' in title.lower() or 'senior' in title.lower() or 'sr' in title.lower():
        return 'senior'
    
    elif 'jr' in title.lower() or 'jr.' in title.lower():
        return 'jr'
    
    else:
        return 'none'

df_cleaned['seniority'] = df_cleaned['Job Title'].apply(seniority)

In [6]:
df = pd.pivot_table(df_cleaned, index = ['Job Title','seniority'], values = 'avr_salary').sort_values('avr_salary', ascending = False)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,avr_salary
Job Title,seniority,Unnamed: 2_level_1
Senior Software Engineer - React,senior,120.0
Online Mobile Banking Software Engineer,none,120.0
LAMP Software Engineer,none,120.0
Senior Software Engineer - Interoperability,senior,120.0
Sr. Java Developer,senior,120.0


In [7]:
df_cleaned["desc_len"] = df_cleaned["Job Description"].apply(lambda x: len(x)).sort_values(ascending = False)
print(df_cleaned["desc_len"].head())

0    1413
1    1417
2    2734
3    2759
4    5775
Name: desc_len, dtype: int64


In [8]:
df = pd.pivot_table(df_cleaned, index = 'Job Title', values = 'avr_salary').sort_values(by = "avr_salary",ascending = False)
df.head()

Unnamed: 0_level_0,avr_salary
Job Title,Unnamed: 1_level_1
Senior Software Engineer - React,120.0
Online Mobile Banking Software Engineer,120.0
LAMP Software Engineer,120.0
Senior Software Engineer - Interoperability,120.0
Sr. Java Developer,120.0


In [9]:
df = pd.pivot_table(df_cleaned, index = ['State'], values = 'avr_salary').sort_values('avr_salary', ascending = False)
df.head()

Unnamed: 0_level_0,avr_salary
State,Unnamed: 1_level_1
HI,120.0
KY,97.5
ME,93.833333
RI,86.0
as,83.0


In [10]:
df = pd.pivot_table(df_cleaned, index = ['Type of ownership'], values = 'avr_salary').sort_values('avr_salary', ascending = False)
df.head()

Unnamed: 0_level_0,avr_salary
Type of ownership,Unnamed: 1_level_1
Private Practice / Firm,86.0
Government,77.333333
Company - Public,69.827731
Subsidiary or Business Segment,69.565789
Company - Private,68.880912


In [11]:
def min_employer():
    
    try:
        df_cleaned["min_employer"] = df_cleaned["Size"].astype(str).apply(lambda x: x.split()[0])
    except IndexError:
        return 1
    
dic = min_employer()

In [12]:
def max_employer():
    
    try:
        df_cleaned["max_employer"] = df_cleaned["Size"].astype(str).apply(lambda x: str(x).split()[0])
    except IndexError:
        return 1
    
dic = max_employer()

In [13]:
print(df_cleaned.head())

                        Job Title                   Salary Estimate  \
0    Skokie JavaScript Tutor Jobs  $11-$28 Per Hour(Glassdoor est.)   
1  Evanston JavaScript Tutor Jobs  $11-$28 Per Hour(Glassdoor est.)   
2         QuickBase App Developer  $11-$28 Per Hour(Glassdoor est.)   
3       Senior Software Developer  $11-$28 Per Hour(Glassdoor est.)   
4   Mid Level Full Stack Engineer  $11-$28 Per Hour(Glassdoor est.)   

                                     Job Description  Rating  \
0  #Skokie JavaScript Tutor Jobs\n\nVarsity Tutor...     4.1   
1  #Evanston JavaScript Tutor Jobs\n\nVarsity Tut...     4.1   
2  ZTERS is a site services management company ba...     NaN   
3  Join us as we revolutionize online school mana...     5.0   
4  PLEASE APPLY TO THE POSITION HERE: https://www...     5.0   

          Company Name              Location  Headquarters  \
0  Varsity Tutors\n4.1            Skokie, IL           NaN   
1  Varsity Tutors\n4.1          Evanston, IL           NaN   
2 

# univariate analysis

In [14]:
fig = px.pie(df_cleaned, values='avr_salary', names='Type of ownership')
fig.update_layout(
title = dict(text = "sector employement data",x = 0.44,y = 0.95,font_size = 20), width=800, height=650)
fig.update_traces(textposition='outside', textinfo='percent+label')

In [15]:
fig = px.histogram(df_cleaned, x = "avr_salary", nbins = 10, title = 'Histogram of avr_salary')
fig.show()

In [16]:
fig = px.histogram(df_cleaned, x = "Rating", nbins = 10, title = 'Histogram of Rating')
fig

In [17]:
fig = px.histogram(df_cleaned, x = "max_employer", nbins = 10, title = 'Histogram maximun of employer')
fig

In [18]:
fig = px.histogram(df_cleaned, x = "min_employer", nbins = 10, title = 'Histogram minimun of employer')
fig

In [19]:
fig = px.histogram(df_cleaned, x = "age", nbins = 10, title = 'Histogram of age')
fig

In [20]:
fig = px.histogram(df_cleaned, x = "Remote", nbins = 10, title = 'Histogram of remote')
fig

In [21]:
fig = px.bar(df_cleaned, x = "Salary Estimate")
fig

In [None]:
df = df_cleaned[['Location', 
             'Size',
             'Type of ownership', 
             'Industry', 
             'Sector', 
             'Revenue', 
             'desc_len', 
             'State', 
             'javascript', 
             'html',
             'react', 
             'angular', 
             'git', 
             'node', 
             'seniority',
             'php',
             'mysql',
             'j2ee',
             'net',
             'new' ]]

In [None]:
def Plot_barplot():
    
    for col in df.columns:
        cat_num = df[col].value_counts()
        print("graph of %s:" % col)
        fig = px.bar(x = cat_num.index, y = cat_num )
        fig.show()
    print("plots created successfully")
    
Plot_barplot()

In [None]:
print(df_cleaned.info())

# biavariate analysis

In [None]:
fig = px.scatter(df_cleaned, x = "age", y = "avr_salary")
fig.show()

In [None]:
fig = px.violin(df_cleaned, y = "avr_salary", x = "Type of ownership", points="all",box = True ,hover_data = df_cleaned.columns)
fig.show()

In [None]:
fig = px.violin(df_cleaned, y = "avr_salary", x = "Remote", points="all",box = True ,hover_data = df_cleaned.columns)
fig.show()

In [None]:
fig = px.violin(df_cleaned, y = "avr_salary", x = "Sector", points="all",box = True ,hover_data = df_cleaned.columns)
fig.show()

In [None]:
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(df_cleaned[['age','avr_salary','Rating',"javascript","react","node"]].corr(),vmax=.3, center=0, cmap = cmap,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})