In [30]:
import pandas as pd
import plotly.express as px
import plotly.subplots as sp

# Load the employee and department tables (each file is read exactly once).
employee_df = pd.read_csv('startup_data.csv')
department_df = pd.read_csv('department_data.csv')

# Strip leading/trailing whitespace from the column names of both tables.
employee_df.columns = employee_df.columns.str.strip()
department_df.columns = department_df.columns.str.strip()

# Strip whitespace from the join key as well, so that values differing only
# by surrounding spaces still match during the merge.
employee_df['Department'] = employee_df['Department'].str.strip()
department_df['Department'] = department_df['Department'].str.strip()

# Left join: every employee row is kept and the columns of the matching
# department row are attached to it.
df = pd.merge(employee_df, department_df, on='Department', how='left')

# First 5 rows. Only the last expression of a cell is rendered automatically,
# so the head has to be displayed explicitly.
display(df.head())

# Last 5 rows (rendered as the cell's result).
df.tail()

Unnamed: 0,EmployeeID,FirstName,LastName,Role,Department,Salary,StartDate,Project,PerformanceRating,MonthlyBudget,ActualSpending,OfficeSpace,EquipmentCosts,SoftwareLicenses,TrainingBudget,MarketingExpenses,TravelExpenses,Utilities,Miscellaneous
5,6,Alice,Davis,DevOps Engineer,Operations,72000,2021-09-20,Project Zeta,4.3,75000,76500,14000,20000,12000,4000,0,2500,3000,1200
6,7,Robert,Wilson,Backend Developer,Development,71000,2023-02-05,Project Alpha,4.4,100000,98500,15000,25000,10000,5000,0,2000,3000,1500
7,8,Laura,Moore,Frontend Developer,Development,69000,2022-10-25,Project Beta,4.0,100000,98500,15000,25000,10000,5000,0,2000,3000,1500
8,9,James,Taylor,HR Manager,Human Resources,73000,2020-12-01,,4.8,45000,44000,7000,3000,2000,5000,1000,2000,1500,1000
9,10,Emma,Anderson,Marketing Specialist,Marketing,67000,2021-08-15,Project Gamma,4.2,65000,68000,10000,5000,4000,3000,25000,5000,2000,1500


In [31]:
# Function to save plots
def save_plot(fig, filename):
    """Write a figure to ``filename``.

    Works with both kinds of figure objects:

    * Plotly figures (anything exposing ``write_image``): file names ending
      in ``.html``/``.htm`` are exported as an interactive HTML page, every
      other name is exported as a static image. Static export relies on
      Plotly's image-export engine (the ``kaleido`` package) being installed.
    * Matplotlib figures: saved at 300 dpi with a tight bounding box and then
      closed to release the figure's memory.

    Parameters
    ----------
    fig : plotly figure or matplotlib figure
        The figure to save.
    filename : str or path-like
        Destination path; the extension selects the output format.
    """
    if hasattr(fig, 'write_image'):
        # Plotly figure: it has no savefig(), use Plotly's own exporters.
        if str(filename).lower().endswith(('.html', '.htm')):
            fig.write_html(filename)
        else:
            fig.write_image(filename)
        return

    # Matplotlib figure. pyplot is imported lazily because the notebook never
    # imports it at the top level (the bare ``plt`` reference used to raise
    # NameError) and Plotly-only runs should not need Matplotlib installed.
    import matplotlib.pyplot as plt

    fig.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close(fig)

# Quick data-quality overview. Each report is printed under its own heading:
# missing values per column, the distinct departments, and the head count
# per department.
department_column = df['Department']
for heading, report in (
    ("\nMissing Values:", df.isnull().sum()),
    ("\nUnique Departments:", department_column.unique()),
    ("\nEmployee Count by Department:", department_column.value_counts()),
):
    print(heading)
    print(report)

# List every employee that belongs to one department of interest.
specific_department = 'Development'
dev_employees = df.loc[df['Department'] == specific_department]
print(f"\nEmployees in {specific_department}:")
print(dev_employees)

# Optional deeper inspection (disabled):
#   df.info()      -> general information about the DataFrame
#   df.describe()  -> count, mean, std, min, max, percentiles

# Last expression of the cell: the notebook renders the department array.
df['Department'].unique()


Missing Values:
EmployeeID           0
FirstName            0
LastName             0
Role                 0
Department           0
Salary               0
StartDate            0
Project              0
PerformanceRating    0
MonthlyBudget        0
ActualSpending       0
OfficeSpace          0
EquipmentCosts       0
SoftwareLicenses     0
TrainingBudget       0
MarketingExpenses    0
TravelExpenses       0
Utilities            0
Miscellaneous        0
dtype: int64

Unique Departments:
['Development' 'Data Science' 'Design' 'Management' 'Quality Assurance'
 'Operations' 'Human Resources' 'Marketing']

Employee Count by Department:
Department
Development          3
Data Science         1
Design               1
Management           1
Quality Assurance    1
Operations           1
Human Resources      1
Marketing            1
Name: count, dtype: int64

Employees in Development:
   EmployeeID   FirstName   LastName                   Role   Department  \
0           1   John        Doe        S

array(['Development', 'Data Science', 'Design', 'Management',
       'Quality Assurance', 'Operations', 'Human Resources', 'Marketing'],
      dtype=object)

In [32]:
import plotly.express as px

# Define role groups: maps a group label to the role titles that belong to it.
# Dictionary order matters, because the first group with a matching title wins.
role_groups = {
    'Software Development': ['Software Engineer', 'Backend Developer', 'Frontend Developer', 'Full Stack Developer', 'Mobile Developer', 'Software Architect'],
    'Data & Analytics': ['Data Scientist', 'Data Analyst', 'Data Engineer', 'Machine Learning Engineer', 'Business Intelligence Analyst'],
    'Design & UX': ['UX Designer', 'UI Designer', 'Graphic Designer', 'Product Designer'],
    'Management & Product': ['Product Manager', 'Project Manager', 'Technical Lead', 'Scrum Master', 'Agile Coach'],
    'Operations & IT': ['DevOps Engineer', 'System Administrator', 'Network Engineer', 'Cloud Engineer', 'IT Support Specialist'],
    'Other Specialists': ['Marketing Specialist', 'HR Specialist', 'Financial Analyst', 'Sales Manager', 'Customer Support']
}

# Determine group for each role
def group_roles(role):
    """Return the role group that a role title belongs to.

    Matching is case-insensitive and substring based: a title such as
    'Senior Software Engineer' matches the listed role 'Software Engineer'.
    Groups are checked in the order they appear in ``role_groups`` and the
    first match is returned.

    Parameters
    ----------
    role : str or missing value
        The role title. Non-string values (``None``, ``NaN``) are treated as
        "no role" instead of raising ``AttributeError``.

    Returns
    -------
    str
        The matching group label, or 'Other Specialists' when nothing matches
        or the role is missing.
    """
    # A missing role has no .lower(); fall back to the catch-all group so a
    # single empty cell cannot abort the whole Series.apply() call.
    if not isinstance(role, str):
        return 'Other Specialists'

    # Lower-case the title once instead of once per candidate role.
    role_lower = role.lower()
    for group, roles in role_groups.items():
        if any(candidate.lower() in role_lower for candidate in roles):
            return group
    return 'Other Specialists'

# Tag every employee with the role group derived from their role title.
df['Role Group'] = df['Role'].apply(group_roles)

# Mean salary per role group, returned as a two-column frame
# ('Role Group', 'Salary') ready for plotting.
salary_by_group = df.groupby('Role Group')['Salary']
avg_salary_by_role_group = salary_by_group.mean().reset_index()

# Bar chart: one bar per role group, each in its own colour taken from
# Plotly's qualitative palette.
bar_options = {
    'x': 'Role Group',
    'y': 'Salary',
    'title': 'Average Salary by Role Group',
    'labels': {'Role Group': 'Role Group', 'Salary': 'Average Salary'},
    'color': 'Role Group',
    'color_discrete_sequence': px.colors.qualitative.Plotly,
}
fig = px.bar(avg_salary_by_role_group, **bar_options)

# Tilt the x-axis tick labels so long group names do not overlap.
fig.update_layout(xaxis={'tickangle': -45})
fig.show()



In [33]:
# Count how many employees belong to each department and shape the result
# into a two-column frame: 'Department' and 'Count'.
department_distribution = (
    df['Department']
    .value_counts()
    .rename_axis('Department')
    .reset_index(name='Count')
)

# Pie chart of the department distribution, drawn with Plotly.
pie_options = dict(
    names='Department',
    values='Count',
    title='Verteilung der Abteilungen',
    hole=0.3,  # a non-zero hole turns the pie into a donut chart
    labels={'Department': 'Abteilung', 'Count': 'Anzahl'},
    template='plotly_dark',  # optional: dark theme
)
fig = px.pie(department_distribution, **pie_options)

# Cosmetics: percentage and label inside each slice, legend visible.
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.update_layout(showlegend=True)

# Render the chart.
fig.show()

In [34]:
# Histogram of the performance ratings, drawn with Plotly.
fig = px.histogram(
    df,
    x='PerformanceRating',
    nbins=5,  # requested number of bins (comparable to `bins` in Seaborn)
    title='Distribution of Performance Ratings',
    color_discrete_sequence=['lightgreen'],  # bar colour
)

# Axis titles, spacing between the bars, and a light theme.
histogram_layout = {
    'xaxis_title': 'Performance Ratings',
    'yaxis_title': 'Frequency',
    'bargap': 0.2,
    'template': 'plotly_white',
}
fig.update_layout(**histogram_layout)

# Render the histogram.
fig.show()

In [35]:
# Box plot of the salary distribution, one box per department.
fig = px.box(
    df,
    y='Salary',
    x='Department',
    title='Salary Distribution Across Departments',
)

# Axis titles, tilted x-axis tick labels (-45 degrees), and a light theme.
box_layout = dict(
    template='plotly_white',
    xaxis_tickangle=-45,
    xaxis_title='Department',
    yaxis_title='Salary',
)
fig.update_layout(**box_layout)

# Render the box plot.
fig.show()


In [36]:
# Prerequisite for the trendline below: pip install statsmodels

# Correlation coefficient between salary and performance rating
# (Series.corr with its default method).
salary_series = df['Salary']
correlation = salary_series.corr(df['PerformanceRating'])

# Scatter plot with a fitted regression (trend) line; the correlation value
# is embedded in the chart title, rounded to two decimals.
scatter_title = f'Correlation between Salary and Performance Rating: {correlation:.2f}'
fig = px.scatter(
    df,
    x='PerformanceRating',
    y='Salary',
    title=scatter_title,
    trendline='ols',  # ordinary least squares regression
    template='plotly_white',
    labels={'PerformanceRating': 'Performance Rating', 'Salary': 'Salary'},
)

# Axis titles and legend heading.
scatter_layout = {
    'legend_title_text': 'Trendline',
    'xaxis_title': 'Performance Rating',
    'yaxis_title': 'Salary',
}
fig.update_layout(**scatter_layout)

# Render the plot.
fig.show()

In [37]:
# Canvas with 1 row and 2 columns; each panel gets its own heading.
fig = sp.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        'Salary Distribution by Department',
        'Salary Distribution by Performance Rating',
    ),
)

# Source figures built with Plotly Express:
#   fig1 - salary per department
#   fig2 - salary per performance rating, coloured by department
fig1 = px.box(df, x='Department', y='Salary', color='Department', title='Salary Distribution by Department')
fig2 = px.box(df, x='PerformanceRating', y='Salary', color='Department', title='Salary Distribution by Performance Rating')

# Copy the traces of each source figure into its panel:
# fig1 goes to column 1, fig2 to column 2.
for column_number, source_fig in enumerate((fig1, fig2), start=1):
    for trace in source_fig.data:
        fig.add_trace(trace, row=1, col=column_number)

# Overall size and title; the legend is hidden.
fig.update_layout(
    title_text='Salary Distribution Subplots',
    width=1200,
    height=600,
    showlegend=False,
)

# Rotate the tick labels of every x-axis.
fig.update_xaxes(tickangle=45)

# Render the figure.
fig.show()