### Loading Data and Statistical Analysis

In [4]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

In [5]:
# we may load external data
# df = pd.read_csv('data/salaries.csv') # or xml, json, excel, etc.
# or load from a URL
url = 'https://raw.githubusercontent.com/onionmccabbage/pythonFurtherAdvancedSept2025/refs/heads/main/data/Salaries.csv'
df = pd.read_csv(url)
df.head(7) # or tail()
df.describe() # plenty to give actionable statistics

Unnamed: 0,phd,service,salary
count,78.0,78.0,78.0
mean,19.705128,15.051282,108023.782051
std,12.498425,12.139768,28293.661022
min,1.0,0.0,57800.0
25%,10.25,5.25,88612.5
50%,18.5,14.5,104671.0
75%,27.75,20.75,126774.75
max,56.0,51.0,186960.0


In [6]:
# we may examine our data strucure
df.columns
df.size
df['salary'].mean() # for one column
df[['service', 'salary']].mean() # a list of columns
df.max()

rank            Prof
discipline         B
phd               56
service           51
sex             Male
salary        186960
dtype: object

#### Grouping and Aggregating Data

In [7]:
df_rank = df.groupby(['rank'])
# anything done in np or pd is very performant - even for massive data sets
df_rank[['salary']].mean() # consider single [] or double [[]]

Unnamed: 0_level_0,salary
rank,Unnamed: 1_level_1
AssocProf,91786.230769
AsstProf,81362.789474
Prof,123624.804348


In [8]:
# some meaningful analysis
df_n = df.groupby(['sex'])
df_n[['salary']].mean()
df_years = df.groupby('rank')
df_years[['service']].mean()

Unnamed: 0_level_0,service
rank,Unnamed: 1_level_1
AssocProf,11.307692
AsstProf,2.210526
Prof,21.413043


In [11]:
# mini challenge
# show the mean salary for each rank, grouped by sex
whichColumns = ['rank', 'sex'] # it is sometimes useful to write list first
mf = df.groupby(whichColumns)  # ... then use these lists within our code
s = ['salary']
mf[s].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,salary
rank,sex,Unnamed: 2_level_1
AssocProf,Female,88512.8
AssocProf,Male,102697.666667
AsstProf,Female,78049.909091
AsstProf,Male,85918.0
Prof,Female,121967.611111
Prof,Male,124690.142857


In [15]:
# more data analysese
# find mean salary for all salaries over 120000 grouped by rank
b = df[ df['salary']>120000 ] 
# b.groupby(['rank'])
b[['salary']].mean()

salary    141722.4
dtype: float64

#### Aggregating data

In [19]:
# we can aggregate values using .agg()
df[['phd', 'salary']].agg(['max','min','mean'])

Unnamed: 0,phd,salary
max,56.0,186960.0
min,1.0,57800.0
mean,19.705128,108023.782051
