In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('salaries_by_college_major.csv')

In [None]:
df.head()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,Business


In [None]:
df.shape

(51, 6)

In [None]:
df.columns

Index(['Undergraduate Major', 'Starting Median Salary',
       'Mid-Career Median Salary', 'Mid-Career 10th Percentile Salary',
       'Mid-Career 90th Percentile Salary', 'Group'],
      dtype='object')

## **Missing Values and Junk Data**

In [3]:
df.isna()
#Row 50 returns true

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [4]:
df.tail()
#Row 50 has NaN

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
46,Psychology,35900.0,60400.0,31600.0,127000.0,HASS
47,Religion,34100.0,52000.0,29700.0,96400.0,HASS
48,Sociology,36500.0,58200.0,30700.0,118000.0,HASS
49,Spanish,34000.0,53100.0,31000.0,96400.0,HASS
50,Source: PayScale Inc.,,,,,


In [5]:
clean_df = df.dropna()
clean_df.tail()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
45,Political Science,40800.0,78200.0,41200.0,168000.0,HASS
46,Psychology,35900.0,60400.0,31600.0,127000.0,HASS
47,Religion,34100.0,52000.0,29700.0,96400.0,HASS
48,Sociology,36500.0,58200.0,30700.0,118000.0,HASS
49,Spanish,34000.0,53100.0,31000.0,96400.0,HASS


## **Accessing Columns and Individual Cells in a Dataframe**

In [6]:
#Accessing a Particular Column
clean_df['Starting Median Salary']

0     46000.0
1     57700.0
2     42600.0
3     36800.0
4     41600.0
5     35800.0
6     38800.0
7     43000.0
8     63200.0
9     42600.0
10    53900.0
11    38100.0
12    61400.0
13    55900.0
14    53700.0
15    35000.0
16    35900.0
17    50100.0
18    34900.0
19    60900.0
20    38000.0
21    37900.0
22    47900.0
23    39100.0
24    41200.0
25    43500.0
26    35700.0
27    38800.0
28    39200.0
29    37800.0
30    57700.0
31    49100.0
32    36100.0
33    40900.0
34    35600.0
35    49200.0
36    40800.0
37    45400.0
38    57900.0
39    35900.0
40    54200.0
41    39900.0
42    39900.0
43    74300.0
44    50300.0
45    40800.0
46    35900.0
47    34100.0
48    36500.0
49    34000.0
Name: Starting Median Salary, dtype: float64

In [7]:
#Finding the Highest Value in a columm
clean_df['Starting Median Salary'].max()

74300.0

In [8]:
# Index for the row with the largest value.
clean_df['Starting Median Salary'].idxmax()

43

In [12]:
# To see a corresponding value of the largest value (Location Property)
clean_df['Undergraduate Major'].loc[43]
# You can also do
clean_df['Undergraduate Major'][43]

'Physician Assistant'

In [11]:
# If you don't specify a particular column you can use the .loc property to retrieve an entire row:
clean_df.loc[43]

Undergraduate Major                  Physician Assistant
Starting Median Salary                           74300.0
Mid-Career Median Salary                         91700.0
Mid-Career 10th Percentile Salary                66400.0
Mid-Career 90th Percentile Salary               124000.0
Group                                               STEM
Name: 43, dtype: object

## **Challenges**


### **1. What college major has the highest mid-career salary? How much do graduates with this major earn?**

In [15]:
print(clean_df['Mid-Career Median Salary'].max())
print(clean_df['Mid-Career Median Salary'].idxmax())

107000.0
8


In [16]:
clean_df['Undergraduate Major'][8]

'Chemical Engineering'

In [22]:
max_row_index = clean_df['Mid-Career Median Salary'].idxmax()
print(f'''
College Major with the highest Mid-Career Salary is "{clean_df['Undergraduate Major'][max_row_index]}".
Graduates of this major earn "{clean_df['Mid-Career Median Salary'].max()}".
''')


College Major with the highest Mid-Career Salary is "Chemical Engineering".
Graduates of this major earn "107000.0".



### **2. Which college major has the lowest starting salary and how much do graduates earn after university?**

In [21]:
print(clean_df['Starting Median Salary'].min())
print(clean_df['Starting Median Salary'].idxmin())

34000.0
49


In [25]:
clean_df['Undergraduate Major'][49]

'Spanish'

In [28]:
min_row_index = clean_df['Starting Median Salary'].idxmin()
print(f'''
College Major with the loswer Starting Salary is "{clean_df['Undergraduate Major'][min_row_index]}".
Graduates on the major earn "{clean_df['Starting Median Salary'].min()}".
''')


College Major with the loswer Starting Salary is "Spanish".
Graduates on the major earn "34000.0".



### **3. Which college major has the lowest mid-career salary and how much do people expect to earn with this degree?**

In [31]:
print(clean_df['Mid-Career Median Salary'].min())
print(clean_df['Mid-Career Median Salary'].idxmin())

52000.0
18


In [32]:
clean_df['Mid-Career Median Salary'][18]

52000.0

In [34]:
lowest_mid_salary_index = clean_df['Mid-Career Median Salary'].idxmin()
print(f'''
Collage Major with the lowest Mid-Career Salary is "{clean_df['Undergraduate Major'][lowest_mid_salary_index]}".
People Expect to earn "{clean_df['Mid-Career Median Salary'][lowest_mid_salary_index]}".
''')


Collage Major with the lowest Mid-Career Salary is "Education".
People Expect to earn "52000.0".



In [29]:
clean_df.head()


Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,Business
