# Loading Necessary Packages

In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter

import warnings
# NOTE(review): 'ignore' silences every warning category for the rest of the
# notebook, so deprecation and pandas warnings will not be shown either.
warnings.filterwarnings('ignore')

## Reading the data

In [2]:
# Load the filtered salaries table from the intermediate_datasets folder
data_sal = pd.read_csv('intermediate_datasets/salaries_filtered.csv')

## Looking at the shape and a sample of the data

In [3]:
# Dropping unnecessary columns.
# The four columns to keep are selected by name instead of dropping the others
# by position: a positional in-place drop is not repeatable (on a second run the
# frame has only four columns left, so positions 5-8 no longer exist).
data_sal = data_sal[
    ["School Name", "School Type", "Starting Median Salary", "Mid-Career Median Salary"]
].copy()
print("Number of rows: ", data_sal.shape[0])
print("Number of columns: ", data_sal.shape[1])
print("\nFirst 5 rows")
data_sal.head(5)

Number of rows:  100
Number of columns:  4

First 5 rows


Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary
0,Massachusetts Institute of Technology,Engineering,"$72,200.00","$126,000.00"
1,California Institute of Technology,Engineering,"$75,500.00","$123,000.00"
2,Worcester Polytechnic Institute,Engineering,"$61,000.00","$114,000.00"
3,Carnegie Mellon University,Engineering,"$61,800.00","$111,000.00"
4,Rensselaer Polytechnic Institute,Engineering,"$61,100.00","$110,000.00"


# Combining the above data with average SAT scores, location, etc. for the same schools

## Reading the data


In [4]:
# Load the per-university statistics table from the intermediate_datasets folder
data_sat = pd.read_csv('intermediate_datasets/university_stats.csv')

## Looking at the shape and a sample of the data

In [5]:
# Discard the first column, then give the six remaining columns readable names
data_sat = data_sat.drop(columns=data_sat.columns[[0]])
data_sat.columns = [
    "School Name",
    "Average SAT Score",
    "City",
    "State",
    "Latitude",
    "Longitude",
]

# Report the size of the frame and preview its first rows
sat_row_count, sat_col_count = data_sat.shape
print("Number of rows: ", sat_row_count)
print("Number of columns: ", sat_col_count)
print("\nFirst 5 rows")
data_sat.head(5)

Number of rows:  100
Number of columns:  6

First 5 rows


Unnamed: 0,School Name,Average SAT Score,City,State,Latitude,Longitude
0,Princeton University,1400.0,Princeton,NJ,40.3563,-74.6693
1,Harvard University,1430.0,Cambridge,MA,42.3759,-71.1185
2,Yale University,1420.0,New Haven,CT,41.3112,-72.9246
3,Columbia University,1430.0,New York,NY,40.6943,-73.9249
4,Massachusetts Institute of Technology,1460.0,Cambridge,MA,42.3759,-71.1185


## Merging datasets

In [6]:
# Join the salary and SAT tables on their shared "School Name" column
# (pandas' default inner join: only schools present in both tables are kept)
merged_ds = pd.merge(data_sal, data_sat, on='School Name')

In [7]:
# Preview the first 10 rows of the salary + SAT merge
merged_ds.head(10)

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Average SAT Score,City,State,Latitude,Longitude
0,Massachusetts Institute of Technology,Engineering,"$72,200.00","$126,000.00",1460.0,Cambridge,MA,42.3759,-71.1185
1,California Institute of Technology,Engineering,"$75,500.00","$123,000.00",1510.0,Pasadena,CA,34.1597,-118.139
2,Worcester Polytechnic Institute,Engineering,"$61,000.00","$114,000.00",1200.0,Worcester,MA,42.2705,-71.8079
3,Carnegie Mellon University,Engineering,"$61,800.00","$111,000.00",1380.0,Pittsburgh,PA,40.4396,-79.9762
4,Rensselaer Polytechnic Institute,Engineering,"$61,100.00","$110,000.00",1280.0,Troy,NY,42.7354,-73.6751
5,Georgia Institute of Technology,Engineering,"$58,300.00","$106,000.00",1320.0,Atlanta,GA,33.7627,-84.4224
6,Colorado School of Mines,Engineering,"$58,100.00","$106,000.00",1250.0,Golden,CO,39.7406,-105.2118
7,Stevens Institute of Technology,Engineering,"$60,600.00","$105,000.00",1260.0,Hoboken,NJ,40.7453,-74.0279
8,Illinois Institute of Technology,Engineering,"$56,000.00","$97,800.00",1130.0,Chicago,IL,41.8373,-87.6862
9,Rochester Institute of Technology,Engineering,"$48,900.00","$84,600.00",1140.0,Rochester,NY,43.168,-77.6162


# Combining school region data with the above dataset

In [8]:
# Load the salaries-by-region table from the original_datasets folder
data_reg = pd.read_csv('original_datasets/salaries-by-region.csv')

In [9]:
# Formatting school names so they can be matched against the other datasets.
# Vectorised string methods are used instead of a row-by-row loop with chained
# assignment (data_reg['School Name'][i] = ...), which pandas does not guarantee
# to write back into the frame.
data_reg['School Name'] = (
    data_reg['School Name']
    # remove any parenthesised text together with the whitespace character before it
    .str.replace(r'\s\([^)]*\)', '', regex=True)
    # replace comma with -
    .str.replace(', ', '-', regex=False)
    # remove spacing on BOTH sides of - ; the earlier if/elif trimmed only one
    # side per name, so a name like "A - B" was left as "A -B"
    .str.replace('- ', '-', regex=False)
    .str.replace(' -', '-', regex=False)
)

In [10]:
# Preview the first 5 rows of the region table
data_reg.head(5)

Unnamed: 0,School Name,Region,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Stanford University,California,"$70,400.00","$129,000.00","$68,400.00","$93,100.00","$184,000.00","$257,000.00"
1,California Institute of Technology,California,"$75,500.00","$123,000.00",,"$104,000.00","$161,000.00",
2,Harvey Mudd College,California,"$71,800.00","$122,000.00",,"$96,000.00","$180,000.00",
3,University of California-Berkeley,California,"$59,900.00","$112,000.00","$59,500.00","$81,000.00","$149,000.00","$201,000.00"
4,Occidental College,California,"$51,900.00","$105,000.00",,"$54,800.00","$157,000.00",


In [11]:
# Attach the region information to the salary + SAT table via "School Name".
# Column names present in both inputs receive pandas' default _x / _y suffixes.
merged_ds_reg = pd.merge(data_reg, merged_ds, on='School Name')

In [12]:
# Keep the salary columns that came from the region table (suffix _x after the
# merge) and discard the percentile columns plus the duplicate salary columns
# that came from merged_ds (suffix _y).
# Columns are addressed by name, and names that are already gone are ignored, so
# re-running the cell is harmless; the earlier positional in-place drop removed
# further columns on a second run and the column reassignment then failed.
merged_ds_reg = (
    merged_ds_reg
    .drop(
        columns=[
            "Mid-Career 10th Percentile Salary",
            "Mid-Career 25th Percentile Salary",
            "Mid-Career 75th Percentile Salary",
            "Mid-Career 90th Percentile Salary",
            "Starting Median Salary_y",
            "Mid-Career Median Salary_y",
        ],
        errors="ignore",
    )
    .rename(
        columns={
            "Starting Median Salary_x": "Starting Median Salary",
            "Mid-Career Median Salary_x": "Mid-Career Median Salary",
        }
    )
)

In [13]:
# Display the merged frame to check the remaining columns and their names
merged_ds_reg

Unnamed: 0,School Name,Region,Starting Median Salary,Mid-Career Median Salary,School Type,Average SAT Score,City,State,Latitude,Longitude
0,California Institute of Technology,California,"$75,500.00","$123,000.00",Engineering,1510.0,Pasadena,CA,34.1597,-118.1390
1,University of California-Berkeley,California,"$59,900.00","$112,000.00",State,1300.0,Berkeley,CA,37.8723,-122.2760
2,University of California-San Diego,California,"$51,100.00","$101,000.00",State,1140.0,La Jolla,CA,32.8328,117.2713
3,University of California-Davis,California,"$52,300.00","$99,600.00",State,1050.0,Davis,CA,38.5552,-121.7365
4,University of California-Irvine,California,"$48,300.00","$96,700.00",State,1060.0,Irvine,CA,33.6772,-117.7738
...,...,...,...,...,...,...,...,...,...,...
95,University of Connecticut,Northeastern,"$48,000.00","$88,800.00",State,1130.0,Storrs,CT,41.8045,-72.2552
96,University of Rhode Island,Northeastern,"$43,900.00","$85,300.00",State,990.0,Kingston,RI,41.4738,-71.5236
97,Rochester Institute of Technology,Northeastern,"$48,900.00","$84,600.00",Engineering,1140.0,Rochester,NY,43.1680,-77.6162
98,University of Vermont,Northeastern,"$44,800.00","$82,700.00",State,1100.0,Burlington,VT,44.4877,-73.2314


In [14]:
# Converting salaries to float values, step 1: strip the currency formatting.
# The values stay strings here; the numeric conversion happens in the next cell.
# Whole-column string methods replace the per-row chained assignment
# (merged_ds_reg[col][i] = ...), which pandas does not guarantee to write back
# into the frame.
for salary_col in ['Starting Median Salary', 'Mid-Career Median Salary']:
    # replace "$" and comma with ""
    merged_ds_reg[salary_col] = (
        merged_ds_reg[salary_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )

In [15]:
# Parse the cleaned salary strings into numbers, one column at a time
for salary_col in ("Starting Median Salary", "Mid-Career Median Salary"):
    merged_ds_reg[salary_col] = pd.to_numeric(merged_ds_reg[salary_col])

In [16]:
# Display the frame again to check the salary columns after the numeric conversion
merged_ds_reg

Unnamed: 0,School Name,Region,Starting Median Salary,Mid-Career Median Salary,School Type,Average SAT Score,City,State,Latitude,Longitude
0,California Institute of Technology,California,75500.0,123000.0,Engineering,1510.0,Pasadena,CA,34.1597,-118.1390
1,University of California-Berkeley,California,59900.0,112000.0,State,1300.0,Berkeley,CA,37.8723,-122.2760
2,University of California-San Diego,California,51100.0,101000.0,State,1140.0,La Jolla,CA,32.8328,117.2713
3,University of California-Davis,California,52300.0,99600.0,State,1050.0,Davis,CA,38.5552,-121.7365
4,University of California-Irvine,California,48300.0,96700.0,State,1060.0,Irvine,CA,33.6772,-117.7738
...,...,...,...,...,...,...,...,...,...,...
95,University of Connecticut,Northeastern,48000.0,88800.0,State,1130.0,Storrs,CT,41.8045,-72.2552
96,University of Rhode Island,Northeastern,43900.0,85300.0,State,990.0,Kingston,RI,41.4738,-71.5236
97,Rochester Institute of Technology,Northeastern,48900.0,84600.0,Engineering,1140.0,Rochester,NY,43.1680,-77.6162
98,University of Vermont,Northeastern,44800.0,82700.0,State,1100.0,Burlington,VT,44.4877,-73.2314


# Exporting as one of the final datasets to be used

In [17]:
# Write the merged table to the working directory; index = False leaves the row index out of the file
merged_ds_reg.to_csv('region_sat_school.csv', index = False)

# Exploring the new csv file

In [18]:
# Read the exported file back in so the checks below run on what was actually written
ds1 = pd.read_csv('region_sat_school.csv')

In [19]:
# Structure summary of ds1: one row per column
data_struct = pd.DataFrame({
    'columns': ds1.columns,                  # columns
    'unique_values': ds1.nunique().values,   # number of unique values
    'dtype': ds1.dtypes.values,              # data types
    'nulls': ds1.isnull().sum().values,      # number of null or missing values
})

In [20]:
# Show the per-column summary (unique values, dtype, nulls) of the exported file
data_struct

Unnamed: 0,columns,unique_values,dtype,nulls
0,School Name,100,object,0
1,Region,5,object,0
2,Starting Median Salary,76,float64,0
3,Mid-Career Median Salary,81,float64,0
4,School Type,4,object,0
5,Average SAT Score,48,float64,5
6,City,97,object,0
7,State,42,object,0
8,Latitude,98,float64,0
9,Longitude,98,float64,0


# Cleaning dataset of salaries by major

## Reading the data

In [21]:
# Load the salaries-by-major table from the original_datasets folder
data_major = pd.read_csv('original_datasets/degrees-that-pay-back.csv')

## Looking at the shape and a sample of the data

In [22]:
# Dropping unnecessary columns.
# The four columns to keep are selected by name instead of dropping positions
# 4-7 in place: the positional drop is not repeatable (on a second run those
# positions no longer exist). The copy keeps later column assignments on
# data_major independent of the frame that was read from disk.
data_major = data_major[
    [
        "Undergraduate Major",
        "Starting Median Salary",
        "Mid-Career Median Salary",
        "Percent change from Starting to Mid-Career Salary",
    ]
].copy()
print("Number of rows: ", data_major.shape[0])
print("Number of columns: ", data_major.shape[1])
print("\nFirst 5 rows")
data_major.head(5)

Number of rows:  50
Number of columns:  4

First 5 rows


Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Percent change from Starting to Mid-Career Salary
0,Accounting,"$46,000.00","$77,100.00",67.6
1,Aerospace Engineering,"$57,700.00","$101,000.00",75.0
2,Agriculture,"$42,600.00","$71,900.00",68.8
3,Anthropology,"$36,800.00","$61,500.00",67.1
4,Architecture,"$41,600.00","$76,800.00",84.6


In [23]:
# Converting salaries to float values, step 1: strip the currency formatting.
# The values stay strings here; the numeric conversion happens in the next cell.
# Whole-column string methods replace the per-row chained assignment
# (data_major[col][i] = ...), which pandas does not guarantee to write back
# into the frame.
for salary_col in ['Starting Median Salary', 'Mid-Career Median Salary']:
    # replace "$" and comma with ""
    data_major[salary_col] = (
        data_major[salary_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )

In [24]:
# Parse the cleaned salary strings of both salary columns into numbers
for major_salary_col in ("Starting Median Salary", "Mid-Career Median Salary"):
    data_major[major_salary_col] = pd.to_numeric(data_major[major_salary_col])

In [25]:
# Preview the first 5 rows to check the salary columns after the numeric conversion
data_major.head(5)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Percent change from Starting to Mid-Career Salary
0,Accounting,46000.0,77100.0,67.6
1,Aerospace Engineering,57700.0,101000.0,75.0
2,Agriculture,42600.0,71900.0,68.8
3,Anthropology,36800.0,61500.0,67.1
4,Architecture,41600.0,76800.0,84.6


In [26]:
# Structure summary of data_major: one row per column
data_struct1 = pd.DataFrame({'columns': data_major.columns}).assign(  # columns
    unique_values=data_major.nunique().values,   # number of unique values
    dtype=data_major.dtypes.values,              # data types
    nulls=data_major.isnull().sum().values,      # number of null or missing values
)

In [27]:
# Show the per-column summary (unique values, dtype, nulls) of data_major
data_struct1

Unnamed: 0,columns,unique_values,dtype,nulls
0,Undergraduate Major,50,object,0
1,Starting Median Salary,43,float64,0
2,Mid-Career Median Salary,49,float64,0
3,Percent change from Starting to Mid-Career Salary,48,float64,0


# Exporting as one of the final datasets to be used

In [28]:
# Write the cleaned majors table to the working directory; index = False leaves the row index out of the file
data_major.to_csv('major_salaries.csv', index = False)

# Exploring the new csv file

In [29]:
# Read the exported file back in so the checks below run on what was actually written
ds2 = pd.read_csv('major_salaries.csv')

In [30]:
# Structure summary of ds2: gather the per-column statistics first, then
# assemble them into one frame with one row per column
unique_counts = ds2.nunique().values       # number of unique values
column_dtypes = ds2.dtypes.values          # data types
missing_counts = ds2.isnull().sum().values # number of null or missing values

data_struct2 = pd.DataFrame(data = ds2.columns, columns = ['columns']) # columns
data_struct2['unique_values'] = unique_counts
data_struct2['dtype'] = column_dtypes
data_struct2['nulls'] = missing_counts

In [31]:
# Show the per-column summary (unique values, dtype, nulls) of the exported majors file
data_struct2

Unnamed: 0,columns,unique_values,dtype,nulls
0,Undergraduate Major,50,object,0
1,Starting Median Salary,43,float64,0
2,Mid-Career Median Salary,49,float64,0
3,Percent change from Starting to Mid-Career Salary,48,float64,0
