## Importing packages and csvs

In [1]:
# Importing packages
from urllib.parse import quote
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy import inspect
from ET_mysql_key import ET_key
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Locations of the source CSV files (relative to this notebook)
# Prefix key: WSJ = Wall Street Journal data, CC = Chronicle of Higher Education data
# WSJ: median salary data
WSJ_salaries_college_file = "../Data_Files/college_salaries/salaries-by-college-type.csv"
# CC: college graduation info
CC_institution_details_file = "../Data_Files/college_completion/cc_institution_details.csv"
# CC: college demographic info
CC_institution_grads_file = "../Data_Files/college_completion/cc_institution_grads.csv"

## Extraction

In [3]:
# Extract: read the WSJ salary CSV into a raw dataframe,
# then show its (rows, columns) shape and its first three rows
WSJ_college_raw_df = pd.read_csv(filepath_or_buffer=WSJ_salaries_college_file)
WSJ_college_raw_df.shape
WSJ_college_raw_df.head(n=3)

(269, 8)

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Amherst College,Liberal Arts,"$54,500.00","$107,000.00",,"$84,900.00","$162,000.00",
1,Appalachian State University,State,"$40,400.00","$69,100.00","$37,200.00","$50,400.00","$90,800.00","$115,000.00"
2,Arizona State University,Party,"$47,400.00","$84,100.00","$44,600.00","$60,700.00","$114,000.00","$163,000.00"


In [4]:
# Extract: read the CC graduation CSV into a raw dataframe,
# then show its (rows, columns) shape and its first three rows
CC_institution_details_raw_df = pd.read_csv(filepath_or_buffer=CC_institution_details_file)
CC_institution_details_raw_df.shape
CC_institution_details_raw_df.head(n=3)

(3795, 55)

Unnamed: 0,unitid,chronname,city,state,level,control,basic,hbcu,flagship,long_x,...,vsa_enroll_elsewhere_after6_first,vsa_grad_after4_transfer,vsa_grad_elsewhere_after4_transfer,vsa_enroll_after4_transfer,vsa_enroll_elsewhere_after4_transfer,vsa_grad_after6_transfer,vsa_grad_elsewhere_after6_transfer,vsa_enroll_after6_transfer,vsa_enroll_elsewhere_after6_transfer,counted_pct
0,222178,Abilene Christian University,Abilene,Texas,4-year,Private not-for-profit,Masters Colleges and Universities--larger prog...,,,-99.711594,...,,,,,,,,,,100.0|04
1,138558,Abraham Baldwin Agricultural College,Tifton,Georgia,4-year,Public,Associates--Public 4-year Primarily Associates,,,-83.523399,...,,,,,,,,,,71.3|04
2,172866,Academy College,Minneapolis,Minnesota,4-year,Private for-profit,Baccalaureate and Associates Colleges,,,-93.259385,...,,,,,,,,,,40.7|04


In [5]:
# Extract: read the CC demographic CSV into a raw dataframe,
# then show its (rows, columns) shape and its first three rows
CC_institution_grads_raw_df = pd.read_csv(filepath_or_buffer=CC_institution_grads_file)
CC_institution_grads_raw_df.shape
CC_institution_grads_raw_df.head(n=3)

(970704, 10)

Unnamed: 0,unitid,year,gender,race,cohort,grad_cohort,grad_100,grad_150,grad_100_rate,grad_150_rate
0,101462,2010,B,X,2y all,134.0,15.0,28.0,11.2,20.9
1,101471,2010,B,X,2y all,261.0,59.0,85.0,22.6,32.6
2,101499,2010,B,X,2y all,304.0,50.0,84.0,16.4,27.6


## Cleaning

In [6]:
# WSJ salary data
# Keep only the columns of interest, rename them to snake_case,
# then drop every row that has a missing value in any kept column.
# Two frames come out of this cell and their names differ only by case:
#   WSJ_college_df  -> selected + renamed columns (before dropna)
#   WSJ_college_Df  -> same frame after dropna; this is the one loaded later
# NOTE: the salary columns are not converted here; they keep the text form
# in which they were read (e.g. "$54,500.00").
WSJ_college_cols = ["School Name", "School Type", "Starting Median Salary", "Mid-Career Median Salary"]
WSJ_college_df = (
    WSJ_college_raw_df[WSJ_college_cols]
    .copy()
    .rename(
        columns={
            "School Name": "school_name",
            "School Type": "school_type",
            "Starting Median Salary": "median_salary_start",
            "Mid-Career Median Salary": "median_salary_mid",
        }
    )
)
WSJ_college_Df = WSJ_college_df.dropna(how="any")
WSJ_college_Df.shape
WSJ_college_Df.head(n=3)

(269, 4)

Unnamed: 0,school_name,school_type,median_salary_start,median_salary_mid
0,Amherst College,Liberal Arts,"$54,500.00","$107,000.00"
1,Appalachian State University,State,"$40,400.00","$69,100.00"
2,Arizona State University,Party,"$47,400.00","$84,100.00"


In [7]:
# CC graduation data
# Keep only the columns of interest, drop duplicate rows, rename the id/name
# columns to match the other tables, then drop every row that has a missing
# value in any kept column.
# Two frames come out of this cell and their names differ only by case:
#   CC_institution_details_df  -> selected, de-duplicated, renamed (before dropna)
#   CC_institution_details_Df  -> same frame after dropna; this is the one loaded later
# The index is not reset after dropna, so it keeps the original row labels.
CC_institution_details_cols = [
    "unitid", "chronname", "state", "level", "control", "basic",
    "student_count", "aid_value",
    "grad_100_value", "grad_100_percentile",
    "grad_150_value", "grad_150_percentile",
]
CC_institution_details_df = (
    CC_institution_details_raw_df[CC_institution_details_cols]
    .drop_duplicates()
    .copy()
    .rename(columns={"unitid": "school_id", "chronname": "school_name"})
)
CC_institution_details_Df = CC_institution_details_df.dropna(how="any")
CC_institution_details_Df.shape
CC_institution_details_Df.head(n=3)

(3594, 12)

Unnamed: 0,school_id,school_name,state,level,control,basic,student_count,aid_value,grad_100_value,grad_100_percentile,grad_150_value,grad_150_percentile
0,222178,Abilene Christian University,Texas,4-year,Private not-for-profit,Masters Colleges and Universities--larger prog...,3806,9405.0,37.0,44.0,56.9,53.0
2,172866,Academy College,Minnesota,4-year,Private for-profit,Baccalaureate and Associates Colleges,236,4564.0,100.0,97.0,100.0,96.0
3,108232,Academy of Art University,California,4-year,Private for-profit,Schools of art- music- and design,12181,5342.0,5.6,36.0,29.2,58.0


In [8]:
# CC demographic data
# Keep only the rows for year 2010, reduce to the id and demographic columns,
# drop duplicate rows, rename the id column to match the other tables, then
# drop every row that has a missing value in any kept column.
# NOTE(review): 2010 was chosen as the "most recent" year per the original
# note; that cannot be confirmed from this notebook alone -- verify against the data.
# Frames produced (the last two names differ only by case):
#   CC_institution_grads_calc_df -> raw rows filtered to year 2010
#   CC_institution_grads_df      -> selected, de-duplicated, renamed (before dropna)
#   CC_institution_grads_Df      -> same frame after dropna; this is the one loaded later
is_year_2010 = CC_institution_grads_raw_df["year"] == 2010
CC_institution_grads_calc_df = CC_institution_grads_raw_df.loc[is_year_2010]
CC_institution_grads_cols = ["unitid", "gender", "race", "cohort"]
CC_institution_grads_df = (
    CC_institution_grads_calc_df[CC_institution_grads_cols]
    .drop_duplicates()
    .copy()
    .rename(columns={"unitid": "school_id"})
)
CC_institution_grads_Df = CC_institution_grads_df.dropna(how="any")
CC_institution_grads_Df.shape
CC_institution_grads_Df.head(n=3)

(107856, 4)

Unnamed: 0,school_id,gender,race,cohort
0,101462,B,X,2y all
1,101471,B,X,2y all
2,101499,B,X,2y all


## Loading

In [9]:
# Creating database connection
# ET_key for ET's MySQL access only
# The key is percent-encoded before being placed in the URL so that characters
# such as "@", ":", "/" or "%" in the password are not misread as URL
# delimiters/escapes when the connection string is parsed. Keys made only of
# letters, digits and "_.-~" produce exactly the same URL as before.
connection_string = "root:{0}@localhost/Colleges".format(quote(str(ET_key), safe=""))
engine = create_engine(f"mysql://{connection_string}")

In [10]:
# Confirming tables: list the table names currently present in the database
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector call below returns the same list of names and works on both.
inspect(engine).get_table_names()

['demographics', 'grad_rates', 'salaries']

In [12]:
# Load: write each cleaned dataframe to its own table.
# Every call uses the same options: an existing table of that name is replaced,
# and the dataframe index is written out as a column as well.
sql_load_kwargs = {"con": engine, "if_exists": "replace", "index": True}
WSJ_college_Df.to_sql(name="salaries", **sql_load_kwargs)
CC_institution_details_Df.to_sql(name="grad_rates", **sql_load_kwargs)
CC_institution_grads_Df.to_sql(name="demographics", **sql_load_kwargs)

In [13]:
# Confirming that tables loaded successfully: read back three rows per table
# The table names are written exactly as they were created by to_sql
# (all lowercase). MySQL table names can be case-sensitive depending on the
# host OS / lower_case_table_names setting, so a differently-cased name may
# fail with "table doesn't exist" on some servers.
pd.read_sql("select * from salaries limit 3", con=engine)
pd.read_sql("select * from grad_rates limit 3", con=engine)
pd.read_sql("select * from demographics limit 3", con=engine)

Unnamed: 0,index,school_name,school_type,median_salary_start,median_salary_mid
0,0,Amherst College,Liberal Arts,"$54,500.00","$107,000.00"
1,1,Appalachian State University,State,"$40,400.00","$69,100.00"
2,2,Arizona State University,Party,"$47,400.00","$84,100.00"


Unnamed: 0,index,school_id,school_name,state,level,control,basic,student_count,aid_value,grad_100_value,grad_100_percentile,grad_150_value,grad_150_percentile
0,0,222178,Abilene Christian University,Texas,4-year,Private not-for-profit,Masters Colleges and Universities--larger prog...,3806,9405.0,37.0,44.0,56.9,53.0
1,2,172866,Academy College,Minnesota,4-year,Private for-profit,Baccalaureate and Associates Colleges,236,4564.0,100.0,97.0,100.0,96.0
2,3,108232,Academy of Art University,California,4-year,Private for-profit,Schools of art- music- and design,12181,5342.0,5.6,36.0,29.2,58.0


Unnamed: 0,index,school_id,gender,race,cohort
0,0,101462,B,X,2y all
1,1,101471,B,X,2y all
2,2,101499,B,X,2y all
