
# IPEDS Cohort Clustering Model


In [None]:
# Author: Matthew Fikes
# Modified: 3/16/22

import pandas as pd
import numpy as np
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from matplotlib import pyplot as plt
import warnings
from sklearn import preprocessing

This code is set up to find similar IPEDS schools based on a given UNITID. The code will obtain values from IPEDS directly and filter by some of the target school characteristics. Some of the filters are set up to limit data to only 2-year public schools. These are noted with comments in the code and can be altered to find other groups.

The code is not entirely automated: the number of clusters should be modified based on the results of the generated dendrograms. Information on reading dendrograms is included where the first chart is generated.

This example resulted in exactly 10 schools after some filtering. Your results may vary and you may wish to keep a larger initial cluster and filter it manually by looking at programs offered at the different comparison institutions. 


## Static Variables

In [None]:
# UNITID of the school to build a comparison cohort for.
# Re-prompt instead of crashing when the entry is not a whole number.
while True:
    try:
        local_id = int(input(r'Enter your school UNITID from IPEDS (ex. 193283): '))
        break
    except ValueError:
        print('UNITID must be a whole number.')

# UNITIDs to exclude before clustering. These can be identified during the
# initial cluster analysis and added here by hand. This could probably be
# automated if the first clustering pass removed the smallest cluster before
# running again.
outlier_ids = []


## Functions

In [None]:
def getDirectoryData():
    '''Gets Directory Data from IPEDS, only returning Public 2-year schools.

    Downloads the HD2020 zip file, reads the first member as a cp1252-encoded
    CSV and keeps a fixed set of identification / institutional
    characteristic columns.

    Returns:
        A Pandas dataframe restricted to rows with SECTOR == 4, or None
        (after printing a message) when the zip file cannot be downloaded.
    '''
    # this goes to the current directory and can be updated on the IPEDS datacenter
    url = 'https://nces.ed.gov/ipeds/datacenter/data/HD2020.zip'

    # limit to a few fields about identification and institutional characteristics for filtering later
    dir_fields = ['UNITID','INSTNM','IALIAS','ADDR','CITY','STABBR','ZIP','OBEREG','OPEID','SECTOR','CONTROL',
                  'HLOFFER','LOCALE','INSTCAT','C18BASIC','INSTSIZE','CBSA','CBSATYPE','CSA','COUNTYNM','LONGITUD','LATITUDE']

    # URLError/HTTPError are OSError subclasses, so only network-type
    # failures are reported as "not found"; programming errors still surface.
    try:
        with urlopen(url) as response:
            payload = response.read()
    except OSError:
        print("Zip file not found.")
        return None

    with ZipFile(BytesIO(payload)) as archive:
        with archive.open(archive.namelist()[0]) as member:
            data = pd.read_csv(member, encoding='cp1252')

    # Sector filters results to 2-year schools; copy so callers own the result
    return data.loc[data['SECTOR'] == 4, dir_fields].copy()
   
    

In [None]:
def getFallEnrollment(year):
    '''Pulls Fall Enrollment Data from IPEDS Fall Enrollment File A:
    Race/ethnicity, gender, attendance status, and level of student.

    Args:
        year: 4-digit year used to build the EF{year}A.zip file name.

    Returns:
        A Pandas dataframe with columns UNITID and FallEnrollment (the
        EFTOTLT value of rows where EFALEVEL == 2), or None (after printing
        a message) when the zip file cannot be downloaded.
    '''
    url = 'https://nces.ed.gov/ipeds/datacenter/data/EF{0}A.zip'.format(year)
    # URLError/HTTPError are OSError subclasses; other errors still surface.
    try:
        with urlopen(url) as response:
            payload = response.read()
    except OSError:
        print("Zip file for year {0} not found.".format(year))
        return None

    with ZipFile(BytesIO(payload)) as archive:
        members = archive.namelist()
        chosen = members[0]
        # More than one member is treated as "a revision exists"; the last
        # member is used when the user opts for the revised data.
        if len(members) > 1:
            user_choice = input("Revision found for {0}. Use revised? Y/N: ".format(year))
            if user_choice in ['Y', 'y']:
                chosen = members[-1]
        with archive.open(chosen) as member:
            data = pd.read_csv(member)

    # this filters the Fall Enrollment data to "All students, Undergraduate Total"
    fall_data = data.loc[data['EFALEVEL'] == 2, ['UNITID', 'EFTOTLT']]
    # rename() without inplace returns a new frame, avoiding a write to a slice
    return fall_data.rename(columns={'EFTOTLT': 'FallEnrollment'})
   
    

In [None]:
def get12MEnrollment(year):
    '''Pulls 12-Month Enrollment Data from the IPEDS EFFY file:
    12-month unduplicated headcount by race/ethnicity, gender and level of student.

    Args:
        year: 4-digit year used to build the EFFY{year}.zip file name.

    Returns:
        A Pandas dataframe with columns UNITID and LatestFTE (the EFYTOTLT
        value of rows where EFFYALEV == 2), or None (after printing a
        message) when the zip file cannot be downloaded.
    '''
    url = 'https://nces.ed.gov/ipeds/datacenter/data/EFFY{0}.zip'.format(year)
    # URLError/HTTPError are OSError subclasses; other errors still surface.
    try:
        with urlopen(url) as response:
            payload = response.read()
    except OSError:
        print("Zip file for year {0} not found.".format(year))
        return None

    with ZipFile(BytesIO(payload)) as archive:
        members = archive.namelist()
        chosen = members[0]
        # More than one member is treated as "a revision exists"; the last
        # member is used when the user opts for the revised data.
        if len(members) > 1:
            user_choice = input("Revision found for {0}. Use revised? Y/N: ".format(year))
            if user_choice in ['Y', 'y']:
                chosen = members[-1]
        with archive.open(chosen) as member:
            data = pd.read_csv(member)

    # limits results to All students, undergraduate total
    headcount = data.loc[data['EFFYALEV'] == 2, ['UNITID', 'EFYTOTLT']]
    # NOTE(review): the output column is called LatestFTE but, per the file
    # description above, holds an unduplicated headcount - name kept because
    # later cells reference it.
    return headcount.rename(columns={'EFYTOTLT': 'LatestFTE'})
   
    

In [None]:
def getGradRates(year):
    '''Pulls graduation-rate survey data from the IPEDS GR file.

    Args:
        year: 4-digit year used to build the GR{year}.zip file name.

    Returns:
        A Pandas dataframe with columns UNITID and GradRate (the GRTOTLT
        value of rows where COHORT == 4 and GRTYPE == 29), or None (after
        printing a message) when the zip file cannot be downloaded.
    '''
    url = 'https://nces.ed.gov/ipeds/datacenter/data/GR{0}.zip'.format(year)
    # URLError/HTTPError are OSError subclasses; other errors still surface.
    try:
        with urlopen(url) as response:
            payload = response.read()
    except OSError:
        print("Zip file for year {0} not found.".format(year))
        return None

    with ZipFile(BytesIO(payload)) as archive:
        members = archive.namelist()
        chosen = members[0]
        # More than one member is treated as "a revision exists"; the last
        # member is used when the user opts for the revised data.
        if len(members) > 1:
            user_choice = input("Revision found for {0}. Use revised? Y/N: ".format(year))
            if user_choice in ['Y', 'y']:
                chosen = members[-1]
        with archive.open(chosen) as member:
            data = pd.read_csv(member)

    # COHORT 4 filters to Degree/certif-seeking students 2017 cohort ( 2-yr institution)
    # GRTYPE 29 filters to  Degree/certif-seeking students ( 2-yr institution) Adjusted cohort (revised cohort minus exclusions)
    # NOTE(review): per the GRTYPE description above, GRTOTLT here appears to
    # be the adjusted cohort count rather than a rate - confirm intent. The
    # GradRate column name is kept because later cells reference it.
    selected = (data['COHORT'] == 4) & (data['GRTYPE'] == 29)
    grad_data = data.loc[selected, ['UNITID', 'GRTOTLT']]
    return grad_data.rename(columns={'GRTOTLT': 'GradRate'})
   
    

In [None]:
def getFinance(year):
    '''Pulls finance data from the IPEDS F{year}_F1A file.

    Args:
        year: year code exactly as it appears in the file name
            (the notebook passes 1920).

    Returns:
        A Pandas dataframe with columns UNITID, TotOpRevenue, TotalExpend,
        TuitFeePct and StAppPct. Rows where any selected field is 0 or
        missing are dropped. Returns None (after printing a message) when
        the zip file cannot be downloaded.
    '''
    url = 'https://nces.ed.gov/ipeds/datacenter/data/F{0}_F1A.zip'.format(year)
    # URLError/HTTPError are OSError subclasses; other errors still surface.
    try:
        with urlopen(url) as response:
            payload = response.read()
    except OSError:
        print("Zip file for year {0} not found.".format(year))
        return None

    with ZipFile(BytesIO(payload)) as archive:
        members = archive.namelist()
        chosen = members[0]
        # More than one member is treated as "a revision exists"; the last
        # member is used when the user opts for the revised data.
        if len(members) > 1:
            user_choice = input("Revision found for {0}. Use revised? Y/N: ".format(year))
            if user_choice in ['Y', 'y']:
                chosen = members[-1]
        with archive.open(chosen) as member:
            data = pd.read_csv(member)

    # F1B09: Total Operating Revenue
    # F1B01: Tuition/Fees (divided by F1B09 below to get a share of operating revenue)
    # F1B11: State Appropriations (divided by F1B09 below to get a share of operating revenue)
    # F1N07: Total Expenditures
    # NOTE(review): the trailing spaces in 'F1N07   ' are kept exactly as
    # written; presumably they match the raw CSV header - verify.
    fin_data = data[['UNITID','F1B09','F1B11','F1B01','F1N07   ']]
    # Treat zeros as missing and drop incomplete rows; non-inplace calls
    # return new frames, so nothing writes to a slice of `data`.
    fin_data = fin_data.replace(0, np.nan).dropna(axis=0, how='any')
    fin_data = fin_data.assign(
        TuitFeePct=fin_data['F1B01'] / fin_data['F1B09'],
        StAppPct=fin_data['F1B11'] / fin_data['F1B09'],
    )
    fin_data = fin_data.rename(columns={'F1B09':'TotOpRevenue','F1N07   ':'TotalExpend'})
    return fin_data.drop(['F1B11','F1B01'], axis=1)
   
    

In [None]:
def getCosts(year):
    '''Pulls tuition and fee data from the IPEDS IC{year}_AY file.

    Args:
        year: 4-digit year used to build the IC{year}_AY.zip file name.

    Returns:
        A Pandas dataframe with columns UNITID, In-State (TUITION2 + FEE2)
        and Out-of-State (TUITION3 + FEE3). Rows where any of those fields
        is '.' or missing are dropped. Returns None (after printing a
        message) when the zip file cannot be downloaded.
    '''
    url = 'https://nces.ed.gov/ipeds/datacenter/data/IC{0}_AY.zip'.format(year)
    # URLError/HTTPError are OSError subclasses; other errors still surface.
    try:
        with urlopen(url) as response:
            payload = response.read()
    except OSError:
        print("Zip file for year {0} not found.".format(year))
        return None

    with ZipFile(BytesIO(payload)) as archive:
        members = archive.namelist()
        chosen = members[0]
        # More than one member is treated as "a revision exists"; the last
        # member is used when the user opts for the revised data.
        if len(members) > 1:
            user_choice = input("Revision found for {0}. Use revised? Y/N: ".format(year))
            if user_choice in ['Y', 'y']:
                chosen = members[-1]
        with archive.open(chosen) as member:
            data = pd.read_csv(member)

    # limits data to in-state tuition and fees and out-of-state tuition and fees
    cost_data = data[['UNITID','TUITION2','FEE2','TUITION3','FEE3']]
    # '.' is treated as a missing value; non-inplace calls return new frames,
    # so nothing writes to a slice of `data`.
    cost_data = cost_data.replace('.', np.nan).dropna(axis=0, how='any')
    in_state = cost_data['TUITION2'].astype(int) + cost_data['FEE2'].astype(int)
    out_of_state = cost_data['TUITION3'].astype(int) + cost_data['FEE3'].astype(int)

    return pd.DataFrame({
        'UNITID': cost_data['UNITID'],
        'In-State': in_state,
        'Out-of-State': out_of_state,
    })
    
    

In [None]:
def getCompletions(year):
    '''Pulls Completion Data from IPEDS Completions File C:
    Number of students receiving awards/degrees,
    by award level and by gender, race/ethnicity and age categories.

    Args:
        year: 4-digit year used to build the C{year}_C.zip file name.

    Returns:
        A Pandas dataframe with one row per UNITID and a Completions column
        (sum of CSTOTLT over AWLEVELC values 2, 3, 11 and 12), or None
        (after printing a message) when the zip file cannot be downloaded.
    '''
    url = 'https://nces.ed.gov/ipeds/datacenter/data/C{0}_C.zip'.format(year)

    # URLError/HTTPError are OSError subclasses; other errors still surface.
    try:
        with urlopen(url) as response:
            payload = response.read()
    except OSError:
        print("Zip file for year {0} not found.".format(year))
        return None

    # Silence warnings only while this function parses the file, instead of
    # leaving a process-wide "ignore" filter installed after it returns.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with ZipFile(BytesIO(payload)) as archive:
            members = archive.namelist()
            chosen = members[0]
            # More than one member is treated as "a revision exists"; the
            # last member is used when the user opts for the revised data.
            if len(members) > 1:
                user_choice = input("Revision found for {0}. Use revised? Y/N: ".format(year))
                if user_choice in ['Y', 'y']:
                    chosen = members[-1]
            with archive.open(chosen) as member:
                data = pd.read_csv(member)

        # limits to associates degrees and certificates
        awards = data.loc[data['AWLEVELC'].isin([2,3,11,12]), ['UNITID','CSTOTLT']]
        comp_data = awards.groupby('UNITID').sum().reset_index()
        return comp_data.rename(columns={'CSTOTLT':'Completions'})
   
    

In [None]:
# Load every IPEDS extract used by the model. Warnings are silenced while the
# loaders run and reset to the "default" action afterwards, even if a loader
# raises.
print('Loading data, this may take a moment.')
warnings.filterwarnings("ignore")
try:
    directory_df = getDirectoryData()
    fall_df = getFallEnrollment(2020)
    fte_df = get12MEnrollment(2020)
    grad_df = getGradRates(2020)
    fin_df = getFinance(1920)
    cost_df = getCosts(2020)
    comp_df = getCompletions(2020)
finally:
    warnings.filterwarnings("default")

# Each loader prints a message and returns None when its zip file cannot be
# downloaded; stop here with a clear error instead of failing later in the merge.
loaded_frames = {
    'directory': directory_df,
    'fall enrollment': fall_df,
    '12-month enrollment': fte_df,
    'graduation rates': grad_df,
    'finance': fin_df,
    'costs': cost_df,
    'completions': comp_df,
}
missing_frames = [name for name, frame in loaded_frames.items() if frame is None]
if missing_frames:
    raise RuntimeError('Could not load IPEDS data for: ' + ', '.join(missing_frames))
print('Load complete.')

In [None]:
# Look up the target school's Carnegie classification and size category.
# Keep prompting until the UNITID exists in the directory data, so both
# local_c18basic and local_instsize are always defined for the cells below.
while True:
    local_rows = directory_df[directory_df['UNITID'] == local_id]
    if not local_rows.empty:
        break
    print('UNITID not found.')
    try:
        local_id = int(input(r'Enter your school UNITID from IPEDS (ex. 193283): '))
    except ValueError:
        # Non-numeric entry: loop again and re-prompt.
        continue

local_c18basic = local_rows['C18BASIC'].values[0]
local_instsize = local_rows['INSTSIZE'].values[0]

This filters the initial cluster group to institutions that share the local institution's Carnegie Classification 2018: Basic (C18BASIC) value.

In [None]:
# Keep only institutions whose C18BASIC value equals the local institution's.
same_carnegie = directory_df['C18BASIC'] == local_c18basic
directory_df = directory_df.loc[same_carnegie]

In [None]:
# Left-join every metric table onto the filtered directory UNITIDs, one at a
# time, then discard institutions missing any metric.
m1 = directory_df[['UNITID']].merge(fall_df, how='left', on='UNITID')
m2 = m1.merge(fte_df, how='left', on='UNITID')
m3 = m2.merge(grad_df, how='left', on='UNITID')
m4 = m3.merge(fin_df, how='left', on='UNITID')
m5 = m4.merge(cost_df, how='left', on='UNITID')
merged_df = m5.merge(comp_df, how='left', on='UNITID').dropna(axis=0, how='any')

In [None]:
# Exclude the manually listed outlier institutions.
is_outlier = merged_df['UNITID'].isin(outlier_ids)
new_df = merged_df[~is_outlier]

In [None]:
# Numeric columns that are normalized and fed to the clustering model.
val_fields = [
    'FallEnrollment',
    'LatestFTE',
    'GradRate',
    'TotOpRevenue',
    'TotalExpend',
    'TuitFeePct',
    'StAppPct',
    'In-State',
    'Out-of-State',
    'Completions',
]

In [None]:
# UNITIDs are kept aside as labels; each value column is normalized
# (axis=0, i.e. per feature) before clustering.
labels = new_df['UNITID']
d = pd.DataFrame(
    preprocessing.normalize(new_df[val_fields], axis=0),
    columns=val_fields,
)
# Re-index both pieces from 0 so they line up row by row.
scaled_df = pd.concat(
    [labels.reset_index(drop=True), d.reset_index(drop=True)],
    axis=1,
)


## Run Clustering Model

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as shc

In [None]:
# Project the normalized features onto two principal components.
pca = PCA(n_components=2)
X_fit = pca.fit_transform(d)
X_principal = pd.DataFrame(X_fit, columns=['P1', 'P2'])

In [None]:

# Ward-linkage dendrogram of the two principal components.
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
Dendrogram = shc.dendrogram(
    shc.linkage(X_principal, method='ward')
)

Use the dendrogram above to find the ideal number of clusters.
For instructions on interpreting a dendrogram, check here: 

https://www.displayr.com/what-is-dendrogram/

In [None]:
# set n_clusters to value from dendrogram above (or custom value for testing)
ac2 = AgglomerativeClustering(n_clusters=2)

# Scatter plot of the principal components, colored by assigned cluster.
plt.figure(figsize=(6, 6))
plt.scatter(
    X_principal['P1'],
    X_principal['P2'],
    c=ac2.fit_predict(X_principal),
    cmap='rainbow',
)
plt.show()

In [None]:
# Pair each UNITID with the cluster label assigned by ac2.
clusters = pd.concat(
    [labels.reset_index(drop=True), pd.Series(ac2.labels_, name='Cluster')],
    axis=1,
)

In [None]:
# Attach the cluster labels to the metric data and find the target school's cluster.
data_w_clusters = clusters.merge(new_df, on='UNITID')
local_rows = data_w_clusters[data_w_clusters['UNITID'] == local_id]
# The target school is absent when it was dropped for missing metrics or
# listed in outlier_ids; report that explicitly rather than with a bare
# "index 0 is out of bounds".
if local_rows.empty:
    raise IndexError(
        'UNITID {0} is not in the clustered data (missing metric values '
        'or listed in outlier_ids).'.format(local_id)
    )
local_cluster = local_rows['Cluster'].values[0]
print('Target school is in cluster #',local_cluster)

In [None]:
# Keep only the institutions that share the target school's cluster.
in_local_cluster = data_w_clusters['Cluster'] == local_cluster
new_group = data_w_clusters[in_local_cluster]

#### Summary statistics for initial cluster

In [None]:
# Display summary statistics for the target school's initial cluster.
new_group.describe()

#### Cluster distribution

In [None]:

# Display the number of institutions assigned to each cluster.
clusters.groupby('Cluster').count()

In [None]:
# Repeat the labeling and per-feature normalization on the smaller group.
labels2 = new_group['UNITID']
d2 = pd.DataFrame(
    preprocessing.normalize(new_group[val_fields], axis=0),
    columns=val_fields,
)
# Re-index both pieces from 0 so they line up row by row.
scaled_df2 = pd.concat(
    [labels2.reset_index(drop=True), d2.reset_index(drop=True)],
    axis=1,
)


## Run Clustering Model again on smaller set

In [None]:
# Fit a separate PCA on the smaller group. The model created here (pca2) is
# the one that is fitted, so the first-stage `pca` keeps its original fit.
pca2 = PCA(n_components=2)
X_fit2 = pca2.fit_transform(d2)
X_principal2 = pd.DataFrame(X_fit2)
X_principal2.columns = ['P1', 'P2']

In [None]:

# Ward-linkage dendrogram for the smaller group's principal components.
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
Dendrogram = shc.dendrogram(
    shc.linkage(X_principal2, method='ward')
)

In [None]:
# use results from dendrogram above to get n_clusters
ac3 = AgglomerativeClustering(n_clusters=3)

# Scatter plot of the second-stage components, colored by assigned cluster.
plt.figure(figsize=(6, 6))
plt.scatter(
    X_principal2['P1'],
    X_principal2['P2'],
    c=ac3.fit_predict(X_principal2),
    cmap='rainbow',
)
plt.show()

In [None]:
# Pair each UNITID with its second-stage cluster label, then attach the
# first-stage data for those institutions.
clusters2 = pd.concat(
    [labels2.reset_index(drop=True), pd.Series(ac3.labels_, name='NewCluster')],
    axis=1,
)
data_clusters = clusters2.merge(new_group, on='UNITID')

In [None]:
# Second-stage cluster label of the target school.
is_local_school = data_clusters['UNITID'] == local_id
new_mv_cluster = data_clusters.loc[is_local_school, 'NewCluster'].values[0]

In [None]:
# Institutions sharing the target school's second-stage cluster.
in_new_cluster = data_clusters['NewCluster'] == new_mv_cluster
new_mv_group = data_clusters[in_new_cluster]

#### Display averages by cluster

In [None]:
# Display the mean of each clustering field within every second-stage cluster.
data_clusters.groupby('NewCluster')[val_fields].mean()

#### Show Summary statistics for cluster

In [None]:
# Display summary statistics for the target school's second-stage cluster.
new_mv_group.describe()

In [None]:
# Directory records for the institutions in the target school's cluster.
cluster_unitids = new_mv_group[['UNITID']]
group_data = cluster_unitids.merge(directory_df, on='UNITID')

In [None]:
# Metric data joined with directory data, without the two cluster-label columns.
full_data = (
    new_mv_group
    .merge(directory_df, on='UNITID')
    .drop(columns=['Cluster', 'NewCluster'])
)

## Final custom filtering to limit institution size and locale information to local institution record
'LOCALE' variable refers to census designations for the level of urbanization in the area:

11 = City: Large: Territory inside an urbanized area and inside a principal city with population of 250,000 or more. 

12 = City: Midsize: Territory inside an urbanized area and inside a principal city with population less than 250,000 and greater than or equal to 100,000.

13 = City: Small: Territory inside an urbanized area and inside a principal city with population less than 100,000.

21 = Suburb: Large: Territory outside a principal city and inside an urbanized area with population of 250,000 or more.

22 = Suburb: Midsize: Territory outside a principal city and inside an urbanized area with population less than 250,000 and greater than or equal to 100,000.

23 = Suburb: Small: Territory outside a principal city and inside an urbanized area with population less than 100,000.

31 = Town: Fringe: Territory inside an urban cluster that is less than or equal to 10 miles from an urbanized area.

32 = Town: Distant: Territory inside an urban cluster that is more than 10 miles and less than or equal to 35 miles from an urbanized area.

33 = Town: Remote: Territory inside an urban cluster that is more than 35 miles from an urbanized area.

41 = Rural: Fringe: Census-defined rural territory that is less than or equal to 5 miles from an urbanized area, as well as rural territory that is less than or equal to 2.5 miles from an urban cluster. 

42 = Rural: Distant: Census-defined rural territory that is more than 5 miles but less than or equal to 25 miles from an urbanized area, as well as rural territory that is more than 2.5 miles but less than or equal to 10 miles from an urban cluster. 

43 = Rural: Remote: Census-defined rural territory that is more than 25 miles from an urbanized area and is also more than 10 miles from an urban cluster.

In [None]:
# Keep institutions that match the local institution's size category OR whose
# LOCALE code is one of 13, 21 or 22 (hard-coded; edit for your own school).
# NOTE(review): the two conditions are joined with `|`. The following cell
# filters on the same LOCALE list again, so the INSTSIZE match has no effect
# on final_cohort. If both filters are meant to apply together, this should
# presumably be `&` - confirm intent before changing.
cohort = group_data[((group_data['INSTSIZE']==local_instsize))|(group_data['LOCALE'].isin([13,21,22]))]

#### Output list of UNITIDs in new cluster

In [None]:
# Restrict the cohort to the selected LOCALE codes, then list its UNITIDs.
in_selected_locale = cohort['LOCALE'].isin([13, 21, 22])
final_cohort = cohort[in_selected_locale]
final_cohort['UNITID'].unique().tolist()

#### Output list of schools in new cluster

In [None]:
# Display the distinct institution names in the final cohort.
final_cohort['INSTNM'].unique().tolist()

### Export full data for new cluster to Excel

In [None]:
# Write the full metric + directory data for the final cohort to Excel.
# `index` is documented as a bool, so pass False explicitly to omit the row index.
export_df = full_data.merge(final_cohort[['UNITID']], on='UNITID', how='inner')
export_df.to_excel('New Cohort.xlsx', index=False)