In [1]:
import camelot 
import numpy as np
import pandas as pd

In [33]:
# Extract every table camelot detects on page 1 of the PDF
pdf = camelot.read_pdf(
    '../csu_enrollment_by_ethnic_origin_13_17.pdf',
    pages='1',
)


# Build the working frame from the first extracted table:
#   1. discard the rows labelled 0 and 1,
#   2. renumber the remaining rows starting from zero,
#   3. give the positional columns 0-3 string names,
#   4. split every cell on '\n', so each cell becomes a list of strings
df = pd.DataFrame(pdf[0].df).drop([0, 1])
df.index = np.arange(0, df.shape[0])
df = df.rename(columns={0: 'zero', 1: 'one', 2: 'two', 3: 'three'})
df = df.apply(lambda column: column.str.split('\n'))


# Pull each named column of 'df' out as its own series
df_zero_raw, df_one_raw, df_two_raw, df_three_raw = (
    df[label] for label in ('zero', 'one', 'two', 'three')
)


# Partitioning 'zero' column into
# 'Undergraduate', 'Graduate', 'Combined' lists
df_zero_raw_undergraduate = df_zero_raw[0]
df_zero_raw_graduate = df_zero_raw[1]
df_zero_raw_combined = df_zero_raw[2]


def _normalize_label(label):
    """Return `label` lowercased, with spaces and hyphens turned into underscores."""
    return label.lower().replace(' ', '_').replace('-', '_')


# Removing the partition name (the first entry) from each list and
# normalizing every remaining label.
#
# The [1:] slice creates a new list, so the raw lists held inside 'df' are
# not modified; deleting through an alias of the raw list would change the
# data stored in the dataframe as well. The comprehension walks over all
# labels that are present instead of assuming there are exactly ten.

# Undergraduate
df_zero_undergraduate = [
    _normalize_label(label) for label in df_zero_raw_undergraduate[1:]
]

# Graduate
df_zero_graduate = [
    _normalize_label(label) for label in df_zero_raw_graduate[1:]
]

# Combined
df_zero_combined = [
    _normalize_label(label) for label in df_zero_raw_combined[1:]
]
    
    
# Splitting the 'one' column by year for each of the
# 'Undergraduate' (row 0), 'Graduate' (row 1), 'Combined' (row 2) lists.
# Within the first 50 entries of a row, every fifth entry starting at
# offset 0, 1, 2, 3, 4 goes to the '13, '14, '15, '16, '17 list respectively.

# Undergraduate by year
(df_one_undergraduate_13,
 df_one_undergraduate_14,
 df_one_undergraduate_15,
 df_one_undergraduate_16,
 df_one_undergraduate_17) = (
    df_one_raw.loc[0][year_offset:50:5] for year_offset in range(5)
)

# Graduate by year
(df_one_graduate_13,
 df_one_graduate_14,
 df_one_graduate_15,
 df_one_graduate_16,
 df_one_graduate_17) = (
    df_one_raw.loc[1][year_offset:50:5] for year_offset in range(5)
)

# Combined by year
(df_one_combined_13,
 df_one_combined_14,
 df_one_combined_15,
 df_one_combined_16,
 df_one_combined_17) = (
    df_one_raw.loc[2][year_offset:50:5] for year_offset in range(5)
)


# Manual patch: position 5 of the 'Combined' 2013 list is overwritten
# with '19' (the extracted value there is 'NA')
df_one_combined_13[5] = '19'


# Splitting the 'two' column: the first ten entries of rows 0, 1, 2 become
# the 'Undergraduate', 'Graduate', 'Combined' lists
df_two_undergraduate, df_two_graduate, df_two_combined = (
    df_two_raw.loc[row_label][0:10] for row_label in range(3)
)


# Manual patch: the empty string at position 5 of 'Combined' becomes '-7'
df_two_combined[5] = '-7'


# Splitting the 'three' column the same way as the 'two' column
df_three_undergraduate, df_three_graduate, df_three_combined = (
    df_three_raw.loc[row_label][0:10] for row_label in range(3)
)


# Manual patches for the empty strings in the 'three' lists:
# position 8 of 'Undergraduate' and 'Graduate' becomes 0.0 ...
for percent_changes in (df_three_undergraduate, df_three_graduate):
    percent_changes[8] = 0.0

# ... and positions 5 and 8 of 'Combined' become '-36.8' and 0.0
df_three_combined[5] = '-36.8'
df_three_combined[8] = 0.0


# Assembling the three result tables

# Header shared by all three tables
column_names = [
    'student_ethnic_origin',
    'fall_2013', 'fall_2014', 'fall_2015', 'fall_2016', 'fall_2017',
    '4_year_#_change',
    '4_year_%_change',
]


def _frame_from_columns(column_lists):
    """Turn parallel column lists into rows and wrap them in a DataFrame.

    zip() stops at the shortest list, so the frame has as many rows as the
    shortest entry of `column_lists`. Columns are labelled with the
    notebook-level `column_names`.
    """
    rows = list(zip(*column_lists))
    return pd.DataFrame(rows, columns=column_names)


# Undergraduate: column lists in header order, then the table
undergraduate_list_of_lists = [
    df_zero_undergraduate,
    df_one_undergraduate_13, df_one_undergraduate_14, df_one_undergraduate_15,
    df_one_undergraduate_16, df_one_undergraduate_17,
    df_two_undergraduate,
    df_three_undergraduate,
]
df_undergraduate = _frame_from_columns(undergraduate_list_of_lists)


# Graduate: column lists in header order, then the table
graduate_list_of_lists = [
    df_zero_graduate,
    df_one_graduate_13, df_one_graduate_14, df_one_graduate_15,
    df_one_graduate_16, df_one_graduate_17,
    df_two_graduate,
    df_three_graduate,
]
df_graduate = _frame_from_columns(graduate_list_of_lists)


# Combined: column lists in header order, then the table
combined_list_of_lists = [
    df_zero_combined,
    df_one_combined_13, df_one_combined_14, df_one_combined_15,
    df_one_combined_16, df_one_combined_17,
    df_two_combined,
    df_three_combined,
]
df_combined = _frame_from_columns(combined_list_of_lists)


# Converting the text cells of the 'Undergraduate', 'Graduate', and
# 'Combined' tables to numeric dtypes.
#
# Why the columns are assigned by label (frame[col] = ...):
#   label assignment replaces the whole column, so the converted dtype is
#   kept. Writing through .iloc[0:10, i] instead sets the values into the
#   existing object-dtype column under pandas >= 2.0, which leaves the
#   column dtype as object.
# Why the patterns are plain '%' and ',':
#   '\%' and '\,' are invalid escape sequences in a normal string literal
#   and trigger a warning on current Python versions; neither character is
#   special in a regular expression, so no escaping is needed.
# Why regex=True stays:
#   without it, Series.replace only matches cells that are equal to the
#   pattern as a whole, not cells that merely contain it.
for frame in (df_undergraduate, df_graduate, df_combined):
    # '4_year_%_change': strip the percent sign, then cast to float
    frame['4_year_%_change'] = (
        frame['4_year_%_change'].replace('%', '', regex=True).astype(float)
    )

    # 'fall_2013', 'fall_2014', 'fall_2015', 'fall_2016', 'fall_2017' and
    # '4_year_#_change' (column positions 1-6): strip the thousands
    # separators, then cast to integer
    for count_column in list(frame.columns[1:7]):
        frame[count_column] = (
            frame[count_column].replace(',', '', regex=True).astype(int)
        )
    
    
# Last step: give the 'student_ethnic_origin' column its final labels
# and put the rows of each table into their final order
new_origin_names = [
    'american_indian',
    'asian',
    'african_american',
    'hispanic',
    'international_students',
    'native_hawaiian_or_pacific_islander',
    'multi_racial',
    'white',
    'race_and_ethnicity_unknown',
    'total',
]


def _relabel_and_reorder(table):
    """Relabel the origin column of `table` in place and return it reordered.

    The current labels are listed and handed to Series.replace together with
    `new_origin_names`, so each existing label value is swapped for the name
    at the same list position. The returned frame is `table` reindexed to
    the row-label order 4, 1, 2, 3, 0, 6, 7, 5, 8, 9.
    """
    current_labels = table['student_ethnic_origin']
    table['student_ethnic_origin'] = current_labels.replace(
        list(current_labels), new_origin_names
    )
    return table.reindex([4, 1, 2, 3, 0, 6, 7, 5, 8, 9])


# Undergraduate
df_undergraduate = _relabel_and_reorder(df_undergraduate)

# Graduate
df_graduate = _relabel_and_reorder(df_graduate)

# Combined
df_combined = _relabel_and_reorder(df_combined)

Unnamed: 0,student_ethnic_origin,fall_2013,fall_2014,fall_2015,fall_2016,fall_2017,4_year_#_change,4_year_%_change
4,international_students,110,112,119,142,139,29,26.4
1,asian,175,177,182,189,202,27,15.4
2,african_american,2849,2924,3027,3121,3167,318,11.2
3,hispanic,396,431,443,466,483,87,22.0
0,american_indian,40,34,29,33,32,-8,-20.0
6,multi_racial,219,186,177,154,156,-63,-28.8
7,white,4356,4310,4445,4290,4261,-95,-2.2
5,native_hawaiian_or_pacific_islander,19,20,18,12,12,-7,-36.8
8,race_and_ethnicity_unknown,0,0,0,0,0,0,0.0
9,total,8164,8194,8440,8407,8452,288,3.5
