# Setting up

In [1]:
# Dependencies 
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import sem
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8' and
# later releases dropped the old 'seaborn' alias, so prefer the new name
# when this Matplotlib provides it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Hide warning messages in notebook
# import warnings
# warnings.filterwarnings('ignore')

# Importing 4 csv files and merging them into one

In [None]:
# Load one demographics extract per school year (2016-2019).
# Each file follows the same "<year>_demo_data.csv" naming pattern.
demo_2016, demo_2017, demo_2018, demo_2019 = (
    pd.read_csv(f"assets/data/{year}_demo_data.csv")
    for year in (2016, 2017, 2018, 2019)
)

In [None]:
# Stack the four yearly extracts into a single frame with a fresh 0..n-1 index.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; concat behaves the same on older versions.
final_df = pd.concat(
    [demo_2016, demo_2017, demo_2018, demo_2019],
    ignore_index=True,
)
final_df

In [None]:
# Export the merged dataframe (run this only once, then leave it commented out)
# final_df.to_csv("assets/data/final_demo_data.csv", index=False)

# Importing the final csv file

In [2]:
# Load the merged demographics file and preview its first five rows.
final_demo = pd.read_csv(filepath_or_buffer="assets/data/final_demo_data.csv")
final_demo.head(n=5)

Unnamed: 0,student_id,year,address,city,state,zipcode,ethnicity
0,112140070,2016,3901 Kent Way,So San Francisco,CA,94080,Hispanic/Latino
1,113040117,2016,540 Serra Dr,So San Francisco,CA,94080,Asian Indian
2,113090072,2016,935 Gellert Blvd,Daly City,CA,94015,Black or African American
3,113130101,2016,14 Tunitas Ln,So San Francisco,CA,94080,White
4,113130103,2016,16 Sonora Ave,So San Francisco,CA,94080,White


# Checking the dataset

In [None]:
# Show the dtype pandas assigned to each column of final_demo
final_demo.dtypes

In [None]:
# Count the missing values (NaN) in each column of final_demo
final_demo.isnull().sum()

In [None]:
# Collect the distinct ethnicity and city labels so duplicates,
# near-duplicates and misspellings can be spotted by eye.
ethnicity, city = (
    final_demo[column].unique() for column in ("ethnicity", "city")
)

# Cleaning the dataset

In [3]:
# Treat student ids as labels rather than numbers: cast the column to str.
final_demo = final_demo.astype({"student_id": str})

In [4]:
# Discard every row that has a missing value in any column.
final_demo = final_demo.dropna()

In [5]:
# Collapse detailed ethnicity labels into broader reporting groups.
#
# Each detailed label must appear in exactly one list: a dict keeps only the
# last value given for a repeated key, so listing "Laotian" under both groups
# would silently recode Laotian students as "Pacific Islander".
asian_labels = [
    "Asian Indian",
    "Cambodian",
    "Chinese",
    "Filipino",
    "Hmong",
    "Japanese",
    "Korean",
    "Laotian",
    "Other Asian",
    "Vietnamese",
]
pacific_islander_labels = [
    "Samoan",
    "Other Pacific Islander",
    "Guamanian",
    "Tahitian",
    "Hawaiian",
]

ethnicity_groups = {
    **dict.fromkeys(asian_labels, "General Asian"),
    **dict.fromkeys(pacific_islander_labels, "Pacific Islander"),
}

# Restrict the replacement to the ethnicity column so that identical strings
# in other columns (address, city, ...) can never be rewritten by accident.
final_demo["ethnicity"] = final_demo["ethnicity"].replace(ethnicity_groups)

In [6]:
# Normalise city names.
#
# Step 1: trim leading/trailing whitespace and collapse repeated inner spaces,
#         so variants such as "Daly City  " or "So  San Francisco" do not each
#         need their own entry (this also cleans cities not listed below,
#         e.g. "Concord ", "Burlingame ", "Pacifica ", "San Mateo ").
# Step 2: map known abbreviations and misspellings to a canonical name. The
#         lookup is case-insensitive, which covers "Daly cITY", "Daly CIty"
#         and "south San Francisco".
# Names without an alias entry keep their whitespace-cleaned spelling.
city_aliases = {
    "South SF": [
        "so san francisco",
        "south san francisco",
        "so. san francisco",
        "so sn francisco",
        "so sanfrancisco",
        "so san francico",
        "s san francisco",
        "so san fran",
        "south san francico",
        "so san francsico",
        "so san franicsco",
    ],
    "Daly City": [
        "daly city",
        "daly citiy",
        "daly ciy",
    ],
}
city_lookup = {
    alias: canonical
    for canonical, aliases in city_aliases.items()
    for alias in aliases
}

city_clean = (
    final_demo["city"]
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)

# Only the city column is touched, so identical strings in other columns
# cannot be rewritten by accident.
final_demo["city"] = city_clean.str.lower().map(city_lookup).fillna(city_clean)

# Creating yearly enrollment group

In [7]:
# Split the cleaned records into one frame per enrollment year.
record_year = final_demo["year"]
enroll2016 = final_demo[record_year == 2016]
enroll2017 = final_demo[record_year == 2017]
enroll2018 = final_demo[record_year == 2018]
enroll2019 = final_demo[record_year == 2019]

## + Creating subgroups - Ethnicity

In [32]:
### YEAR 2016 ###

# Enrollment per ethnicity: count the student ids in each ethnicity group,
# store the counts in an "enrollment" column, and tag every row with the year.
enrollRace2016 = (
    enroll2016.groupby("ethnicity")["student_id"]
    .count()
    .to_frame(name="enrollment")
    .assign(year=2016)
)

In [38]:
### YEAR 2017 ###

# Enrollment per ethnicity: count the student ids in each ethnicity group,
# store the counts in an "enrollment" column, and tag every row with the year.
enrollRace2017 = (
    enroll2017.groupby("ethnicity")["student_id"]
    .count()
    .to_frame(name="enrollment")
    .assign(year=2017)
)

In [39]:
### YEAR 2018 ###

# Enrollment per ethnicity: count the student ids in each ethnicity group,
# store the counts in an "enrollment" column, and tag every row with the year.
enrollRace2018 = (
    enroll2018.groupby("ethnicity")["student_id"]
    .count()
    .to_frame(name="enrollment")
    .assign(year=2018)
)

In [40]:
### YEAR 2019 ###

# Enrollment per ethnicity: count the student ids in each ethnicity group,
# store the counts in an "enrollment" column, and tag every row with the year.
enrollRace2019 = (
    enroll2019.groupby("ethnicity")["student_id"]
    .count()
    .to_frame(name="enrollment")
    .assign(year=2019)
)

In [43]:
# Stack the four yearly ethnicity tables into one long table.
# The ethnicity index is kept as-is (no ignore_index), so each label appears
# once per year. pd.concat is used instead of DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
enrollRace = pd.concat(
    [enrollRace2016, enrollRace2017, enrollRace2018, enrollRace2019]
)

In [45]:
# Write the combined ethnicity table to disk, including its index
# (the index carries the ethnicity labels produced by the groupby).
enrollRace.to_csv(path_or_buf="assets/data/race_data.csv", index=True)

## + Creating subgroups - City

In [46]:
### YEAR 2016 ###

# Enrollment per city: count the student ids in each city group,
# store the counts in an "enrollment" column, and tag every row with the year.
enrollCity2016 = (
    enroll2016.groupby("city")["student_id"]
    .count()
    .to_frame(name="enrollment")
    .assign(year=2016)
)

In [47]:
# Display the 2016 per-city enrollment table
enrollCity2016

Unnamed: 0_level_0,enrollment,year
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Antioch,1,2016
Belmont,1,2016
Brentwood,4,2016
Brisbane,3,2016
Burlingame,2,2016
Castro Valley,1,2016
Clovis,1,2016
Colma,1,2016
Concord,4,2016
Daly City,1388,2016
