In [23]:
# Requirements
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Getting the data

In [24]:
from pyspark.sql import SparkSession, functions
spark = SparkSession.builder.appName('app').getOrCreate()

In [26]:
df = spark.read.csv('go_to_college.csv', header=True, inferSchema=True)

Columns' Explanation


- `type_school` : Type of school that student attends
- `school_accreditation` : Quality of school. A is better than B.
- `gender` : Gender of student
- `interest` : How interested are students if they go to college
- `residence` : Type of residence
- `parent_age` : Parent age
- `parent_salary` : Parent salary per month in IDR/Rupiah
- `house_area` : Parent house area in meter square
- `average_grades` : Average of grades in scale of 0-100
- `parent_was_in_college` : Was parent ever in college?
- `will_go_to_college` : Predictions about going to college

In [27]:
df.printSchema()

root
 |-- type_school: string (nullable = true)
 |-- school_accreditation: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- interest: string (nullable = true)
 |-- residence: string (nullable = true)
 |-- parent_age: integer (nullable = true)
 |-- parent_salary: integer (nullable = true)
 |-- house_area: double (nullable = true)
 |-- average_grades: double (nullable = true)
 |-- parent_was_in_college: boolean (nullable = true)
 |-- will_go_to_college: boolean (nullable = true)



In [28]:
df.dtypes

[('type_school', 'string'),
 ('school_accreditation', 'string'),
 ('gender', 'string'),
 ('interest', 'string'),
 ('residence', 'string'),
 ('parent_age', 'int'),
 ('parent_salary', 'int'),
 ('house_area', 'double'),
 ('average_grades', 'double'),
 ('parent_was_in_college', 'boolean'),
 ('will_go_to_college', 'boolean')]

In [29]:
cols = ['parent_age', 'parent_salary', 'house_area', 'average_grades']
# df.select(cols).describe().show()
df.select(cols).summary().show()

+-------+-----------------+------------------+------------------+------------------+
|summary|       parent_age|     parent_salary|        house_area|    average_grades|
+-------+-----------------+------------------+------------------+------------------+
|  count|             1000|              1000|              1000|              1000|
|   mean|           52.208|         5381570.0| 74.51530000000005| 86.09719999999999|
| stddev|3.500426972383368|1397545.9096822797|15.293345687989016|3.3787384085236942|
|    min|               40|           1000000|              20.0|              75.0|
|    25%|               50|           4360000|              64.6|             83.73|
|    50%|               52|           5440000|              75.5|             85.57|
|    75%|               54|           6380000|              84.8|             88.26|
|    max|               65|          10000000|             120.0|              98.0|
+-------+-----------------+------------------+------------------+

# Categorical Columns

**Result**

> Many of `Very Interested` and `Interested` student are not going to college. The opposite also happened to the  `Not Interested` and `Less Interested` student. We can see that interest doesn't linearly correlated with college admission. Even so, the presence of significance can make us use the `interest` column as a predictor.

> Other categorical columns don't give siginificant difference in affecting student to go to college.

In [30]:
# interest
col = 'interest'
for value in df.select(col).distinct().collect():
    
    print(value[0])
    df.filter(df[col]==value[0])\
        .groupBy('will_go_to_college')\
        .count()\
        .show()

Very Interested
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  190|
|             false|  134|
+------------------+-----+

Less Interested
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   87|
|             false|  142|
+------------------+-----+

Uncertain
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  158|
|             false|  103|
+------------------+-----+

Interested
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   19|
|             false|   81|
+------------------+-----+

Not Interested
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   46|
|             false|   40|
+------------------+-----+



In [31]:
# type_school
col = 'type_school'
for value in df.select(col).distinct().collect():
    
    print(value[0])
    df.filter(df[col]==value[0])\
        .groupBy('will_go_to_college')\
        .count()\
        .show()

Vocational
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  187|
|             false|  204|
+------------------+-----+

Academic
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  313|
|             false|  296|
+------------------+-----+



In [32]:
# school_accreditation
col = 'school_accreditation'
for value in df.select(col).distinct().collect():
    
    print(value[0])
    df.filter(df[col]==value[0])\
        .groupBy('will_go_to_college')\
        .count()\
        .show()

B
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  252|
|             false|  267|
+------------------+-----+

A
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  248|
|             false|  233|
+------------------+-----+



In [33]:
# gender
col = 'gender'
for value in df.select(col).distinct().collect():
    
    print(value[0])
    df.filter(df[col]==value[0])\
        .groupBy('will_go_to_college')\
        .count()\
        .show()

Female
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  251|
|             false|  234|
+------------------+-----+

Male
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  249|
|             false|  266|
+------------------+-----+



In [34]:
# residence
col = 'residence'
for value in df.select(col).distinct().collect():
    
    print(value[0])
    df.filter(df[col]==value[0])\
        .groupBy('will_go_to_college')\
        .count()\
        .show()

Urban
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  271|
|             false|  268|
+------------------+-----+

Rural
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  229|
|             false|  232|
+------------------+-----+



In [35]:
# parent_was_in_college
col = 'parent_was_in_college'
for value in df.select(col).distinct().collect():
    
    print(value[0])
    df.filter(df[col]==value[0])\
        .groupBy('will_go_to_college')\
        .count()\
        .show()

True
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  272|
|             false|  248|
+------------------+-----+

False
+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  228|
|             false|  252|
+------------------+-----+



# Numerical Columns

**Result**


> 1.   Good grade students tend to go to college and bad grade students tend not to go to college. 
2.   Old and young parents tend to hold their children from going to college.
3.   Parents' income and their home are positively correlated with sending their children to college.

>*Warning* :\
We have not examined the possibility of multicollinearity.



In [36]:
# Numerical columns + summary
cols = ['summary', 'parent_age', 'parent_salary', 'house_area', 'average_grades']

# Students who will go to college
print('Students who will go to college')
df.filter(df['will_go_to_college']=='true').summary().select(cols).show()

# Students who wont go to college
print('Students who wont go to college')
df.filter(df['will_go_to_college']=='false').summary().select(cols).show()

Students who will go to college
+-------+------------------+-----------------+-----------------+-----------------+
|summary|        parent_age|    parent_salary|       house_area|   average_grades|
+-------+------------------+-----------------+-----------------+-----------------+
|  count|               500|              500|              500|              500|
|   mean|            52.358|        6046040.0|81.65779999999991|87.80306000000002|
| stddev|2.9966307065072875|1212242.129428243|12.84237877283995|3.568375125806835|
|    min|                41|          1000000|             40.9|             75.0|
|    25%|                51|          5300000|             74.4|            85.41|
|    50%|                52|          6060000|             81.3|            87.64|
|    75%|                54|          6730000|             90.1|            90.28|
|    max|                64|         10000000|            120.0|             98.0|
+-------+------------------+-----------------+---------

In [37]:
# Good grades
filt = df['average_grades'] > 90
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  131|
|             false|    2|
+------------------+-----+



In [38]:
# Bad grades
filt = df['average_grades'] < 83
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   39|
|             false|  117|
+------------------+-----+



In [39]:
# Old parents
filt = df['parent_age'] > 55
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   59|
|             false|   92|
+------------------+-----+



In [40]:
# Young parents
filt = df['parent_age'] < 48
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   25|
|             false|   64|
+------------------+-----+



In [41]:
# Middle age parents
filt = (df['parent_age'] > 50) & (df['parent_age'] < 54)
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  210|
|             false|  154|
+------------------+-----+



In [42]:
# Quartile of parent_salary
q1, q2, q3 = df.approxQuantile("parent_salary", [0.25, 0.5, 0.75], 0)
q1, q2, q3

(4360000.0, 5440000.0, 6380000.0)

In [43]:
# Relative low income salary
filt = df['parent_salary'] < q1
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   36|
|             false|  213|
+------------------+-----+



In [44]:
# Relative high income salary
filt = df['parent_salary'] > q3
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  197|
|             false|   53|
+------------------+-----+



In [45]:
# Relative middle income salary
filt = (df['parent_salary'] > q1) & (df['parent_salary'] < q3)
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  266|
|             false|  232|
+------------------+-----+



In [46]:
# Median or higher salary
filt = df['parent_salary'] > q2
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  353|
|             false|  146|
+------------------+-----+



In [47]:
# Quartile of house area
q1, q2, q3 = df.approxQuantile("house_area", [0.25, 0.5, 0.75], 0)
q1, q2, q3

(64.6, 75.5, 84.8)

In [48]:
# Small house_area
filt = df['house_area'] < q1
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|   46|
|             false|  200|
+------------------+-----+



In [49]:
# Large house area
filt = df['house_area'] > q3
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  198|
|             false|   52|
+------------------+-----+



In [50]:
# Middle house area
filt = (df['house_area'] > q1) & (df['house_area'] < q3)
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  253|
|             false|  243|
+------------------+-----+



In [51]:
# Median or larger house area
filt = df['house_area'] > q2
df.filter(filt).groupBy('will_go_to_college').count().show()

+------------------+-----+
|will_go_to_college|count|
+------------------+-----+
|              true|  350|
|             false|  148|
+------------------+-----+

