In [1]:
import os
import pandas as pd 
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
#Set all the parameters for the data you would like to scrape
# semesters / ratings name the folders that are read; cols names the columns
# of the review table that is built from them.
semesters = ['fl18', 'fl19']
ratings = ["1_stars","2_stars","3_stars","4_stars","5_stars","not_labeled"]
cols = ["semester", "hw_num", "student_id", "review", "rating"]
# One empty list per output column; the scraping loop appends to these lists.
data_dict = {el:[] for el in cols}

# Work from inside data/: later cells build their paths from `owd` and write
# the exported CSV relative to the working directory.
# Only descend when we are not already there -- an unconditional relative
# chdir raises FileNotFoundError the second time this cell is run in the same
# kernel.
# NOTE(review): assumes the notebook's own folder is not itself named "data".
if os.path.basename(os.getcwd()) != "data":
    os.chdir("data/")
owd = os.getcwd()
print("OWD: ", owd)
print(data_dict)


OWD:  c:\Users\schac\Documents\School Stuff\FL2021\Visualization\Final Project\data
{'semester': [], 'hw_num': [], 'student_id': [], 'review': [], 'rating': []}


## Reviews and Ratings

In [3]:
#BUILDING initial dataframe with ratings and reviews
#
# Folder layout read below (relative to `owd`):
#     hw_reviews_<semester>/<rating folder>/<review file>
# The homework label and the student id are taken from the 1st and 3rd
# underscore-separated pieces of each review file name.

def _parse_review_filename(filename):
    """Return (hw_num, student_id) parsed from a review file name, or None.

    hw_num is the text before the first underscore. student_id is the third
    underscore-separated piece with everything from its first '.' removed.
    None is returned when the name has fewer than three pieces, so callers can
    skip stray entries instead of failing with IndexError.
    """
    pieces = filename.split('_')
    if len(pieces) < 3:
        return None
    return pieces[0], pieces[2].split('.')[0]


# Start from empty columns so that re-running this cell does not append every
# review a second time.
data_dict = {el: [] for el in cols}

for semester in semesters:

    #look at single semester of hw reviews
    sem_cwd = owd + "/hw_reviews_" + semester
    print("Sem CWD: ", sem_cwd)

    #get class demographics data into df
    # demo_path = "demographics_" + semester + ".csv"
    # raw_class_demo_df = pd.read_csv(demo_path)

    #Scrape data from all ratings folders
    for stars in ratings:
        rate_cwd = sem_cwd + "/" + stars

        #getting rating from directory ("3_stars" -> "3"); unlabeled reviews get "na"
        rate = "na" if stars == "not_labeled" else stars.split('_')[0]

        #inside each ratings folder
        # sorted(): os.listdir returns entries in arbitrary order, which would
        # make the row order differ between machines.
        for filename in sorted(os.listdir(rate_cwd)):
            file_path = os.path.join(rate_cwd, filename)

            #getting hw and student id from filename
            parsed = _parse_review_filename(filename)
            if parsed is None or not os.path.isfile(file_path):
                # Sub-directories and files that do not follow the naming
                # scheme are reported and skipped; nothing is appended for
                # them, so all column lists stay the same length.
                print("Skipping unexpected entry: ", file_path)
                continue
            hw_num, stud_id = parsed

            #getting text from file (undecodable bytes are dropped, as before)
            with open(file_path, 'r', encoding="utf8", errors='ignore') as f:
                content = f.read()

            # Append to every column only after the file was parsed and read.
            data_dict['semester'].append(semester)
            data_dict['hw_num'].append(hw_num)
            data_dict['student_id'].append(stud_id)
            data_dict['review'].append(content)
            data_dict['rating'].append(rate)

#return to all semesters of data
# Paths above are explicit, so the working directory is never moved into the
# semester/rating folders; it is still set to `owd` here because the export
# at the end of the notebook writes relative to the working directory.
os.chdir(owd)


Sem CWD:  c:\Users\schac\Documents\School Stuff\FL2021\Visualization\Final Project\data/hw_reviews_fl18
Sem CWD:  c:\Users\schac\Documents\School Stuff\FL2021\Visualization\Final Project\data/hw_reviews_fl19


In [4]:
# REVIEWS data set: turn the scraped column lists into a DataFrame
# (one row per entry appended by the scraping loop).
review_columns = data_dict
hw_data = pd.DataFrame(review_columns)
n_rows, n_cols = hw_data.shape
print((n_rows, n_cols))
# Optional export, currently disabled:
# hw_data.to_csv('reviews.csv')
hw_data.head(10)

(1453, 5)


Unnamed: 0,semester,hw_num,student_id,review,rating
0,fl18,hw2,AF72BBZ,This assignment is a chance to review what i l...,1
1,fl18,hw4,1B3313Z,I think this assignment is much harder than th...,1
2,fl18,hw4,AF72BBZ,I continues to struggle with basic concept abo...,1
3,fl18,hw4,B68C78Z,This time the homework is totally different fr...,1
4,fl18,hw4,B8F758Z,Homework 4 is the most time consuming and the ...,1
5,fl18,hw4,DD61E5Z,I did not like this homework. I thought the di...,1
6,fl18,hw4,DD781FZ,"In my opinion, hw4 is the hardest homework so ...",1
7,fl18,hw5,1B3313Z,I think this assignment becomes even more chal...,1
8,fl18,hw5,942FB2Z,"HW5 was hard, not necessarily in the sense tha...",1
9,fl18,hw5,AF72BBZ,This assignment is harder compared to previous...,1


## Demographics

In [None]:
#RAW DEMOGRAPHICS data set
#
# Load every semester's demographics export and stack them into one table.
# The load lives in this cell so that raw_class_demo_df is defined when the
# notebook is run top to bottom on a fresh kernel.
# NOTE(review): assumes each file is
#     <owd>/hw_reviews_<semester>/demographics_<semester>.csv
# -- confirm the location before relying on this.
demo_columns = ['ID', 'consent_form', 'major_choice', 'major_text', 'standing', 'gender', 'ethnicity']

demo_frames = []
for semester in semesters:
    demo_path = os.path.join(owd, "hw_reviews_" + semester, "demographics_" + semester + ".csv")
    sem_demo_df = pd.read_csv(demo_path)
    # Rename before stacking so that differing header text between semesters
    # cannot split one field into two columns. Raises ValueError if a file
    # does not have exactly these seven columns.
    sem_demo_df.columns = demo_columns
    demo_frames.append(sem_demo_df)

# NOTE(review): assumes a student ID appears in only one semester's file;
# duplicated IDs would duplicate rows in the later join -- verify.
raw_class_demo_df = pd.concat(demo_frames, ignore_index=True)

num_students = raw_class_demo_df.shape[0]
print("Class Size: ", num_students)

signed = raw_class_demo_df['consent_form']
# Count the 'y' answers directly: value_counts()['y'] raises KeyError when
# nobody consented, whereas this yields 0.
num = (signed == 'y').sum()
print("Number Consenting to Study: ", num)

# Guard the empty-class case instead of dividing by zero.
perc_consent = round((num/num_students)*100, 2) if num_students else 0.0
print("Percent Consenting to Study: ", perc_consent)

raw_class_demo_df.head(10)

In [None]:
#DEMOGRAPHIC CONSENTING summary data
#
# Prints gender, ethnicity, class-standing and major breakdowns (as
# percentages) for the students whose consent_form value is 'y'.

def _percent(count, total):
    """Return count as a percentage of total, rounded to 2 decimals.

    Returns 0.0 when total is 0 so an empty cohort does not divide by zero.
    """
    if not total:
        return 0.0
    return round(float(count) / float(total) * 100, 2)


def _response_percentages(responses, total):
    """Summarise one categorical demographics column.

    responses: pandas Series of answers; unanswered entries are NaN.
    total:     number of students the percentages are taken over.

    Returns (values, counts, percentages, missing_count, missing_percentage).
    values keeps the order in which answers first appear, and counts /
    percentages are aligned with it. missing_count is total minus the number
    of non-NaN answers.
    """
    values = responses.dropna().unique()
    tally = responses.value_counts()  # computed once; NaN is excluded
    counts = [tally[value] for value in values]
    percentages = [_percent(count, total) for count in counts]
    missing_count = total - sum(counts)
    return values, counts, percentages, missing_count, _percent(missing_count, total)


print("----- Basic Demographic Stats for Consenting Students -----")
print(" ")
class_demo_df = raw_class_demo_df[raw_class_demo_df['consent_form'] == 'y']

# Number of consenting students, i.e. the rows kept by the filter above.
# Computed here so the cell does not rely on `num` left over from another cell.
num = len(class_demo_df)

#GENDER statistics
gender = class_demo_df['gender']
# Compare-and-sum yields 0 for an absent category, where
# value_counts()['Male'] would raise KeyError.
num_m = int((gender == 'Male').sum())
num_f = int((gender == 'Female').sum())
# Everything that is neither 'Male' nor 'Female' (blank answers included) is
# reported under "No Response".
no_gender = num - (num_m + num_f)
perc_m = _percent(num_m, num)
perc_f = _percent(num_f, num)
perc_n = _percent(no_gender, num)
print("Perc Male: ", perc_m, "Perc Female: ", perc_f, "Perc No Response: ", perc_n)

#ETHNICITY statistics
eth_selected_vals, eth_counts, eth_perc, num_nan, eth_perc_nan = _response_percentages(
    class_demo_df['ethnicity'], num
)
eth_dict = dict(zip(eth_selected_vals, eth_perc))
eth_dict['No response'] = eth_perc_nan
print(eth_dict)

#YEAR statistics
yr_selected_vals, yr_counts, yr_perc, yr_num_nan, yr_perc_nan = _response_percentages(
    class_demo_df['standing'], num
)
yr_dict = dict(zip(yr_selected_vals, yr_perc))
yr_dict['No response'] = yr_perc_nan
print(yr_dict)

#MAJOR statistics
# Each entry falls into exactly one bucket: non-text (no response), text
# containing "COMPUTER SCIENCE", text containing "BUSINESS" but not
# "COMPUTER SCIENCE", or none of these ("Other").
num_cs = 0
num_bus = 0
major_num_nan = 0
for major in class_demo_df['major_choice']:
    if not isinstance(major, str):
        major_num_nan += 1
    elif "COMPUTER SCIENCE" in major:
        num_cs += 1
    elif "BUSINESS" in major:
        num_bus += 1

perc_cs = _percent(num_cs, num)
perc_bus = _percent(num_bus, num)
perc_nan = _percent(major_num_nan, num)
# "Other" is the remainder of the rounded figures so the four numbers add up
# to 100; with no consenting students there is nothing left over to report.
perc_other = round((100 - (perc_cs + perc_bus + perc_nan)),2) if num else 0.0

print("CS Students: ", perc_cs, "Business: ", perc_bus,"Other: ", perc_other, "No Response: ", perc_nan)



## Final DataFrame and Export as CSV

In [None]:
# JOINING demographic data with hw review data
# Inner join: a review is kept only when its student_id matches the ID of a
# row in class_demo_df.
demo_by_id = class_demo_df.set_index('ID')
hw_demo_data = hw_data.join(demo_by_id, on='student_id', how='inner')

# Distinct students that still have at least one review after the join.
reviewer_ids = hw_demo_data['student_id'].dropna().unique()
sample_size = len(reviewer_ids)

# IDs present in class_demo_df that never appear among the joined reviews.
cdf = set(class_demo_df['ID'].dropna().unique())
hdf = set(reviewer_ids)
no_hw_reviews = list(cdf.difference(hdf))
print("Number of students who consented but didn't submit any hw reviews:", len(no_hw_reviews))
print("Number Consenting Students that Submitted a Review:", sample_size)

#creating this dataset as a csv
# Written relative to the current working directory, row index included.
hw_demo_data.to_csv('hw_demographics.csv')
hw_demo_data.head(5)


