# Data Analysis (part 4)

## Professors who teach multiple sections of the same course in the same semester
* This is not so common in Annenberg, but it is common in other departments that are popular in SAS
* Let's look at Economics professors to prove this point

In [1]:
%matplotlib inline

# standard library
import json
import math
import os
import warnings

# silence library warnings before the third-party imports so the notebook output stays readable
warnings.filterwarnings('ignore')

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly_express as px
import requests
import seaborn as sns
from pandas.io.json import json_normalize

The first step will be repeating the procedure for loading in and cleaning the data, this time for ECON:

In [2]:
# Repeat the load-and-clean procedure, this time for ECON.

# The API token is read from the environment so the credential is never stored in the notebook.
pcr_token = os.environ.get('PCR_API_TOKEN')
if not pcr_token:
    raise RuntimeError(
        'Set the PCR_API_TOKEN environment variable to a Penn Course Review API token '
        'before running this cell.'
    )

econ_response = requests.get(
    'http://api.penncoursereview.com/v1/depts/econ/reviews/',
    params={'token': pcr_token},
    timeout=60,
)
# Fail loudly on an HTTP error instead of trying to parse an error page as review data.
econ_response.raise_for_status()
econ_revs1 = econ_response.json()

econ_revs_df = json_normalize(econ_revs1['result']['values'])

### Cleaning the columns for our analysis

# Drop the comment, id and path columns that the analysis does not use.
econ_revs_df2 = econ_revs_df.drop(columns=['comments','id', 'instructor.id', 'instructor.name','instructor.path','path','section.aliases',
                          'section.id','section.path',])

# NOTE: this rename is positional, so it assumes the remaining columns arrive in exactly
# this order -- re-check it if the API response layout ever changes.
econ_revs_df2.columns = ['instructor_first', 'instructor_last',
                         'num_reviewers', 'num_students',
                         'AmountLearned',
                         'CommAbility', 'CourseQuality', 'Difficulty',
                         'InstructorAccess', 'InstructorQuality',
                         'ReadingsValue', 'RecommendMajor',
                         'RecommendNonMajor', 'StimulateInterest',
                         'TAQuality', 'WorkRequired', 'CourseTitle',
                         'CourseCode', 'Section', 'Semester']

# Converting to numeric so we can work with stats.
# The column list is written once so both sides of the assignment cannot drift apart.
numeric_cols = ['CourseQuality', 'InstructorQuality', 'Difficulty',
                'AmountLearned', 'WorkRequired', 'StimulateInterest',
                'InstructorAccess', 'CommAbility', 'ReadingsValue', 'TAQuality',
                'RecommendMajor', 'RecommendNonMajor', 'num_reviewers',
                'num_students']
econ_revs_df2[numeric_cols] = econ_revs_df2[numeric_cols].apply(pd.to_numeric)

# Split the course code (e.g. "ECON-001-001") on "-" and keep the middle part as the level.
econ_revs_df2['level'] = econ_revs_df2['CourseCode'].str.split('-').str[1]

# Reordering the columns: identifiers first, then ratings, then counts.
econ_revs_df2 = econ_revs_df2[['CourseCode','level','Section','CourseTitle','Semester','instructor_first',
 'instructor_last','CourseQuality','InstructorQuality','Difficulty','AmountLearned','WorkRequired',
'StimulateInterest','InstructorAccess','CommAbility','ReadingsValue','TAQuality','RecommendMajor',
'RecommendNonMajor','num_reviewers','num_students']]

econ_revs_df2.head(5)

Unnamed: 0,CourseCode,level,Section,CourseTitle,Semester,instructor_first,instructor_last,CourseQuality,InstructorQuality,Difficulty,...,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
0,ECON-001-001,1,1,LECTURE,2002A,REBECCA,STEIN,2.52,2.97,2.82,...,2.22,2.4,3.13,2.88,2.29,,3.57,2.4,98,195
1,ECON-001-002,1,2,LECTURE,2002A,REBECCA,STEIN,2.48,2.82,2.6,...,2.17,2.26,3.01,2.76,2.29,,3.41,2.52,154,195
2,ECON-002-001,2,1,LECTURE,2002A,WENDY,EUDEY,3.03,3.47,2.59,...,2.17,3.13,3.38,3.43,2.25,,3.66,2.95,104,156
3,ECON-002-002,2,2,LECTURE,2002A,WENDY,EUDEY,3.26,3.61,2.57,...,2.18,3.29,3.35,3.58,2.64,,3.68,3.13,117,162
4,ECON-002-003,2,3,LECTURE,2002A,WENDY,EUDEY,3.08,3.41,2.52,...,2.01,3.1,3.22,3.48,2.35,,3.63,2.95,144,160


In [3]:
# write the normalized, not-yet-cleaned review table to disk (index column omitted)
raw_out_path = '../data/raw_data/econ_revs_df'
econ_revs_df.to_csv(raw_out_path, index=False)

In [4]:
# write the cleaned, reordered review table to disk (index column omitted)
clean_out_path = '../data/clean_data/econ_revs_df2'
econ_revs_df2.to_csv(clean_out_path, index=False)

#### We need to identify which professors teach multiple sections of the same course in a semester. For each semester, which instructors appear more than once for the same course number?

In [5]:
# Flag every row whose (semester, instructor last name, course level) combination occurs
# more than once; keep=False marks all members of such a group, not just the repeats.
repeat_key = ['Semester', 'instructor_last', 'level']
is_repeat = econ_revs_df2.duplicated(subset=repeat_key, keep=False)
multiple = econ_revs_df2[is_repeat]
multiple


Unnamed: 0,CourseCode,level,Section,CourseTitle,Semester,instructor_first,instructor_last,CourseQuality,InstructorQuality,Difficulty,...,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
0,ECON-001-001,001,001,LECTURE,2002A,REBECCA,STEIN,2.52,2.97,2.82,...,2.22,2.40,3.13,2.88,2.29,,3.57,2.40,98,195
1,ECON-001-002,001,002,LECTURE,2002A,REBECCA,STEIN,2.48,2.82,2.60,...,2.17,2.26,3.01,2.76,2.29,,3.41,2.52,154,195
2,ECON-002-001,002,001,LECTURE,2002A,WENDY,EUDEY,3.03,3.47,2.59,...,2.17,3.13,3.38,3.43,2.25,,3.66,2.95,104,156
3,ECON-002-002,002,002,LECTURE,2002A,WENDY,EUDEY,3.26,3.61,2.57,...,2.18,3.29,3.35,3.58,2.64,,3.68,3.13,117,162
4,ECON-002-003,002,003,LECTURE,2002A,WENDY,EUDEY,3.08,3.41,2.52,...,2.01,3.10,3.22,3.48,2.35,,3.63,2.95,144,160
5,ECON-002-004,002,004,LECTURE,2002A,JOHN,KNOWLES,2.32,2.27,2.79,...,2.33,2.14,2.34,2.25,2.49,,3.32,2.39,72,97
6,ECON-002-005,002,005,LECTURE,2002A,JOHN,KNOWLES,2.46,2.40,2.75,...,2.42,2.22,2.30,2.31,2.44,,3.39,2.34,79,125
8,ECON-003-001,003,001,MICROECON THEORY,2002A,JULIO,DAVILA,2.33,2.78,2.85,...,2.07,2.07,3.08,2.70,2.44,,3.30,1.41,27,51
9,ECON-003-002,003,002,MICROECON THEORY,2002A,JULIO,DAVILA,2.64,3.11,2.64,...,2.17,2.14,2.85,3.06,2.09,,3.69,1.50,36,43
17,ECON-005-002,005,002,STAT FOR ECONSTS,2002A,ROBERTO S,MARIANO,1.94,2.18,3.00,...,2.53,1.94,2.53,2.00,2.12,,3.06,0.65,17,29


#### Now, which courses have been taught in multiple sections in a given term?

In [6]:
# distinct course levels that show up among the multi-section rows
multiple.loc[:, 'level'].unique()

array(['001', '002', '003', '005', '050', '006', '220', '101', '102',
       '103', '010', '104', '302'], dtype=object)

In [7]:
# distinct section numbers that show up among the multi-section rows
multiple_sections = multiple.loc[:, 'Section'].unique()
multiple_sections

array(['001', '002', '003', '004', '005', '006', '401', '402'],
      dtype=object)

In [8]:
# distinct instructor last names that show up among the multi-section rows
multiple_profs = multiple.loc[:, 'instructor_last'].unique()
multiple_profs

array(['STEIN', 'EUDEY', 'KNOWLES', 'DAVILA', 'MARIANO', 'CHANG',
       'YEAPLE', 'SPIEGEL', 'BRUNETTI', 'DIEBOLD', 'SCHORFHEIDE',
       'MANOVSKII', 'NOCKE', 'MENZIO', 'DE PAULA', 'KRASNOKUTSKAYA',
       'GOLAN', 'OZMUCUR', 'LI', 'SONG', 'KIRCHER', 'TANG', 'JAO',
       'FIELER', 'OKADA', 'CHENG', 'BOSSI', 'BACHMANN', 'SAKA', 'DAVID',
       'DUCHENE'], dtype=object)

In [9]:
# Let's do one analysis to see how it will work,
# then iterate over our list of multiple_profs.
# numeric_only=True restricts the averages to the numeric rating/count columns; without it
# pandas 2.0+ raises a TypeError when it reaches the text columns (CourseCode, CourseTitle, ...).
stein_rows = multiple[multiple['instructor_last'] == 'STEIN']
stein_rows.groupby('Section').mean(numeric_only=True)

Unnamed: 0_level_0,CourseQuality,InstructorQuality,Difficulty,AmountLearned,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
Section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2.766071,3.184286,2.750714,2.927895,2.373214,2.919643,3.137857,3.193929,2.389474,2.828889,3.466316,2.615789,133.785714,178.321429
2,2.791379,3.202759,2.711724,2.926,2.355172,2.952759,3.112414,3.197586,2.3595,2.882222,3.4955,2.6445,160.827586,185.068966
3,3.014545,3.379091,2.693636,3.08,2.231818,3.138182,3.186364,3.350909,2.38875,2.78,3.67625,2.8775,163.0,191.272727
4,3.022,3.426,2.716,3.066,2.27,3.142,3.264,3.384,2.432,,3.67,2.904,156.6,185.0
6,2.73,3.3,2.71,2.86,2.35,2.82,3.17,3.24,2.24,,3.44,2.52,131.0,157.0


So Rebecca Stein has taught sections 001 through 004 and 006 (there is no 005 in the table above), and her means along various variables do have a somewhat wide range.

#### We will need to use a for loop to iterate over our list of sections AND our list of professors who teach multiple sections. 
* We will need a new dataframe to store the results
* We will need an empty array to store the means for each section
* We need to find where a given professor is teaching the same section more than once
* This is going to have to include a loop within a loop

In [10]:
# One row per multi-section professor; one column per section number holding that
# professor's mean InstructorQuality in that section (NaN where there are no matching rows).
mult_profs_revs = pd.DataFrame({'instructor': multiple_profs})

# Build each professor's row mask once instead of once per section.
instructor_quality = multiple['InstructorQuality']
taught_by = {prof: multiple['instructor_last'] == prof for prof in multiple_profs}

for sect in multiple_sections:
    in_section = multiple['Section'] == sect
    mult_profs_revs[sect] = [
        instructor_quality[taught_by[prof] & in_section].mean()
        for prof in multiple_profs
    ]

In [11]:
# display the professor-by-section table of mean InstructorQuality
mult_profs_revs

Unnamed: 0,instructor,001,002,003,004,005,006,401,402
0,STEIN,3.184286,3.202759,3.379091,3.426,,3.3,,
1,EUDEY,3.073125,3.03125,3.194,,,,,
2,KNOWLES,2.38,2.75,,2.27,2.4,,,
3,DAVILA,2.56,2.92,,,,,,
4,MARIANO,,2.18,3.06,,,,,
5,CHANG,,,,2.27,1.68,,,
6,YEAPLE,3.353333,3.313333,,,,,,
7,SPIEGEL,2.195556,2.1575,2.64,2.606667,2.5125,2.596667,,
8,BRUNETTI,3.32,3.22,,,,,,
9,DIEBOLD,3.2275,3.2,,,,,,


#### Insights from above:
* NaN values are where a professor has not taught that section number
* But each professor in the dataframe has taught more than one section of the same course in at least one semester
* Typically (cross checked with course rosters historically) the higher numbers are taught later in the day and lower numbers taught first 
* There are differences detectable between a professor's multiple sections taught of a given course within a semester
* Some professors improve as the day goes on, some not - case-by-case basis but still important for students to check it out
* It did not make sense to take the means down the columns of the table above: a professor-by-professor breakdown is a fairer way to see how ratings change across the sections taught, and section 001 appears far more often than 006, so column means would not be comparable.

In [12]:
# Does this ever happen in COMM? Run the same duplicate check (same semester, instructor
# last name and course level) on the cleaned COMM reviews.
# It very rarely happens there, so it is not something that COMM majors must worry about.
comm_revs_df2 = pd.read_csv('../data/clean_data/comm_revs_df2')
comm_is_repeat = comm_revs_df2.duplicated(subset=['Semester', 'instructor_last', 'level'], keep=False)
comm_revs_df2[comm_is_repeat]

Unnamed: 0,CourseCode,level,Section,CourseTitle,Semester,instructor_first,instructor_last,CourseQuality,InstructorQuality,Difficulty,...,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students,hundred
278,COMM-281-301,281,301,RACE FILMS: SPIKE LEE,2008C,JOHN L.,JACKSON,2.88,3.5,2.19,...,3.31,3.38,3.31,2.88,,3.13,2.81,17,18,2
295,ENGL-281-401,281,401,Race Films: Spike Lee and his Interlocutors,2008C,JOHN L.,JACKSON,3.0,3.42,2.25,...,3.22,3.24,3.31,2.75,,3.25,2.75,65,72,2
452,COMM-399-001,399,1,"INDEPENDENT STUDY: Teenagers, Media, and Self-...",2012C,AMY B.,JORDAN,4.0,4.0,2.0,...,4.0,4.0,4.0,4.0,,4.0,4.0,1,1,3
453,COMM-399-002,399,2,INDEPENDENT STUDY,2012C,AMY B.,JORDAN,3.0,4.0,,...,,,,,,,,1,1,3
454,COMM-399-003,399,3,INDEPENDENT STUDY: Safe Sex Messages in Televi...,2012C,AMY B.,JORDAN,4.0,4.0,,...,,,,,,,,1,1,3
455,COMM-399-004,399,4,INDEPENDENT STUDY: Non-Profit Partnerships: Sc...,2012C,AMY B.,JORDAN,4.0,4.0,3.0,...,4.0,4.0,4.0,4.0,,4.0,3.0,1,1,3
482,ASAM-201-401,201,401,IMMIGRANT URBAN LABOR IN THE U.S.,2013A,TAMARA K.,NOPPER,2.76,3.14,2.0,...,3.07,2.87,3.53,,,,,22,24,2
483,ASAM-201-601,201,601,Asian Americans and Popular Culture,2013A,TAMARA K.,NOPPER,3.34,3.45,2.16,...,3.44,2.74,3.6,,3.5,,,30,32,2


Confirmed: this is not something COMM majors need be concerned with.