# Data Analysis (part 1)
## Importing, Cleaning, and Exploring COMM data 

In [1]:
%matplotlib inline

import json
import pandas as pd
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt
import requests
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import plotly_express as px
import math
import plotly

#### Importing COMM courses for analysis
* First we need to use the Penn Course Review API and the Python requests package to import a JSON file 
* The non-public API key was granted after application via Penn Labs, the student-run club that manages PCR. 

In [2]:
# Download every COMM review from the Penn Course Review API as parsed JSON.
# The API token is a non-public credential, so it is read from the environment
# (set PCR_API_TOKEN before launching Jupyter) rather than committed in the notebook.
# NOTE(review): the token that was previously hard-coded here should be treated as leaked and rotated.
import os  # kept local to this cell: nothing else in the notebook needs it

pcr_token = os.environ.get('PCR_API_TOKEN')
if not pcr_token:
    raise RuntimeError(
        'Set the PCR_API_TOKEN environment variable to your Penn Course Review API token.'
    )

pcr_response = requests.get(
    'http://api.penncoursereview.com/v1/depts/comm/reviews/',
    params={'token': pcr_token},  # sent as the ?token=... query parameter
    timeout=60,                   # do not hang the kernel forever on a dead connection
)
pcr_response.raise_for_status()   # surface HTTP errors instead of parsing an error body
comm_revs1 = pcr_response.json()

In [3]:
# Inspect the raw response: the review records sit under result -> values
comm_revs1

{'result': {'values': [{'comments': '',
    'id': '292-001-277-JESSICA-FISHMAN',
    'instructor': {'first_name': 'JESSICA',
     'id': '277-JESSICA-FISHMAN',
     'last_name': 'FISHMAN',
     'name': 'JESSICA FISHMAN',
     'path': '/instructor/277-JESSICA-FISHMAN'},
    'num_reviewers': 83,
    'num_students': 89,
    'path': '/courses/292/sections/1/reviews/277-JESSICA-FISHMAN',
    'ratings': {'rAmountLearned': '2.63',
     'rCommAbility': '2.88',
     'rCourseQuality': '2.43',
     'rDifficulty': '2.28',
     'rInstructorAccess': '2.84',
     'rInstructorQuality': '2.88',
     'rReadingsValue': '2.24',
     'rRecommendMajor': '3.21',
     'rRecommendNonMajor': '2.53',
     'rStimulateInterest': '2.81',
     'rWorkRequired': '2.28'},
    'section': {'aliases': ['COMM-123-001'],
     'id': '292-001',
     'name': 'COMM & POPULAR CULTURE',
     'path': '/courses/292/sections/1',
     'primary_alias': 'COMM-123-001',
     'sectionnum': '001',
     'semester': '2002A'}},
   {'comments'

#### This is a layered JSON file. We need to normalize and parse the data:

In [4]:
# Flatten each nested review dict into one row; nested keys become dotted
# column names such as 'instructor.last_name' and 'ratings.rDifficulty'.
review_records = comm_revs1['result']['values']
comm_revs_df = json_normalize(review_records)

Now let's take a look at the rows and columns.

In [5]:
# Display the flattened frame to see which columns the normalization produced
comm_revs_df

Unnamed: 0,comments,id,instructor.first_name,instructor.id,instructor.last_name,instructor.name,instructor.path,num_reviewers,num_students,path,...,ratings.rStimulateInterest,ratings.rTAQuality,ratings.rWorkRequired,section.aliases,section.id,section.name,section.path,section.primary_alias,section.sectionnum,section.semester
0,,292-001-277-JESSICA-FISHMAN,JESSICA,277-JESSICA-FISHMAN,FISHMAN,JESSICA FISHMAN,/instructor/277-JESSICA-FISHMAN,83,89,/courses/292/sections/1/reviews/277-JESSICA-FI...,...,2.81,,2.28,[COMM-123-001],292-001,COMM & POPULAR CULTURE,/courses/292/sections/1,COMM-123-001,001,2002A
1,,293-001-278-VINCENT-PRICE,VINCENT,278-VINCENT-PRICE,PRICE,VINCENT PRICE,/instructor/278-VINCENT-PRICE,133,181,/courses/293/sections/1/reviews/278-VINCENT-PRICE,...,1.92,,2.32,[COMM-125-001],293-001,COMMUNICATION BEHAVIOR,/courses/293/sections/1,COMM-125-001,001,2002A
2,,293-601-279-MARIAELENA-BARTESAGHI,MARIAELENA,279-MARIAELENA-BARTESAGHI,BARTESAGHI,MARIAELENA BARTESAGHI,/instructor/279-MARIAELENA-BARTESAGHI,23,26,/courses/293/sections/601/reviews/279-MARIAELE...,...,3.30,,2.13,[COMM-125-601],293-601,COMMUNICATION BEHAVIOR,/courses/293/sections/601,COMM-125-601,601,2002A
3,,294-401-280-KATHRYN-KOLBERT,KATHRYN,280-KATHRYN-KOLBERT,KOLBERT,KATHRYN KOLBERT,/instructor/280-KATHRYN-KOLBERT,73,112,/courses/294/sections/401/reviews/280-KATHRYN-...,...,1.79,,1.83,[COMM-175-401],294-401,ARGUMENT & PUBL ADVOCACY,/courses/294/sections/401,COMM-175-401,401,2002A
4,,295-001-281-AMY-B--JORDAN,AMY B.,281-AMY-B--JORDAN,JORDAN,AMY B. JORDAN,/instructor/281-AMY-B--JORDAN,75,89,/courses/295/sections/1/reviews/281-AMY-B--JORDAN,...,3.07,,2.29,[COMM-225-001],295-001,CHILDREN & MEDIA,/courses/295/sections/1,COMM-225-001,001,2002A
5,,296-601-282-EMILY-WEST,EMILY,282-EMILY-WEST,WEST,EMILY WEST,/instructor/282-EMILY-WEST,23,24,/courses/296/sections/601/reviews/282-EMILY-WEST,...,2.13,,2.00,[COMM-262-601],296-601,VISUAL COMMUNICATION,/courses/296/sections/601,COMM-262-601,601,2002A
6,,297-601-283-JOSEPHINE-FERRIGNO,JOSEPHINE,283-JOSEPHINE-FERRIGNO,FERRIGNO,JOSEPHINE FERRIGNO,/instructor/283-JOSEPHINE-FERRIGNO,18,25,/courses/297/sections/601/reviews/283-JOSEPHIN...,...,3.11,,2.50,[COMM-280-601],297-601,COMM & GLOBALIZATION,/courses/297/sections/601,COMM-280-601,601,2002A
7,,298-001-284-RONA-J--BUCHALTER,RONA J.,284-RONA-J--BUCHALTER,BUCHALTER,RONA J. BUCHALTER,/instructor/284-RONA-J--BUCHALTER,13,14,/courses/298/sections/1/reviews/284-RONA-J--BU...,...,1.15,,2.54,[COMM-299-001],298-001,COMMUNICATIONS INTERNSHP,/courses/298/sections/1,COMM-299-001,001,2002A
8,,299-301-285-CAROLYN-A-MARVIN,CAROLYN A,285-CAROLYN-A-MARVIN,MARVIN,CAROLYN A MARVIN,/instructor/285-CAROLYN-A-MARVIN,17,18,/courses/299/sections/301/reviews/285-CAROLYN-...,...,3.65,,3.00,[COMM-322-301],299-301,FREEDOM OF EXPRESSION,/courses/299/sections/301,COMM-322-301,301,2002A
9,,299-601-286-MARGARET-LOUISE-WOODSTOCK,MARGARET LOUISE,286-MARGARET-LOUISE-WOODSTOCK,WOODSTOCK,MARGARET LOUISE WOODSTOCK,/instructor/286-MARGARET-LOUISE-WOODSTOCK,17,18,/courses/299/sections/601/reviews/286-MARGARET...,...,2.65,,2.18,[COMM-322-601],299-601,FREEDOM OF EXPRESSION,/courses/299/sections/601,COMM-322-601,601,2002A


Each row (observation) appears to be one course section taught by one instructor in one semester, and the columns hold the variables drawn from student-submitted course evaluations.

In [6]:
# saving this to the raw data folder (written as CSV, without the row index)
# NOTE(review): the path is relative to the notebook's working directory and has no
# file extension; '../data/raw_data/' presumably has to exist already — confirm
comm_revs_df.to_csv('../data/raw_data/comm_revs_df',index=False)


#### Inspecting the columns

In [7]:
# List every column name produced by the normalization step
comm_revs_df.columns

Index(['comments', 'id', 'instructor.first_name', 'instructor.id',
       'instructor.last_name', 'instructor.name', 'instructor.path',
       'num_reviewers', 'num_students', 'path', 'ratings.rAmountLearned',
       'ratings.rCommAbility', 'ratings.rCourseQuality', 'ratings.rDifficulty',
       'ratings.rInstructorAccess', 'ratings.rInstructorQuality',
       'ratings.rReadingsValue', 'ratings.rRecommendMajor',
       'ratings.rRecommendNonMajor', 'ratings.rStimulateInterest',
       'ratings.rTAQuality', 'ratings.rWorkRequired', 'section.aliases',
       'section.id', 'section.name', 'section.path', 'section.primary_alias',
       'section.sectionnum', 'section.semester'],
      dtype='object')

#### This dataset needs some cleaning. Let's begin with the columns.
* There are some columns we simply do not need for this analysis.
* Others we will want to rename
* We will need the columns with ratings to be treated as numeric so we can do statistical analyses
* We'll add a column called "level" to hold the course number (the middle part of the course code, e.g. 123)
* And we will want to re-order the columns to be more user-friendly

In [8]:
# Drop the free-text, identifier and URL-path columns that this analysis does not need.
# drop() returns a new frame, so comm_revs_df itself keeps all of its columns.
unneeded_columns = [
    'comments',
    'id',
    'instructor.id',
    'instructor.name',
    'instructor.path',
    'path',
    'section.aliases',
    'section.id',
    'section.path',
]
comm_revs_df2 = comm_revs_df.drop(columns=unneeded_columns)

In [9]:
# sanity check to make sure that they actually dropped cleanly
# (29 original columns minus the 9 dropped should leave 20)
comm_revs_df2.columns

Index(['instructor.first_name', 'instructor.last_name', 'num_reviewers',
       'num_students', 'ratings.rAmountLearned', 'ratings.rCommAbility',
       'ratings.rCourseQuality', 'ratings.rDifficulty',
       'ratings.rInstructorAccess', 'ratings.rInstructorQuality',
       'ratings.rReadingsValue', 'ratings.rRecommendMajor',
       'ratings.rRecommendNonMajor', 'ratings.rStimulateInterest',
       'ratings.rTAQuality', 'ratings.rWorkRequired', 'section.name',
       'section.primary_alias', 'section.sectionnum', 'section.semester'],
      dtype='object')

We should inspect the dataframe now to see that we are getting closer to something we can work with.

In [10]:
# Preview the first five rows of the trimmed frame
comm_revs_df2.head(5)

Unnamed: 0,instructor.first_name,instructor.last_name,num_reviewers,num_students,ratings.rAmountLearned,ratings.rCommAbility,ratings.rCourseQuality,ratings.rDifficulty,ratings.rInstructorAccess,ratings.rInstructorQuality,ratings.rReadingsValue,ratings.rRecommendMajor,ratings.rRecommendNonMajor,ratings.rStimulateInterest,ratings.rTAQuality,ratings.rWorkRequired,section.name,section.primary_alias,section.sectionnum,section.semester
0,JESSICA,FISHMAN,83,89,2.63,2.88,2.43,2.28,2.84,2.88,2.24,3.21,2.53,2.81,,2.28,COMM & POPULAR CULTURE,COMM-123-001,1,2002A
1,VINCENT,PRICE,133,181,2.33,2.84,2.11,2.27,2.62,2.62,2.14,3.15,1.47,1.92,,2.32,COMMUNICATION BEHAVIOR,COMM-125-001,1,2002A
2,MARIAELENA,BARTESAGHI,23,26,2.65,3.17,2.74,2.13,3.61,3.17,3.17,3.17,2.09,3.3,,2.13,COMMUNICATION BEHAVIOR,COMM-125-601,601,2002A
3,KATHRYN,KOLBERT,73,112,1.68,2.37,1.89,1.49,2.34,2.26,1.99,2.42,1.86,1.79,,1.83,ARGUMENT & PUBL ADVOCACY,COMM-175-401,401,2002A
4,AMY B.,JORDAN,75,89,3.08,3.49,3.2,2.13,3.24,3.49,2.49,3.6,2.87,3.07,,2.29,CHILDREN & MEDIA,COMM-225-001,1,2002A


These column names are not so user-friendly. Making them more familiar and useful now:

In [11]:
# renaming columns
# Each old name is mapped to its new name explicitly. Assigning a positional list
# to .columns would silently attach the wrong label to a column whenever the
# column order coming out of the normalization step differs from the order the
# list was written for. num_reviewers and num_students keep their names.
comm_revs_df2 = comm_revs_df2.rename(columns={
    'instructor.first_name': 'instructor_first',
    'instructor.last_name': 'instructor_last',
    'ratings.rAmountLearned': 'AmountLearned',
    'ratings.rCommAbility': 'CommAbility',
    'ratings.rCourseQuality': 'CourseQuality',
    'ratings.rDifficulty': 'Difficulty',
    'ratings.rInstructorAccess': 'InstructorAccess',
    'ratings.rInstructorQuality': 'InstructorQuality',
    'ratings.rReadingsValue': 'ReadingsValue',
    'ratings.rRecommendMajor': 'RecommendMajor',
    'ratings.rRecommendNonMajor': 'RecommendNonMajor',
    'ratings.rStimulateInterest': 'StimulateInterest',
    'ratings.rTAQuality': 'TAQuality',
    'ratings.rWorkRequired': 'WorkRequired',
    'section.name': 'CourseTitle',
    'section.primary_alias': 'CourseCode',
    'section.sectionnum': 'Section',
    'section.semester': 'Semester',
})

Checking out the new dataframe.

In [12]:
# Display the frame to confirm the new column names
comm_revs_df2

Unnamed: 0,instructor_first,instructor_last,num_reviewers,num_students,AmountLearned,CommAbility,CourseQuality,Difficulty,InstructorAccess,InstructorQuality,ReadingsValue,RecommendMajor,RecommendNonMajor,StimulateInterest,TAQuality,WorkRequired,CourseTitle,CourseCode,Section,Semester
0,JESSICA,FISHMAN,83,89,2.63,2.88,2.43,2.28,2.84,2.88,2.24,3.21,2.53,2.81,,2.28,COMM & POPULAR CULTURE,COMM-123-001,001,2002A
1,VINCENT,PRICE,133,181,2.33,2.84,2.11,2.27,2.62,2.62,2.14,3.15,1.47,1.92,,2.32,COMMUNICATION BEHAVIOR,COMM-125-001,001,2002A
2,MARIAELENA,BARTESAGHI,23,26,2.65,3.17,2.74,2.13,3.61,3.17,3.17,3.17,2.09,3.30,,2.13,COMMUNICATION BEHAVIOR,COMM-125-601,601,2002A
3,KATHRYN,KOLBERT,73,112,1.68,2.37,1.89,1.49,2.34,2.26,1.99,2.42,1.86,1.79,,1.83,ARGUMENT & PUBL ADVOCACY,COMM-175-401,401,2002A
4,AMY B.,JORDAN,75,89,3.08,3.49,3.20,2.13,3.24,3.49,2.49,3.60,2.87,3.07,,2.29,CHILDREN & MEDIA,COMM-225-001,001,2002A
5,EMILY,WEST,23,24,2.39,2.52,2.48,2.17,3.09,2.45,2.22,3.13,2.30,2.13,,2.00,VISUAL COMMUNICATION,COMM-262-601,601,2002A
6,JOSEPHINE,FERRIGNO,18,25,3.06,3.61,3.17,2.17,3.67,3.56,2.56,3.50,3.28,3.11,,2.50,COMM & GLOBALIZATION,COMM-280-601,601,2002A
7,RONA J.,BUCHALTER,13,14,1.58,2.31,1.46,2.08,3.08,2.15,1.86,2.46,1.15,1.15,,2.54,COMMUNICATIONS INTERNSHP,COMM-299-001,001,2002A
8,CAROLYN A,MARVIN,17,18,3.41,3.65,3.47,3.06,2.94,3.71,3.35,3.59,2.47,3.65,,3.00,FREEDOM OF EXPRESSION,COMM-322-301,301,2002A
9,MARGARET LOUISE,WOODSTOCK,17,18,2.88,2.94,2.82,2.47,3.50,3.06,3.06,3.12,2.06,2.65,,2.18,FREEDOM OF EXPRESSION,COMM-322-601,601,2002A


If we want to be able to do calculations and statistical analyses with the data, the rating and count columns must be stored as numeric values rather than strings.

In [13]:
# converting to numeric
# The API returns the ratings as strings (e.g. '2.63'); pd.to_numeric is applied
# to each listed column so they can be averaged and correlated later.
numeric_columns = [
    'CourseQuality', 'InstructorQuality', 'Difficulty',
    'AmountLearned', 'WorkRequired', 'StimulateInterest',
    'InstructorAccess', 'CommAbility', 'ReadingsValue', 'TAQuality',
    'RecommendMajor', 'RecommendNonMajor', 'num_reviewers', 'num_students',
]
comm_revs_df2[numeric_columns] = comm_revs_df2[numeric_columns].apply(pd.to_numeric)

The actual course number (e.g. 123) is embedded in the CourseCode, but we should extract it to compare classes:

In [14]:
# splitting the course code column to extract just 'level' (the course number)
# CourseCode has the form 'COMM-123-001'; the middle piece is the course number.
# The vectorised .str accessor replaces a Python-level lambda and leaves a
# missing CourseCode as NaN instead of raising AttributeError.
comm_revs_df2['level'] = comm_revs_df2['CourseCode'].str.split('-').str[1]
comm_revs_df2

Unnamed: 0,instructor_first,instructor_last,num_reviewers,num_students,AmountLearned,CommAbility,CourseQuality,Difficulty,InstructorAccess,InstructorQuality,...,RecommendMajor,RecommendNonMajor,StimulateInterest,TAQuality,WorkRequired,CourseTitle,CourseCode,Section,Semester,level
0,JESSICA,FISHMAN,83,89,2.63,2.88,2.43,2.28,2.84,2.88,...,3.21,2.53,2.81,,2.28,COMM & POPULAR CULTURE,COMM-123-001,001,2002A,123
1,VINCENT,PRICE,133,181,2.33,2.84,2.11,2.27,2.62,2.62,...,3.15,1.47,1.92,,2.32,COMMUNICATION BEHAVIOR,COMM-125-001,001,2002A,125
2,MARIAELENA,BARTESAGHI,23,26,2.65,3.17,2.74,2.13,3.61,3.17,...,3.17,2.09,3.30,,2.13,COMMUNICATION BEHAVIOR,COMM-125-601,601,2002A,125
3,KATHRYN,KOLBERT,73,112,1.68,2.37,1.89,1.49,2.34,2.26,...,2.42,1.86,1.79,,1.83,ARGUMENT & PUBL ADVOCACY,COMM-175-401,401,2002A,175
4,AMY B.,JORDAN,75,89,3.08,3.49,3.20,2.13,3.24,3.49,...,3.60,2.87,3.07,,2.29,CHILDREN & MEDIA,COMM-225-001,001,2002A,225
5,EMILY,WEST,23,24,2.39,2.52,2.48,2.17,3.09,2.45,...,3.13,2.30,2.13,,2.00,VISUAL COMMUNICATION,COMM-262-601,601,2002A,262
6,JOSEPHINE,FERRIGNO,18,25,3.06,3.61,3.17,2.17,3.67,3.56,...,3.50,3.28,3.11,,2.50,COMM & GLOBALIZATION,COMM-280-601,601,2002A,280
7,RONA J.,BUCHALTER,13,14,1.58,2.31,1.46,2.08,3.08,2.15,...,2.46,1.15,1.15,,2.54,COMMUNICATIONS INTERNSHP,COMM-299-001,001,2002A,299
8,CAROLYN A,MARVIN,17,18,3.41,3.65,3.47,3.06,2.94,3.71,...,3.59,2.47,3.65,,3.00,FREEDOM OF EXPRESSION,COMM-322-301,301,2002A,322
9,MARGARET LOUISE,WOODSTOCK,17,18,2.88,2.94,2.82,2.47,3.50,3.06,...,3.12,2.06,2.65,,2.18,FREEDOM OF EXPRESSION,COMM-322-601,601,2002A,322


Re-ordering the columns to emphasize the information we primarily need:

In [15]:
# re-ordering the columns

column_order = [
    'CourseCode', 'level', 'Section', 'CourseTitle', 'Semester',
    'instructor_first', 'instructor_last',
    'CourseQuality', 'InstructorQuality', 'Difficulty', 'AmountLearned',
    'WorkRequired', 'StimulateInterest', 'InstructorAccess', 'CommAbility',
    'ReadingsValue', 'TAQuality', 'RecommendMajor', 'RecommendNonMajor',
    'num_reviewers', 'num_students',
]

# .copy() makes the re-ordered frame independent of the frame it was selected
# from, so adding columns to it later (e.g. 'hundred') is an ordinary assignment
# rather than a write to a slice that pandas flags with SettingWithCopyWarning.
comm_revs_df2 = comm_revs_df2[column_order].copy()

comm_revs_df2.head()

Unnamed: 0,CourseCode,level,Section,CourseTitle,Semester,instructor_first,instructor_last,CourseQuality,InstructorQuality,Difficulty,...,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
0,COMM-123-001,123,1,COMM & POPULAR CULTURE,2002A,JESSICA,FISHMAN,2.43,2.88,2.28,...,2.28,2.81,2.84,2.88,2.24,,3.21,2.53,83,89
1,COMM-125-001,125,1,COMMUNICATION BEHAVIOR,2002A,VINCENT,PRICE,2.11,2.62,2.27,...,2.32,1.92,2.62,2.84,2.14,,3.15,1.47,133,181
2,COMM-125-601,125,601,COMMUNICATION BEHAVIOR,2002A,MARIAELENA,BARTESAGHI,2.74,3.17,2.13,...,2.13,3.3,3.61,3.17,3.17,,3.17,2.09,23,26
3,COMM-175-401,175,401,ARGUMENT & PUBL ADVOCACY,2002A,KATHRYN,KOLBERT,1.89,2.26,1.49,...,1.83,1.79,2.34,2.37,1.99,,2.42,1.86,73,112
4,COMM-225-001,225,1,CHILDREN & MEDIA,2002A,AMY B.,JORDAN,3.2,3.49,2.13,...,2.29,3.07,3.24,3.49,2.49,,3.6,2.87,75,89


#### Exploring the data

In [16]:
# (number of rows, number of columns)
comm_revs_df2.shape

(760, 21)

#### There are 760 rows (courses) here, and 21 remaining columns (variables). How can we view a list of the courses?

In [17]:
# Every distinct course code (department-number-section), in alphabetical order
comm_courses = sorted(comm_revs_df2['CourseCode'].drop_duplicates())
comm_courses

['AFRC-287-401',
 'AFRC-306-401',
 'AFRC-387-401',
 'ANTH-123-401',
 'ANTH-141-401',
 'ANTH-459-401',
 'ASAM-201-401',
 'ASAM-201-601',
 'CINE-150-401',
 'CINE-296-401',
 'CINE-296-402',
 'COMM-105-401',
 'COMM-108-301',
 'COMM-112-301',
 'COMM-123-001',
 'COMM-123-900',
 'COMM-123-910',
 'COMM-123-920',
 'COMM-125-001',
 'COMM-125-601',
 'COMM-125-910',
 'COMM-125-920',
 'COMM-130-001',
 'COMM-130-910',
 'COMM-130-920',
 'COMM-140-001',
 'COMM-140-401',
 'COMM-140-900',
 'COMM-140-910',
 'COMM-175-401',
 'COMM-205-301',
 'COMM-206-401',
 'COMM-207-401',
 'COMM-208-301',
 'COMM-209-301',
 'COMM-210-001',
 'COMM-210-301',
 'COMM-211-301',
 'COMM-213-301',
 'COMM-217-301',
 'COMM-218-910',
 'COMM-221-001',
 'COMM-224-001',
 'COMM-224-301',
 'COMM-225-001',
 'COMM-225-601',
 'COMM-225-910',
 'COMM-225-920',
 'COMM-226-001',
 'COMM-226-401',
 'COMM-226-601',
 'COMM-226-670',
 'COMM-226-920',
 'COMM-230-001',
 'COMM-237-001',
 'COMM-237-301',
 'COMM-237-900',
 'COMM-237-910',
 'COMM-238-001

#### Note there are some cross-listed courses here listed under different departments. Next, can we get an alphabetical list of instructors?

In [18]:
# Every distinct instructor surname, in alphabetical order
sorted(comm_revs_df2['instructor_last'].drop_duplicates())

['AGHA',
 'AL-MARASHI',
 'ALBARRACIN',
 'ALLEN',
 'ARSENAULT',
 'BALAJI',
 'BARMADA',
 'BARRETT',
 'BARTESAGHI',
 'BARUH',
 'BELL',
 'BEN-PORATH',
 'BERGER',
 'BERGERE',
 'BERMAN',
 'BEUTIN',
 'BIGMAN-GALIMORE',
 'BLEAKLEY',
 'BLONDHEIM',
 'BLUMENTHAL',
 'BOCK',
 'BOOTH',
 'BRECHMAN',
 'BRIDGES',
 'BROAD',
 'BRUNEAU',
 'BRUTGER',
 'BUCHALTER',
 'BUMGARNER',
 'CAPPELLA',
 'CARLSON',
 'CENTOLA',
 'CHAN',
 'CHATMAN',
 'CHEN',
 'CHERNIN',
 'CHESTER',
 'CHOMSKY',
 'CHRISTENSEN',
 'COPPA',
 'COYER',
 'CROCCO',
 'DANILLER',
 'DECHERNEY',
 'DELLI CARPINI',
 'DILLIPLANE',
 'DUFFY',
 'DUTWIN',
 'EISENHOWER',
 'EL ZEIN',
 'ERDENER',
 'ERIGHA',
 'FALK',
 'FALZONE',
 'FARRALL',
 'FELDMAN',
 'FELSENTHAL',
 'FELZENBERG',
 'FERRARI',
 'FERRIGNO',
 'FINEMAN',
 'FISHMAN',
 'FRAZE',
 'FRERES',
 'GABBADON',
 'GANDY',
 'GARFIELD',
 'GARRY',
 'GIBSON',
 'GILBOA',
 'GILLIAM',
 'GIRGINOVA',
 'GLANVILLE',
 'GOLDMAN',
 'GOLDTHWAITE YOUNG',
 'GONZALEZ-BAILON',
 'GOTTFRIED',
 'GRIEBLING',
 'GRINBERG',
 'GROSS',
 

#### Generating some initial summary statistics on the dataset:

In [19]:
# Summary statistics; for a frame with mixed dtypes describe() reports only the numeric columns
comm_revs_df2.describe()

Unnamed: 0,CourseQuality,InstructorQuality,Difficulty,AmountLearned,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
count,760.0,760.0,757.0,701.0,757.0,757.0,757.0,757.0,697.0,266.0,701.0,700.0,760.0,760.0
mean,2.969829,3.225632,2.367649,2.955478,2.505931,2.988322,3.155033,3.200898,2.622281,2.966316,3.246091,2.377171,29.692105,34.132895
std,0.578296,0.543973,0.474776,0.499488,0.497504,0.680073,0.55478,0.528837,0.504454,0.651513,0.518802,0.802207,29.269093,36.217059
min,0.79,0.62,1.0,1.11,1.17,0.43,0.95,0.6,0.67,0.0,0.85,0.0,1.0,1.0
25%,2.58,2.86,2.04,2.64,2.18,2.5,2.83,2.88,2.29,2.5725,2.94,1.83,13.0,15.0
50%,3.0,3.33,2.38,3.0,2.47,3.09,3.22,3.29,2.63,3.0,3.32,2.47,20.0,21.0
75%,3.4,3.67,2.67,3.31,2.79,3.55,3.56,3.61,2.93,3.35,3.67,3.0,32.0,35.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,158.0,194.0


#### We can see some initial insights from looking at the above:
* Not every variable has a count of 760, although there were 760 courses here. Not every course was reviewed along every dimension.
* The mean values are pretty close to the median values, suggesting that most of these variables are not strongly skewed.
* The maximum is 4.00 on the Penn Course Review scale. In general the analyses to come are working within a pretty small range (with minimum reviews at 0 and maximum at 4.00), so effect sizes are going to be small. Something to keep in mind.
* Also important to note that the reviews begin in 2002, which is not the totality of historical data but is relevant to modern course analysis.

#### What if we just wanted the means for each category?

In [20]:
# Mean of every numeric column. numeric_only=True excludes the text columns
# (course code, title, instructor names, ...) explicitly; pandas >= 2.0 raises
# a TypeError on them instead of silently leaving them out.
comm_revs_df2.mean(numeric_only=True)

CourseQuality         2.969829
InstructorQuality     3.225632
Difficulty            2.367649
AmountLearned         2.955478
WorkRequired          2.505931
StimulateInterest     2.988322
InstructorAccess      3.155033
CommAbility           3.200898
ReadingsValue         2.622281
TAQuality             2.966316
RecommendMajor        3.246091
RecommendNonMajor     2.377171
num_reviewers        29.692105
num_students         34.132895
dtype: float64

#### And how did these means change over time?

In [21]:
# Per-semester mean of every numeric column. numeric_only=True excludes the text
# columns explicitly; pandas >= 2.0 raises a TypeError on them instead of
# silently leaving them out.
comm_revs_df2.groupby('Semester').mean(numeric_only=True)

Unnamed: 0_level_0,CourseQuality,InstructorQuality,Difficulty,AmountLearned,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
Semester,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002A,2.803684,3.074211,2.326842,2.737368,2.437895,2.720526,3.169474,3.086316,2.645263,,3.262632,2.297368,34.631579,45.578947
2002C,2.824118,3.115882,2.357059,2.794706,2.57,2.728235,3.106471,3.053529,2.594118,,3.159412,2.277059,33.117647,38.764706
2003A,2.802143,3.070714,2.395,2.825714,2.452857,2.81,2.958571,3.119286,2.484286,,3.207143,2.439286,31.571429,44.142857
2003B,3.195,3.195,2.395,3.15,2.51,3.045,3.435,3.07,3.12,,3.64,2.52,12.5,14.0
2003C,2.871765,3.228824,2.219412,2.829412,2.384118,2.898235,3.141765,3.134118,2.508235,,3.300588,2.357647,34.176471,43.235294
2004A,3.001429,3.232857,2.417857,2.92,2.47,2.945,3.007857,3.241429,2.645,,3.263571,2.456429,45.214286,57.142857
2004C,3.067059,3.330588,2.402353,2.999412,2.517059,2.982941,3.166471,3.295882,2.662353,,3.429412,2.585882,36.941176,47.0
2005A,2.908125,3.189375,2.4925,2.82875,2.55125,2.9425,3.044375,3.171875,2.57875,,3.24125,2.405,32.9375,41.75
2005B,3.015714,3.288571,2.445714,3.162857,2.674286,3.218571,3.631429,3.251429,2.691429,,3.31,2.608571,14.714286,16.714286
2005C,2.900588,3.204118,2.459412,2.818824,2.475294,2.881176,3.189412,3.126471,2.475882,,3.237059,2.114118,42.352941,50.411765


#### Summer courses may have a different vibe altogether. We can do the above analysis procedure without summer courses.

We will have to make a non-summer courses dataframe by looking for the letters A (for spring) and C (for fall) in the semesters column.

In [22]:
# Create a subset of the data that does not include summer courses:
# keep rows whose 'Semester' string ends in A (spring) or C (fall).
# The pattern is anchored with $ so the letter is only matched at the end of
# the string, as intended, rather than anywhere inside it.

comm_nosummer = comm_revs_df2[comm_revs_df2['Semester'].str.contains(r"[AC]$")]

# numeric_only=True excludes the text columns explicitly; pandas >= 2.0 raises
# a TypeError on them instead of silently leaving them out.
comm_nosummer.groupby('Semester').mean(numeric_only=True)

Unnamed: 0_level_0,CourseQuality,InstructorQuality,Difficulty,AmountLearned,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
Semester,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002A,2.803684,3.074211,2.326842,2.737368,2.437895,2.720526,3.169474,3.086316,2.645263,,3.262632,2.297368,34.631579,45.578947
2002C,2.824118,3.115882,2.357059,2.794706,2.57,2.728235,3.106471,3.053529,2.594118,,3.159412,2.277059,33.117647,38.764706
2003A,2.802143,3.070714,2.395,2.825714,2.452857,2.81,2.958571,3.119286,2.484286,,3.207143,2.439286,31.571429,44.142857
2003C,2.871765,3.228824,2.219412,2.829412,2.384118,2.898235,3.141765,3.134118,2.508235,,3.300588,2.357647,34.176471,43.235294
2004A,3.001429,3.232857,2.417857,2.92,2.47,2.945,3.007857,3.241429,2.645,,3.263571,2.456429,45.214286,57.142857
2004C,3.067059,3.330588,2.402353,2.999412,2.517059,2.982941,3.166471,3.295882,2.662353,,3.429412,2.585882,36.941176,47.0
2005A,2.908125,3.189375,2.4925,2.82875,2.55125,2.9425,3.044375,3.171875,2.57875,,3.24125,2.405,32.9375,41.75
2005C,2.900588,3.204118,2.459412,2.818824,2.475294,2.881176,3.189412,3.126471,2.475882,,3.237059,2.114118,42.352941,50.411765
2006A,2.98,3.265,2.43125,2.895625,2.556875,2.9175,3.13,3.25625,2.62625,,3.245,2.324375,37.375,47.5625
2006C,3.024375,3.3375,2.488125,2.94875,2.64,3.0675,3.225625,3.313125,2.576875,,3.388125,2.469375,40.375,52.5


#### What correlations can we see between reviews in different categories?

In [23]:
# Pairwise correlations between the rating/count columns. The numeric columns
# are selected explicitly because corr() on a frame that still contains text
# columns raises an error in pandas >= 2.0 instead of skipping them.
comm_nosummer.select_dtypes(include='number').corr()

Unnamed: 0,CourseQuality,InstructorQuality,Difficulty,AmountLearned,WorkRequired,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students
CourseQuality,1.0,0.919831,0.092403,0.89966,0.108361,0.912868,0.546393,0.889546,0.684639,0.348113,0.86837,0.709451,-0.159587,-0.170281
InstructorQuality,0.919831,1.0,0.064127,0.821691,0.105471,0.919087,0.632354,0.929821,0.595534,0.313895,0.82841,0.633118,-0.155523,-0.165039
Difficulty,0.092403,0.064127,1.0,0.222857,0.779739,0.061489,-0.0425,0.071461,0.153255,-0.003397,0.014992,-0.145721,0.034093,0.038213
AmountLearned,0.89966,0.821691,0.222857,1.0,0.21626,0.829168,0.499151,0.82038,0.726558,0.276898,0.794125,0.609065,-0.165588,-0.176756
WorkRequired,0.108361,0.105471,0.779739,0.21626,1.0,0.076057,0.112877,0.096095,0.168426,0.147535,0.006789,-0.252627,-0.189109,-0.180321
StimulateInterest,0.912868,0.919087,0.061489,0.829168,0.076057,1.0,0.521308,0.905718,0.628297,0.29159,0.831403,0.703331,-0.089101,-0.11672
InstructorAccess,0.546393,0.632354,-0.0425,0.499151,0.112877,0.521308,1.0,0.586202,0.421563,0.305334,0.508155,0.315645,-0.340277,-0.340139
CommAbility,0.889546,0.929821,0.071461,0.82038,0.096095,0.905718,0.586202,1.0,0.641001,0.349735,0.822837,0.651621,-0.100136,-0.108248
ReadingsValue,0.684639,0.595534,0.153255,0.726558,0.168426,0.628297,0.421563,0.641001,1.0,0.293327,0.564114,0.491229,-0.185204,-0.188079
TAQuality,0.348113,0.313895,-0.003397,0.276898,0.147535,0.29159,0.305334,0.349735,0.293327,1.0,0.290244,0.145523,-0.157227,-0.165168


#### Initial insights from above:
* Predictably, course quality and instructor quality are very tightly correlated (.92)
* Difficulty is not very highly correlated with course quality or instructor quality at all. COMM students are capable of distinguishing between a 'good' class and an easy one.
* Value of Readings correlates pretty highly with Amount Learned, suggesting readings are an important part of a course.
* As Number of Students increases, Instructor Access (predictably) falls, and there are small negative correlations with Course Quality and Instructor Quality. 


#### How many students enroll in COMM classes each semester?

Grouping by semester and giving the number of students for each:

In [24]:
# Total enrolment per non-summer semester: keep only the two columns involved,
# then sum num_students within each Semester. reset_index() is for display only;
# comm_numstudents itself stays indexed by Semester.
comm_numstudents = comm_nosummer[['Semester', 'num_students']].groupby('Semester').sum()
comm_numstudents.reset_index()

Unnamed: 0,Semester,num_students
0,2002A,866
1,2002C,659
2,2003A,618
3,2003C,735
4,2004A,800
5,2004C,799
6,2005A,668
7,2005C,857
8,2006A,761
9,2006C,840


#### How many COMM courses has each instructor taught?

In [25]:
# Number of rows per instructor surname, largest first
# NOTE(review): grouping on the surname alone pools any instructors who share a
# last name — confirm surnames are unique, or group on first + last name
comm_revs_df2.groupby('instructor_last').size().sort_values(ascending=False)

instructor_last
EISENHOWER           44
JORDAN               42
TUROW                25
MESSARIS             25
PAXTON               21
MARVIN               21
ROMANO               21
FELZENBERG           20
HUNT                 18
JAMIESON             17
BLEAKLEY             17
CAPPELLA             14
PEARL                13
HAAS                 12
JACKSON              12
THEOPHANO            12
LINEBARGER           10
KATZ                  9
SENDER                9
PRICE                 9
ZELIZER               9
YANG                  9
GARRY                 9
WOOLF                 9
WINNEG                9
HART                  9
HAMPTON               8
DUTWIN                8
BOOTH                 8
MOEHLER               7
                     ..
SHAW                  1
HENRICHSEN            1
THORSON               1
MAURANTONIO           1
RICKEY                1
FISHMAN               1
FRAZE                 1
FRERES                1
GABBADON              1
WALDMAN               1


#### How many times has each COMM course been offered?

In [26]:
# Number of rows per course number, most frequent first
# NOTE(review): 'level' is the middle part of CourseCode regardless of department, so a
# cross-listed code such as 'ANTH-123-401' is counted together with 'COMM-123-...' — confirm this is intended
comm_revs_df2['level'].value_counts()

395    35
125    32
130    31
225    28
123    26
299    25
398    25
275    19
226    19
323    17
290    16
322    16
262    15
300    15
491    14
339    12
340    12
374    11
397    11
237    10
495    10
494     9
230     9
399     9
390     9
210     9
377     8
140     8
413     8
330     7
       ..
349     1
175     1
302     1
206     1
445     1
205     1
421     1
315     1
278     1
191     1
287     1
348     1
326     1
409     1
294     1
381     1
260     1
531     1
418     1
105     1
353     1
213     1
368     1
561     1
420     1
337     1
241     1
382     1
307     1
352     1
Name: level, Length: 161, dtype: int64

#### Which level of COMM classes is offered the most often?

In [27]:
# creating a new column holding the hundred level of each course:
# the first character of the course number, kept as a string ('123' -> '1')
comm_revs_df2['hundred'] = comm_revs_df2['level'].str.slice(0, 1)

In [28]:
# within those what does the breakdown look like?
# by_level is kept as a GroupBy object so the next cell can reuse it; size() counts the rows in each hundred level
by_level = comm_revs_df2.groupby('hundred')
by_level.size()

hundred
0      3
1    112
2    207
3    301
4    135
5      2
dtype: int64

In [29]:
# within those levels which courses are offered how many times?
# value_counts on the grouped 'level' column gives a Series indexed by (hundred, level)
counts_by_level = by_level['level'].value_counts()
counts_by_level

hundred  level
0        024       3
1        125      32
         130      31
         123      26
         140       8
         141       7
         150       2
         105       1
         108       1
         110       1
         112       1
         175       1
         191       1
2        225      28
         299      25
         226      19
         275      19
         290      16
         262      15
         237      10
         210       9
         230       9
         238       6
         292       5
         245       4
         211       3
         224       3
         281       3
         282       3
         296       3
                  ..
4        408       3
         415       3
         416       3
         481       3
         496       3
         406       2
         407       2
         411       2
         419       2
         433       2
         441       2
         454       2
         459       2
         485       2
         409       1
         412       

We have finally arrived at a solid understanding of the cleaned data:

In [30]:
# Final look at the cleaned frame, which now also carries the 'hundred' column
comm_revs_df2

Unnamed: 0,CourseCode,level,Section,CourseTitle,Semester,instructor_first,instructor_last,CourseQuality,InstructorQuality,Difficulty,...,StimulateInterest,InstructorAccess,CommAbility,ReadingsValue,TAQuality,RecommendMajor,RecommendNonMajor,num_reviewers,num_students,hundred
0,COMM-123-001,123,001,COMM & POPULAR CULTURE,2002A,JESSICA,FISHMAN,2.43,2.88,2.28,...,2.81,2.84,2.88,2.24,,3.21,2.53,83,89,1
1,COMM-125-001,125,001,COMMUNICATION BEHAVIOR,2002A,VINCENT,PRICE,2.11,2.62,2.27,...,1.92,2.62,2.84,2.14,,3.15,1.47,133,181,1
2,COMM-125-601,125,601,COMMUNICATION BEHAVIOR,2002A,MARIAELENA,BARTESAGHI,2.74,3.17,2.13,...,3.30,3.61,3.17,3.17,,3.17,2.09,23,26,1
3,COMM-175-401,175,401,ARGUMENT & PUBL ADVOCACY,2002A,KATHRYN,KOLBERT,1.89,2.26,1.49,...,1.79,2.34,2.37,1.99,,2.42,1.86,73,112,1
4,COMM-225-001,225,001,CHILDREN & MEDIA,2002A,AMY B.,JORDAN,3.20,3.49,2.13,...,3.07,3.24,3.49,2.49,,3.60,2.87,75,89,2
5,COMM-262-601,262,601,VISUAL COMMUNICATION,2002A,EMILY,WEST,2.48,2.45,2.17,...,2.13,3.09,2.52,2.22,,3.13,2.30,23,24,2
6,COMM-280-601,280,601,COMM & GLOBALIZATION,2002A,JOSEPHINE,FERRIGNO,3.17,3.56,2.17,...,3.11,3.67,3.61,2.56,,3.50,3.28,18,25,2
7,COMM-299-001,299,001,COMMUNICATIONS INTERNSHP,2002A,RONA J.,BUCHALTER,1.46,2.15,2.08,...,1.15,3.08,2.31,1.86,,2.46,1.15,13,14,2
8,COMM-322-301,322,301,FREEDOM OF EXPRESSION,2002A,CAROLYN A,MARVIN,3.47,3.71,3.06,...,3.65,2.94,3.65,3.35,,3.59,2.47,17,18,3
9,COMM-322-601,322,601,FREEDOM OF EXPRESSION,2002A,MARGARET LOUISE,WOODSTOCK,2.82,3.06,2.47,...,2.65,3.50,2.94,3.06,,3.12,2.06,17,18,3


In [31]:
# saving this df to the cleaned data folder (written as CSV, without the row index)
# NOTE(review): 'level', 'Section' and 'hundred' are strings, some with leading zeros
# (e.g. '024', '001'); a plain pd.read_csv will presumably parse them as integers —
# read the file back with dtype=str for those columns, and confirm the directory exists
comm_revs_df2.to_csv('../data/clean_data/comm_revs_df2',index=False)