# Feature Learning

The final step of our analysis will use likert scale data submitted with the reviews to examine teacher ratings. First we will review the 

In [1]:
import os
import pandas as pd
import bear_necessities as bn
import visuals as vs 
import numpy as np 

from importlib import reload
vs = reload(vs)

**Lead the data:**

In [2]:
data = bn.decompress_pickle(os.getcwd() + '/data/review_stats.pbz2')

# Create dummy variables for year to illustrate data availability over time 
data=pd.concat([data,
                pd.get_dummies(data['DateTime'].apply(lambda x: x.split('-')[0]),
                               prefix='Year')],
                axis = 1)

**List the variables you want to use and display their availability over time:**

In [3]:
characteristics = ['Clarity',
                   'Easiness',
                   'Exam Difficulty',
                   'Helpfulness',
                   'Knowledge',
                   'Textbook Use',
                   'Determination',
                   'Effective',
                   'Empathy',
                   'Homework',
                   'Integrity',
                   'Parent Relation',
                   'Respect']

# Show the availability of variables across years 
yvars = [c for c in data.columns if 'Year_' in c]
corrs = data[characteristics + yvars].corr()
corrs.loc[characteristics, yvars]

Unnamed: 0,Year_2001,Year_2002,Year_2003,Year_2004,Year_2005,Year_2006,Year_2007,Year_2008,Year_2009,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018
Clarity,0.000527,0.001016,0.006069,-0.008172,-0.004438,-0.008973,-0.008382,-0.009673,0.0016,0.041908,0.023779,0.004017,-0.003509,0.001357,-0.021754,-0.013722,-0.016546,-0.004719
Easiness,-0.013498,-0.003028,-0.001202,-0.013927,-0.015329,-0.015031,-0.015084,-0.013463,0.037498,0.08891,0.058987,0.017889,-0.047095,-0.027665,-0.036051,-0.03422,-0.029205,-0.006071
Exam Difficulty,,,,,,,,,,,,,-0.014538,0.01067,-0.003946,-0.000193,0.000283,0.001059
Helpfulness,-0.00144,-0.000578,-0.001631,-0.015709,-0.009532,-0.012074,-0.011983,-0.014139,0.000373,0.040003,0.024262,0.006052,-0.000961,0.008771,-0.009562,0.003273,-0.001053,-0.002187
Knowledge,,,,,,,,,,,0.013741,0.010849,-0.00494,0.017439,-0.01909,-0.003748,-0.010738,-0.004327
Textbook Use,,,,,,,,,,,,,0.031407,0.059667,0.003305,-0.022251,-0.036976,-0.007009
Determination,,,,,,,,,,,,,,,0.001733,0.052486,-0.050895,-0.010435
Effective,,,,,,,,,,,,,,,0.002596,0.05532,-0.054122,-0.010677
Empathy,,,,,,,,,,,,,,,0.001939,0.054898,-0.0532,-0.011403
Homework,,,,,,,,,,,,,,,0.003382,0.034523,-0.034457,-0.007902


The above table shows that only `Helpfulness`, `Easiness` and `Clarity` have been around since 2001. The other variables were not added until after 2010. 

## Using Likert Scale Data 

We do not know whether a score of 1 for a single individual is equivalent to a score of 1 for another. To address this we will discretize the data, considering that a score above 3 is positive (0) and a score <= 3 is negative (1). This will make it easier to observe correlations as well as train a supervised learning language model. 

In [4]:
# discretize the variables and show their correlations 
df = data[characteristics]
for c in characteristics:
    df[c] = np.where(df[c]<=3, 1, df[c])
    df[c] = np.where((df[c]==4)|(df[c]==5), 0, df[c])    
    
vs = reload(vs)
df.corr()
# Graph the correlation table (not great with dark background behin the image)
# vs.correlation_table(df.corr())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,Clarity,Easiness,Exam Difficulty,Helpfulness,Knowledge,Textbook Use,Determination,Effective,Empathy,Homework,Integrity,Parent Relation,Respect
Clarity,1.0,0.269749,0.027049,0.741692,0.610178,0.114842,,,,,,,
Easiness,0.269749,1.0,0.094211,0.259937,0.206888,0.097866,,,,,,,
Exam Difficulty,0.027049,0.094211,1.0,0.035284,0.056171,0.236351,,,,,,,
Helpfulness,0.741692,0.259937,0.035284,1.0,0.652967,0.112798,,,,,,,
Knowledge,0.610178,0.206888,0.056171,0.652967,1.0,0.11739,,,,,,,
Textbook Use,0.114842,0.097866,0.236351,0.112798,0.11739,1.0,,,,,,,
Determination,,,,,,,1.0,0.927498,0.905849,0.695262,0.918491,0.926107,0.916368
Effective,,,,,,,0.927498,1.0,0.912802,0.695258,0.928452,0.94758,0.920622
Empathy,,,,,,,0.905849,0.912802,1.0,0.683193,0.926653,0.924614,0.92753
Homework,,,,,,,0.695262,0.695258,0.683193,1.0,0.68773,0.692589,0.688529


In [6]:
df['Clarity'].value_counts()

0.0    3770511
1.0    1090804
Name: Clarity, dtype: int64