# ICE 2: Natural Language Processing
### Nikita Tejwani
### HUDK 4051: Learning Analytics

In [20]:
#Import necessary packages
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
df = pd.read_csv('ICE2_data_eval.csv')

#data wrangling
#the category of the evaluation is made into its own column
#the scores and comments for each category are concatenated into only two columns

#category labels, listed in the order their column pairs are sliced from the raw
#file: pair 0 is columns 0-1, pair 1 is columns 2-3, and so on
EVAL_CATEGORIES = ['teaching', 'content', 'exams', 'lab', 'library', 'extracurricular']


def _category_frame(raw, pair_position, category):
    """Build the long-format frame for one evaluation category.

    Parameters
    ----------
    raw : pandas.DataFrame
        The wide evaluation table as read from the CSV.
    pair_position : int
        Zero-based position of the category's column pair; the pair is taken
        from columns ``2 * pair_position`` and ``2 * pair_position + 1``.
    category : str
        Label written into the ``category`` column of every row.

    Returns
    -------
    pandas.DataFrame
        A copy of the two selected columns renamed to ``score`` and
        ``comments``, followed by a constant ``category`` column.

    Raises
    ------
    ValueError
        If the slice does not contain exactly two columns (raised by pandas
        when the new column names are assigned).
    """
    first_col = 2 * pair_position
    #copy so that adding the category column never touches the raw table
    pair = raw.iloc[:, first_col:first_col + 2].copy()
    #rename by position: a header that differs from the expected spelling
    #(for example only in capitalisation) cannot silently be left unrenamed
    pair.columns = ['score', 'comments']
    pair['category'] = category
    return pair


_frames_by_category = {
    label: _category_frame(df, position, label)
    for position, label in enumerate(EVAL_CATEGORIES)
}

#keep the individual per-category names that the rest of the notebook uses
evals_teaching = _frames_by_category['teaching']
evals_content = _frames_by_category['content']
evals_exams = _frames_by_category['exams']
evals_lab = _frames_by_category['lab']
evals_library = _frames_by_category['library']
evals_extracurricular = _frames_by_category['extracurricular']

In [3]:
#stack the six per-category frames on top of one another into one long table
#note: every frame keeps its own row labels, so the labels repeat once per category
per_category_evals = [
    evals_teaching,
    evals_content,
    evals_exams,
    evals_lab,
    evals_library,
    evals_extracurricular,
]
evals = pd.concat(per_category_evals, axis = 0)

In [18]:
#define a function that will clean the text of the comments
def clean_text(text):
    """Normalise one comment before it is tokenised.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and every run of the digits 0-9 is removed. Whitespace is left
    untouched.

    Parameters
    ----------
    text : str, or a missing value (``None`` / NaN), or any other object
        The raw comment. Missing values yield an empty string; any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned comment.
    """
    #pandas represents an empty CSV cell as NaN (a float), which has no .lower();
    #NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    #re.escape keeps characters such as ']' and '\' literal inside the character class
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[0-9]+', '', text)
    return text

#run every comment through clean_text and store the cleaned text back in the same column
evals['comments'] = evals['comments'].apply(clean_text)

In [22]:
#Count vectorizer finds the count of each word and creates a document-term matrix
#(one row per comment, one column per vocabulary word; English stop words are dropped)
cv = CountVectorizer(stop_words = 'english')
comment_cv = cv.fit_transform(evals.comments)

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use get_feature_names_out() when it exists and fall back only on older installs
if hasattr(cv, 'get_feature_names_out'):
    vocabulary = cv.get_feature_names_out()
else:
    vocabulary = cv.get_feature_names()

#reuse the row labels of evals so each matrix row lines up with its comment
dtm = pd.DataFrame(comment_cv.toarray(), columns = vocabulary, index = evals.comments.index)
dtm

Unnamed: 0,abilities,ability,able,abroad,absolutely,absurd,abt,academic,accessable,accitivties,...,works,world,worth,write,writing,wrong,yeah,year,years,yes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
182,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
#exploratory analysis
#non-null counts for each score, broken out by category
#display() is required here: a cell only renders its final expression on its own,
#so without it this table is computed and then thrown away
display(evals.groupby(['score', 'category']).count().unstack())
#comments are overwhelmingly positive

#total occurrences of every vocabulary word across all comments
word_counts = dtm.sum()
#the 30 most frequent words, most frequent first
word_counts.sort_values(ascending = False).head(30)

good           654
excellent       74
students        62
university      61
library         48
books           47
course          43
pattern         40
teachers        39
lab             39
activities      37
knowledge       36
time            32
teaching        31
content         31
work            30
paper           30
checking        30
courses         29
average         29
exam            29
practical       27
marks           27
facilities      26
delivery        26
interaction     25
material        24
like            23
depth           23
lecture         22
dtype: int64

In [19]:
#preview the first five rows of the combined evaluations table
evals.head(5)

Unnamed: 0,score,comments,category
0,0,teacher are punctual but they should also give...,teaching
1,1,good,teaching
2,1,excellent lectures are delivered by teachers a...,teaching
3,1,good,teaching
4,1,teachers give us all the information required ...,teaching
