In [1]:
import pandas as pd
import textstat
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
# from nltk.sentiment import SentimentIntensityAnalyzer

  from .autonotebook import tqdm as notebook_tqdm
2023-05-13 20:46:18.816439: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-13 20:46:19.337042: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-13 20:46:19.337103: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
# Tokenizer options passed along with every classifier call (see get_emotion).
tokenizer_kwargs = dict(padding=True, truncation=True, max_length=512)

# NOTE(review): device=0 presumably pins the model to the first GPU, so this
# cell would fail on a CPU-only machine -- confirm before running elsewhere.
classifier = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=0,
)

In [3]:
def get_vocabulary(text):
    """Return the number of distinct terms a default CountVectorizer finds in ``text``.

    Args:
        text: A single string, treated as a one-document corpus.

    Returns:
        int: Length of the fitted vectorizer's feature-name array.
    """
    fitted = CountVectorizer().fit([text])
    return len(fitted.get_feature_names_out())

def get_vocab_counter(text):
    """Return a Counter over the distinct terms a default CountVectorizer finds in ``text``.

    The Counter is built from the fitted feature names rather than from the
    raw tokens, so it records which terms occur, not how often they occur.

    Args:
        text: A single string, treated as a one-document corpus.

    Returns:
        collections.Counter: Keyed by the vectorizer's feature names.
    """
    fitted = CountVectorizer().fit([text])
    return Counter(fitted.get_feature_names_out())

def get_n_grams(text, numgram):
    """Return the most frequent word n-gram of exactly ``numgram`` tokens in ``text``.

    Args:
        text: A single string, treated as a one-document corpus.
        numgram: Number of tokens per n-gram (used as both ends of ngram_range).

    Returns:
        The n-gram with the highest count. On a tie, the n-gram that comes
        first in the vectorizer's feature order is returned.
    """
    ngram_vectorizer = CountVectorizer(
        lowercase=False,
        # Inside the character class the '|' is a literal pipe, not alternation.
        token_pattern='[a-zA-Z0-9|\']+',
        analyzer='word',
        ngram_range=(numgram, numgram),
    )
    doc_term = ngram_vectorizer.fit_transform([text])
    ngrams = ngram_vectorizer.get_feature_names_out()
    counts = doc_term.toarray().ravel().tolist()
    # max() keeps the first maximal pair, matching a stable descending sort.
    top_ngram, _ = max(zip(ngrams, counts), key=lambda pair: pair[1])
    return top_ngram

def get_word_cloud(text):
    """Return up to 20 of the most frequent non-stop-word terms in ``text`` with their counts.

    Args:
        text: A single string, treated as a one-document corpus.

    Returns:
        dict: term -> count, inserted in descending count order (ties keep
        the vectorizer's feature order).
    """
    word_vectorizer = CountVectorizer(stop_words='english')
    doc_term = word_vectorizer.fit_transform([text])
    frequencies = zip(
        word_vectorizer.get_feature_names_out(),
        doc_term.toarray().ravel().tolist(),
    )
    ranked = sorted(frequencies, key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:20])

def get_emotion(text):
    """Run the module-level ``classifier`` on ``text`` and return the first result's label.

    The call forwards ``tokenizer_kwargs`` (padding, truncation, max_length)
    to the classifier.

    Args:
        text: The string to classify.

    Returns:
        The value stored under 'label' in the first element of the
        classifier's output.
    """
    predictions = classifier(text, **tokenizer_kwargs)
    first_prediction = predictions[0]
    return first_prediction['label']

# def get_sentiment(text):
#     sia = SentimentIntensityAnalyzer()
#     return sia.polarity_scores(text)['compound']

In [4]:
# import and transform raw data to pivot answers into single column:
# one row per (model, type, question, run), with the answer in 'text'
df = (
    pd.read_excel('data/data.xlsx')
    .rename(columns={'ans1': 1, 'ans2': 2, 'ans3': 3})
    .melt(id_vars=['model', 'type', 'question'], value_vars=[1, 2, 3])
    .rename(columns={'variable': 'run', 'value': 'text'})
)

# define dataset subsets from raw data by grouping and combining text strings
def _joined_text_by(group_keys):
    """Join the 'text' values of each ``group_keys`` group of ``df`` with '. '."""
    grouped_text = df.groupby(group_keys)['text']
    return grouped_text.apply(lambda answers: '. '.join(answers)).reset_index()


overall = _joined_text_by(['model', 'run'])
agg_overall = _joined_text_by(['model'])
categories = _joined_text_by(['model', 'type', 'run'])
agg_categories = _joined_text_by(['model', 'type'])
# per-answer frame: one row per melted answer, no aggregation
questions = df.copy()
agg_questions = _joined_text_by(['model', 'type', 'question'])

# get flesch kincaid grade level for each answer or group of answers
# (the column name keeps its original spelling: it is written out as a CSV header)
for frame in (overall, agg_overall, categories, agg_categories, questions, agg_questions):
    frame['readibility'] = frame.apply(
        lambda row: textstat.flesch_kincaid_grade(row['text']),
        axis=1,
    )

# get word count for each answer then get mean of answer length for each grouping
questions['word_count'] = questions.apply(
    lambda row: textstat.lexicon_count(row['text']),
    axis=1,
)

# The means are assigned by position: each target frame was built by grouping
# on the same keys followed by reset_index, so the group order lines up.
for frame, group_keys in (
    (overall, ['model', 'run']),
    (agg_overall, ['model']),
    (categories, ['model', 'type', 'run']),
    (agg_categories, ['model', 'type']),
    (agg_questions, ['model', 'type', 'question']),
):
    mean_word_count = questions.groupby(group_keys)['word_count'].mean()
    frame['word_count'] = mean_word_count.reset_index(drop=True)

# get vocabulary size for each answer or grouping of answers
for frame in (overall, agg_overall, categories, agg_categories, questions, agg_questions):
    frame['vocab'] = frame['text'].apply(get_vocabulary)

# get each run's unique vocabulary count
def _add_unique_vocab(frame, group_keys):
    """Add a 'unique_vocab' column to ``frame`` in place.

    For every row, the value is the number of terms in that row's vocabulary
    counter (from get_vocab_counter) that are absent from the counters of all
    other rows sharing the same ``group_keys`` values.

    Args:
        frame: DataFrame with a 'text' column and every column in ``group_keys``.
        group_keys: List of column names defining which rows are compared.
    """
    own_vocab = frame['text'].apply(get_vocab_counter)
    keyed = frame[group_keys].assign(vocab_count=own_vocab)
    # Summing Counters adds them term by term, giving one total per group.
    group_totals = keyed.groupby(group_keys)['vocab_count'].sum().reset_index()
    # Join on *all* grouping keys. Joining on 'model' alone when the totals
    # are also split by type/question is a many-to-many merge: it yields more
    # rows than ``frame`` and pairs rows with totals from the wrong group.
    row_totals = frame[group_keys].merge(group_totals, on=group_keys, how='left')['vocab_count']
    # total - own keeps only the terms contributed by the other rows of the
    # group; what is left of the total after removing those is unique to this row.
    frame['unique_vocab'] = [
        len(set(total).difference(set(total - own)))
        for total, own in zip(row_totals, own_vocab)
    ]


_add_unique_vocab(overall, ['model'])
_add_unique_vocab(categories, ['model', 'type'])
_add_unique_vocab(questions, ['model', 'type', 'question'])

# get most common ngram for aggregated answers
# (n-gram length shrinks with the amount of text aggregated: 8, 6, then 4 tokens)
for frame, ngram_size in ((agg_overall, 8), (agg_categories, 6), (agg_questions, 4)):
    frame['ngram'] = frame['text'].apply(lambda text: get_n_grams(text, ngram_size))

# get set of most common words for aggregated answers
for frame in (agg_overall, agg_categories, agg_questions):
    frame['word_cloud'] = frame['text'].apply(get_word_cloud)

# get emotion of each answer (one classifier call per row of `questions`)
answer_text = questions['text']
questions['emotion'] = answer_text.apply(get_emotion)

# drop text column (done in place so the module-level names keep pointing at the trimmed frames)
for frame in (questions, agg_questions, categories, agg_categories, overall, agg_overall):
    frame.drop(columns=['text'], inplace=True)



In [5]:
# output to csv: one file per frame, without the index column
csv_targets = (
    (questions, 'data/questions.csv'),
    (agg_questions, 'data/agg_questions.csv'),
    (categories, 'data/categories.csv'),
    (agg_categories, 'data/agg_categories.csv'),
    (overall, 'data/overall.csv'),
    (agg_overall, 'data/agg_overall.csv'),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)