In [2]:
import pandas as pd
import numpy as np
from umap import UMAP
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the preprocessed training frame plus the three embedding tables
# (prompt text, prompt question, student summary) in a single pass.
df_full, df_text_embeddings, df_question_embeddings, df_summary_embeddings = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './train_preprocessed.parquet',
        './text_emb.parquet',
        './question_emb.parquet',
        './sum_emb.parquet',
    )
)

In [5]:
# Preview the raw embedding tables: each column is labelled with an id and
# each row is one embedding dimension.
display(
    df_text_embeddings.head(2),
    df_question_embeddings.head(2),
    df_summary_embeddings.head(2),
)

Unnamed: 0,39c16e,3b9047,814d6b,ebad26
0,-0.044322,-0.083477,-0.009068,-0.052936
1,0.080737,0.105279,-0.04355,0.096409


Unnamed: 0,39c16e,3b9047,814d6b,ebad26
0,0.06266,-0.106909,-0.068061,-0.045714
1,0.040994,0.075356,0.020131,0.103395


Unnamed: 0,000e8c3c7ddb,0020ae56ffbf,004e978e639e,005ab0199905,0070c9e7af47,0071d51dab6d,0072b649a88c,00746c7c79c3,00791789cc1f,0086ef22de8f,...,ff53b94f7ce0,ff5b8d659ca6,ff5e9e6068da,ff66def9f97c,ff78813d4f7b,ff7c7e70df07,ffc34d056498,ffd1576d2e1b,ffe4a98093b2,fffbccfd8a08
0,-0.032302,-0.021375,-0.094899,-0.030733,-0.045873,-0.043784,-0.059126,-0.030213,0.047673,0.064928,...,-0.028008,-0.018983,0.013535,-0.042548,-0.009163,-0.077723,-0.023761,-0.077284,0.048355,-0.016168
1,-0.061624,0.085844,0.086755,0.038335,-0.079049,0.07982,0.066839,0.046194,0.006142,-0.006347,...,0.065659,-0.012204,0.030358,0.053454,0.057063,0.104824,0.02909,0.086182,-0.002546,0.102556


In [6]:
# Peek at the preprocessed training frame to see which columns are available
# (ids, raw/fixed summary text, targets, prompt fields and overlap features).
df_full.head(2)

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,word_overlap_ratio
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,0.09697,14,4,0.063492,0,0.0,0,0.21875
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,0.050186,18,22,0.415094,10,0.192308,0,0.333333


In [7]:
def _paired_cosine(left, right):
    """Return the cosine similarity between matching rows of two 2-D arrays.

    Parameters
    ----------
    left, right : array-like of shape (n_rows, n_dims)
        Row ``i`` of ``left`` is compared with row ``i`` of ``right``.

    Returns
    -------
    np.ndarray of shape (n_rows,)
        One similarity per row pair. A pair that involves an all-zero vector
        scores 0.0 (the same convention scikit-learn's ``cosine_similarity``
        uses) instead of producing NaN.
    """
    left = np.asarray(left, dtype=float)
    right = np.asarray(right, dtype=float)
    # Row-wise dot product without building the full n_rows x n_rows matrix.
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)


# The embedding frames hold one column per id, so pick the column belonging to
# every row of df_full and transpose to (n_rows, n_dims). Label-based selection
# raises a KeyError listing the unknown ids, whereas DataFrame.get would quietly
# hand back None and fail later with an unrelated error.
summary_vectors = df_summary_embeddings[df_full['student_id'].tolist()].to_numpy().T
text_vectors = df_text_embeddings[df_full['prompt_id'].tolist()].to_numpy().T
question_vectors = df_question_embeddings[df_full['prompt_id'].tolist()].to_numpy().T

# Similarity of each student summary to its prompt text and to its prompt
# question; the summary vectors are gathered once and reused for both features.
df_full = df_full.assign(
    cos_text=_paired_cosine(text_vectors, summary_vectors),
    cos_question=_paired_cosine(question_vectors, summary_vectors),
)


In [8]:
# Check that the cos_text and cos_question columns were appended to the frame.
df_full.head(2)

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,splling_err_num,prompt_question,prompt_title,...,length_ratio,word_overlap_count,bigram_overlap_count,bigram_overlap_ratio,trigram_overlap_count,trigram_overlap_ratio,quotes_count,word_overlap_ratio,cos_text,cos_question
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,5,Summarize how the Third Wave developed over su...,The Third Wave,...,0.09697,14,4,0.063492,0,0.0,0,0.21875,0.75926,0.720335
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,...,0.050186,18,22,0.415094,10,0.192308,0,0.333333,0.488938,0.61743


In [12]:
# Transpose each embedding table so that every row is one embedding vector and
# the id that used to be the column label becomes an ordinary column.
text_embeddings = df_text_embeddings.T.reset_index(names='prompt_id')
summary_embeddings = df_summary_embeddings.T.reset_index(names='student_id')
question_embeddings = df_question_embeddings.T.reset_index(names='prompt_id')

# student_id -> prompt_id lookup taken from the training frame.
ids_lookup = pd.Series(dict(zip(df_full['student_id'], df_full['prompt_id'])))

# Attach the prompt of every summary by matching on student_id. Inserting
# ids_lookup.values directly would rely on df_full rows and the embedding
# columns sharing the exact same order, and would mislabel rows if they differ.
summary_embeddings.insert(
    1, 'prompt_id', summary_embeddings['student_id'].map(ids_lookup)
)

assert summary_embeddings['prompt_id'].notna().all(), (
    'summary embedding found for a student_id that is missing from df_full'
)

In [14]:
# Preview the transposed tables: one row per id, one column per embedding
# dimension.
display(
    text_embeddings.head(2),
    question_embeddings.head(2),
    summary_embeddings.head(2),
)

Unnamed: 0,prompt_id,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,39c16e,-0.044322,0.080737,0.031065,-0.011584,-0.009389,0.052562,0.034647,0.024612,0.009266,...,0.066465,-0.074595,-0.01164,0.113665,-0.070106,0.037373,0.124775,0.00318,0.01088,-0.092459
1,3b9047,-0.083477,0.105279,0.030022,0.02278,-0.049433,-0.031991,0.033304,-0.00734,-0.004248,...,-0.028389,-0.129694,-0.083241,0.066339,0.051831,-0.064965,0.088054,0.030493,-0.020967,-0.048807


Unnamed: 0,prompt_id,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,39c16e,0.06266,0.040994,0.00929,-0.045406,-0.014459,0.048434,0.043146,-0.018924,0.070947,...,0.100834,-0.092985,0.056858,0.070753,0.006946,-0.022716,0.077385,0.060104,0.017773,-0.043886
1,3b9047,-0.106909,0.075356,-0.005969,-0.024445,-0.07035,-0.029551,-0.021563,-0.023368,-0.07202,...,-0.012987,-0.135993,-0.062401,0.074946,0.054726,-0.058614,0.058162,0.110013,0.003424,-0.006537


Unnamed: 0,student_id,prompt_id,0,1,2,3,4,5,6,7,...,374,375,376,377,378,379,380,381,382,383
0,000e8c3c7ddb,814d6b,-0.032302,-0.061624,-0.039432,-0.024158,0.010305,-0.018682,-0.032586,-0.018416,...,0.038178,0.051087,-0.078121,-8.2e-05,-0.135634,0.000321,0.088186,0.019669,-0.067355,0.07477
1,0020ae56ffbf,ebad26,-0.021375,0.085844,0.033094,0.07163,0.079395,-0.051939,-0.025265,-0.069342,...,0.017595,0.003423,0.043537,0.00686,0.093196,-0.034472,0.054477,-0.008644,-0.05842,0.015333
