In [1]:
import ast
import os

import gdown
import joblib
import numpy as np
import pandas as pd
from keras.models import load_model

In [3]:
# Source data shared on Google Drive:
# https://drive.google.com/file/d/1-yhQR5_KRVQhnc1YZxrIbuc-JZnPmpbu/view?usp=sharing
emails_csv_path = '04_all_emails_with_labelled_samples.csv'

# Download only when the file is not already next to the notebook, so that
# "Restart & Run All" does not fetch the ~136 MB CSV again on every run.
if not os.path.exists(emails_csv_path):
    gdown.download('https://drive.google.com/uc?id=1-yhQR5_KRVQhnc1YZxrIbuc-JZnPmpbu', emails_csv_path, quiet=False)

# Local alternative: pd.read_csv('../data/04_all_emails_with_labelled_samples.csv')
all_emails = pd.read_csv(emails_csv_path)

Downloading...
From (original): https://drive.google.com/uc?id=1-yhQR5_KRVQhnc1YZxrIbuc-JZnPmpbu
From (redirected): https://drive.google.com/uc?id=1-yhQR5_KRVQhnc1YZxrIbuc-JZnPmpbu&confirm=t&uuid=a29ffe38-3014-4a36-b586-cd35ab402e95
To: /Users/pkchoy/code/data_science_bootcamp/predictor-with-files/notebooks/04_all_emails_with_labelled_samples.csv
100%|██████████| 136M/136M [00:07<00:00, 18.2MB/s] 


## Select the embeddings of emails that are not labelled

In [4]:
# Count which Python types occur in the DISC_final column
disc_value_types = all_emails['DISC_final'].map(type)
print(disc_value_types.value_counts())

DISC_final
<class 'str'>    64442
Name: count, dtype: int64


In [5]:
def parse_disc_labels(value):
    """Turn one ``DISC_final`` cell into a Python list of labels.

    Parameters
    ----------
    value : object
        Raw cell content. Strings that start with ``'['`` are parsed with
        ``ast.literal_eval``; lists are returned unchanged; anything else is
        treated as "no label".

    Returns
    -------
    list
        The parsed labels, or ``[]`` when the cell holds no list literal.
    """
    # Already parsed (e.g. the cell was re-run): keep the labels instead of
    # falling through to the "not a string" branch and wiping them to [].
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.startswith('['):
        return ast.literal_eval(value)
    return []


# convert DISC_final to list
all_emails['DISC_final'] = all_emails['DISC_final'].apply(parse_disc_labels)
print(all_emails['DISC_final'].value_counts())

DISC_final
[]     62054
[S]      674
[C]      629
[I]      582
[D]      503
Name: count, dtype: int64


In [6]:
# Boolean mask: True for rows whose DISC_final list is empty (no label)
not_labelled_mask = all_emails['DISC_final'].map(lambda labels: len(labels) == 0)

# Sanity check: every selected row should show an empty list
unlabelled_disc = all_emails.loc[not_labelled_mask, 'DISC_final']
print(unlabelled_disc.value_counts())


DISC_final
[]    62054
Name: count, dtype: int64


In [7]:
# Embedding matrix shared on Google Drive:
# https://drive.google.com/file/d/1gQgUlhyQBV239iHLdj5q5KvnxorliPgV/view?usp=sharing
embeddings_path = 'X_all_bert.npy'

# Download only when the file is not already next to the notebook, so that
# "Restart & Run All" does not fetch the ~198 MB array again on every run.
if not os.path.exists(embeddings_path):
    gdown.download('https://drive.google.com/uc?id=1gQgUlhyQBV239iHLdj5q5KvnxorliPgV', embeddings_path, quiet=False)

# Local alternative: np.load('../model_data/X_all_bert.npy')
X = np.load(embeddings_path)
print(X)

Downloading...
From (original): https://drive.google.com/uc?id=1gQgUlhyQBV239iHLdj5q5KvnxorliPgV
From (redirected): https://drive.google.com/uc?id=1gQgUlhyQBV239iHLdj5q5KvnxorliPgV&confirm=t&uuid=89c268df-8a2b-4634-89e5-7b3c9076cf48
To: /Users/pkchoy/code/data_science_bootcamp/predictor-with-files/notebooks/X_all_bert.npy
100%|██████████| 198M/198M [00:10<00:00, 18.3MB/s] 


[[-0.7299717  -0.49167195 -0.8787571  ... -0.8831387  -0.66493493
   0.798456  ]
 [-0.62676775 -0.42787698 -0.92575437 ... -0.74408275 -0.58848137
   0.7045142 ]
 [-0.61534685 -0.40817738 -0.69530636 ... -0.2173199  -0.6889094
   0.74455506]
 ...
 [-0.5463044  -0.50168735 -0.97612244 ... -0.85623616 -0.59687084
   0.34431025]
 [-0.74502814 -0.53911716 -0.97416407 ... -0.92620146 -0.6500866
   0.65626997]
 [-0.7310124  -0.5802631  -0.9622662  ... -0.8861105  -0.66297936
   0.69729215]]


In [8]:
# Keep only the embedding rows flagged by the boolean mask.
# NOTE(review): relies on X rows being in the same order as all_emails rows — confirm.
X_not_labelled = X[not_labelled_mask.to_numpy()]

## Predict DISC Labels with Logistic Regression

In [9]:
# Load the previously saved classifier.
# joblib.load unpickles the file, so only use it on model files you trust.
log_reg_model_path = '../models/log_reg_bootstrap.pkl'
log_reg = joblib.load(log_reg_model_path)

In [10]:
# Minimum probability for a class to be assigned as a label
threshold_log_reg = 0.2

# Per-class probabilities for every unlabelled embedding
y_pred_probi_log_reg = log_reg.predict_proba(X_not_labelled)
print(y_pred_probi_log_reg)

# Each class is thresholded independently, so a row can receive several
# labels (or none): 1 where the probability reaches the threshold, else 0.
meets_threshold = y_pred_probi_log_reg >= threshold_log_reg
y_pred_log_reg = meets_threshold.astype(int)

print(y_pred_log_reg)

[[0.22537586 0.32320042 0.14260739 0.13101938]
 [0.14248023 0.09783818 0.25768252 0.5436792 ]
 [0.25281834 0.10374325 0.26084311 0.17657075]
 ...
 [0.15141419 0.05027271 0.1471198  0.75255273]
 [0.09834303 0.0935619  0.49034689 0.30100052]
 [0.05465415 0.23861551 0.16082456 0.64775935]]
[[1 1 0 0]
 [0 0 1 1]
 [1 0 1 0]
 ...
 [0 0 0 1]
 [0 0 1 1]
 [0 1 0 1]]


In [11]:
# NOTE(review): not referenced by any other cell shown here — looks like a leftover placeholder; confirm before removing
disc_probi = ['']

In [12]:
# One 0/1 indicator column per DISC class.
# NOTE(review): assumes the model's output columns are ordered D, I, S, C — confirm against training.
disc_labels = list('DISC')
log_reg_df = pd.DataFrame(data=y_pred_log_reg, columns=disc_labels)

In [13]:
# Collect, per row, the names of all DISC columns flagged with 1.
# Iterating over disc_labels itself (instead of a hard-coded range(4)) keeps
# this cell in sync with the label list if it ever changes length.
log_reg_df['DISC'] = log_reg_df.apply(
    lambda row: [label for label in disc_labels if row[label] == 1],
    axis=1,
)

In [14]:
# Attach each row's embedding as a plain Python list of floats
embedding_rows = X_not_labelled.tolist()
log_reg_df['msg_embeddings'] = embedding_rows

# Show a random sample of 10 rows (unseeded, so it differs between runs)
log_reg_df.sample(n=10)

Unnamed: 0,D,I,S,C,DISC,msg_embeddings
47589,0,1,0,0,[I],"[-0.5900834202766418, -0.36523717641830444, -0..."
43960,0,0,1,1,"[S, C]","[-0.4259207546710968, -0.4765876531600952, -0...."
46737,0,0,1,1,"[S, C]","[-0.754563570022583, -0.6029690504074097, -0.9..."
35473,0,1,0,1,"[I, C]","[-0.6870654821395874, -0.2957246005535126, -0...."
53237,0,0,1,1,"[S, C]","[-0.6548557281494141, -0.4832277297973633, -0...."
2790,0,0,1,1,"[S, C]","[-0.8878628611564636, -0.5804175138473511, -0...."
19538,0,1,1,0,"[I, S]","[-0.3763449788093567, -0.4020101726055145, -0...."
3823,1,0,1,0,"[D, S]","[-0.6466488838195801, -0.5353425741195679, -0...."
31495,0,0,0,1,[C],"[-0.5648511648178101, -0.5091506838798523, -0...."
51034,1,1,0,0,"[D, I]","[-0.3132682144641876, -0.3871641159057617, -0...."


In [15]:
# Write the predictions (indicator columns, DISC list, embeddings) to disk.
# NOTE(review): with index=False the file contains no row identifier; confirm
# that downstream code does not need to map rows back to all_emails.
predictions_csv_path = '../data/07_predicted_labels_all_emails.csv'
log_reg_df.to_csv(predictions_csv_path, index=False)