In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_rows = 100
pd.options.display.width = 150
RANDOM_SEED = 696


In [26]:
train_df = pd.read_csv('assets/WikiLarge_Train.csv')
train_feats_df = pd.read_csv('assets/train_features_df.csv').rename({'Unnamed: 0': 'idx'}, axis=1).set_index('idx')
score_feats_df = pd.read_csv('assets/score_features.csv').rename({'Unnamed: 0': 'idx'}, axis=1).set_index('idx')

In [27]:
df = train_df.join(train_feats_df).join(score_feats_df)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416768 entries, 0 to 416767
Data columns (total 61 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   original_text             416768 non-null  object 
 1   label                     416768 non-null  int64  
 2   sentence_word_count       416768 non-null  int64  
 3   sentence_dc_word_count    416768 non-null  int64  
 4   percent_dc_words          416768 non-null  float64
 5   sentence_dc_stword_count  416768 non-null  int64  
 6   percent_dc_stwords        416768 non-null  float64
 7   syl_count                 416768 non-null  int64  
 8   FK_Read_Ease              416768 non-null  float64
 9   FK_Read_Grade             416768 non-null  float64
 10  Dom_PoS_SUBTLEX           416768 non-null  object 
 11  AoA_NoMatch               416768 non-null  int64  
 12  AoA_Freq_pm_median        408218 non-null  float64
 13  AoA_Freq_pm_mean          408218 non-null  f

In [28]:
train_df, dev_df, test_df = np.split(df.sample(frac=1, random_state= RANDOM_SEED), 
                       [int(.8*len(df)), int(.9*len(df))], axis = 0)

## tfidf dbscan

In [29]:
custom_stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",     "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',     'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',     'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',     'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',     'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',     'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',     'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',     'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',     'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't",     'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',    "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",     'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',     "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'rrb', 'lrb']


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=25, max_df= .9, strip_accents='ascii',  analyzer='word', ngram_range=(1,3), lowercase=False, stop_words=custom_stop_words)

# Transform data and labels
X_train = vectorizer.fit_transform(train_df.original_text)

In [31]:
X_train.shape

(333414, 27095)

In [32]:
samp_size = int(X_train.shape[0] * 0.1)

In [33]:
samp = np.random.choice(X_train.shape[0], size=samp_size,replace=False)
samp.shape

(33341,)

In [10]:
X_train_samp = X_train[samp]

In [11]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=1)

neigh.fit(X_train_samp)

In [12]:
# np.median(neigh.kneighbors(X_train_samp)[:1])

In [13]:
from sklearn import metrics
# metrics.silhouette_score(df, df['labels'])

In [14]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, random_state=RANDOM_SEED).fit(X_train)
data2D = svd.transform(X_train) 

In [15]:
data2D

array([[ 0.00973815, -0.00166164],
       [ 0.01730446,  0.03953571],
       [ 0.10813404, -0.02413826],
       ...,
       [ 0.16321413, -0.01593165],
       [ 0.        ,  0.        ],
       [ 0.09643596,  0.35162675]])

In [16]:
data2D.max()

0.8781081744144819

## Classify using sent2vec by SBERT

In [35]:
# !pip install -U sentence-transformers

In [39]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading (…)001fa/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)bb8001fa/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)001fa/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b8001fa/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [40]:
import re
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)    

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [42]:
# split_pat = re.compile(r'\b\s+\b')
split_pat = re.compile(r'\w+')

df['og_split'] = df['original_text'].parallel_apply(lambda x: re.findall(split_pat, x))
df['cleaned_text'] = df['og_split'].parallel_apply(lambda x: ' '.join(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=52096), Label(value='0 / 52096')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=52096), Label(value='0 / 52096')))…

In [48]:
# Takes 10 years to run
# df['sent_vec'] = df['cleaned_text'].apply(lambda x: model.encode([x]))

In [49]:
# also takes 10 years to run
# df['sent_vec'].to_csv('assets/sent_vec.csv')

In [53]:
import numpy as np

In [60]:
df['sent_vec'] = df['sent_vec'].apply(np.ndarray.flatten)

In [65]:
sent_vec_np_array = np.array(df['sent_vec'].values.tolist())

In [72]:
np.savetxt('sent_vec_np_array.csv', sent_vec_np_array, delimiter=',')

In [68]:
normed_sent_vec_np_array = sent_vec_np_array / np.linalg.norm(sent_vec_np_array, axis=1, keepdims=True)

In [86]:
# https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py
from sentence_transformers import util
import torch
from tqdm import tqdm

normed_sent_vec_torch_array = torch.from_numpy(normed_sent_vec_np_array)

cls = util.community_detection(normed_sent_vec_torch_array, min_community_size=8, threshold=0.60)

cluster_sizes = [len(cluster) for cluster in cls]
print(f'number of obs clustered {sum(cluster_sizes)}')
print(f'number of clusters {len(cluster_sizes)}')

number of obs clustered 245827
number of clusters 13592


In [87]:
len(sent_vec_np_array)

416768

In [88]:
clust_dict = {n:-1 for n in range(len(sent_vec_np_array))}

for i, cluster in enumerate(cls):
    for n in cluster:
        if clust_dict.get(n) == -1:
            clust_dict[n] = i
        else:
            print('collision')

In [96]:
pd.DataFrame.from_dict(clust_dict, orient='index').rename({0:'sent_vec_class'}, axis=1).to_csv('assets/sent_vec_class.csv')

## SVC

In [2]:
sent_vec_np_array = np.loadtxt('sent_vec_np_array.csv', delimiter=',', dtype=np.float64)

In [3]:
sent_vec_np_array

array([[-0.14739437, -0.14400522,  0.30520254, ...,  0.186005  ,
         0.13216972,  0.0271847 ],
       [-0.3423788 ,  0.64353764,  0.09641959, ...,  0.03769291,
         0.25474536,  0.45148695],
       [ 0.11564089,  0.34492534, -0.12513059, ...,  0.12841401,
        -0.02132292, -0.15871231],
       ...,
       [ 0.2169555 ,  0.92392933, -0.21317533, ...,  0.4750323 ,
        -0.06613453,  0.3433581 ],
       [ 0.03890196, -0.05102391, -0.30264756, ...,  0.11241401,
         0.14131893,  0.16626678],
       [-0.05067663, -1.06192613, -0.72202456, ...,  0.11790838,
         0.18318109, -0.1881723 ]])

In [4]:
df = pd.DataFrame()
df['col_vecs'] = sent_vec_np_array.tolist()

In [5]:
train_df = pd.read_csv('assets/WikiLarge_Train.csv')

In [6]:
df = train_df.join(df)

In [7]:
df = df.drop('original_text', axis=1)
df

Unnamed: 0,label,col_vecs
0,1,"[-0.14739437401294708, -0.14400522410869598, 0..."
1,1,"[-0.34237879514694214, 0.6435376405715942, 0.0..."
2,1,"[0.11564088612794876, 0.3449253439903259, -0.1..."
3,1,"[0.19127874076366425, 0.21434128284454346, -0...."
4,1,"[0.25611141324043274, 0.002085210755467415, -0..."
...,...,...
416763,0,"[-0.2092808485031128, 0.1518959254026413, 0.27..."
416764,0,"[-0.21063463389873505, -0.08211549371480942, -..."
416765,0,"[0.2169554978609085, 0.9239293336868286, -0.21..."
416766,0,"[0.03890196233987808, -0.05102391168475151, -0..."


In [8]:
train_df, dev_df, test_df = np.split(df.sample(frac=1, random_state= RANDOM_SEED), 
                       [int(.8*len(df)), int(.9*len(df))], axis = 0)

In [9]:
import re
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)    

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
y_train = train_df['label'].parallel_apply(lambda x: np.array(x))
X_train = train_df['col_vecs'].parallel_apply(lambda x: np.array(x))
y_dev = train_df['label'].parallel_apply(lambda x: np.array(x))
X_dev = train_df['col_vecs'].parallel_apply(lambda x: np.array(x))
y_test = test_df['label'].parallel_apply(lambda x: np.array(x))
X_test = test_df['col_vecs'].parallel_apply(lambda x: np.array(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=41677), Label(value='0 / 41677')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=41677), Label(value='0 / 41677')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=41677), Label(value='0 / 41677')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=41677), Label(value='0 / 41677')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5210), Label(value='0 / 5210'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5210), Label(value='0 / 5210'))), …

In [11]:
X_train_np = np.concatenate(X_train.values).reshape(len(X_train.index), 384)

In [13]:
X_test_np = np.concatenate(X_test.values).reshape(len(X_test.index), 384)

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier

svc_clf = make_pipeline(StandardScaler(), BaggingClassifier(estimator=LinearSVC(max_iter=2000), n_estimators=20, random_state=RANDOM_SEED))
svc_clf.fit(X_train_np, y_train)



In [16]:
svc_clf.score(X_test_np, y_test)

0.6324351560812919

In [4]:
# !pip install umap-learn

In [5]:
import umap

clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=10,
    random_state=RANDOM_SEED
).fit_transform(sent_vec_np_array)

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


In [23]:
import hdbscan

labels = hdbscan.HDBSCAN(
    min_samples=100,
    min_cluster_size=500
).fit_predict(clusterable_embedding)

In [24]:
from collections import Counter

Counter(labels)

Counter({67: 2068,
         50: 4423,
         49: 709,
         -1: 215502,
         15: 3265,
         70: 975,
         7: 2364,
         41: 1878,
         29: 2578,
         32: 6772,
         31: 29698,
         72: 2083,
         37: 8580,
         81: 629,
         63: 1293,
         76: 5120,
         20: 5567,
         13: 4290,
         75: 1238,
         39: 2761,
         52: 5419,
         44: 1176,
         53: 540,
         16: 1737,
         59: 733,
         19: 3225,
         46: 3057,
         62: 1111,
         33: 1625,
         17: 1734,
         78: 13001,
         25: 1653,
         26: 1675,
         60: 1962,
         66: 8711,
         57: 3152,
         73: 1522,
         54: 1526,
         27: 619,
         35: 526,
         10: 3456,
         18: 1097,
         12: 828,
         28: 4320,
         2: 820,
         9: 1555,
         51: 808,
         40: 3153,
         56: 1928,
         58: 2490,
         64: 3081,
         61: 1323,
         22: 1698,
  

## DBSCAN

In [None]:
from collections import Counter
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.3, min_samples=2)
cls = dbscan.fit_predict(data2D)
print(f'Cluster membership values: \n{cls}')
Counter(cls)

## Analysis

In [37]:
not_a_feature_list = ['original_text', 'label', 'syl_list', 'Dom_PoS_SUBTLEX.1', 'Dom_Pos', 'Dom_PoS_SUBTLEX', '']
feats_list = [x for x in df.columns.tolist() if x not in not_a_feature_list]

In [40]:
X = train_df[feats_list]

In [51]:
X.isna().sum()

sentence_word_count            0
sentence_dc_word_count         0
percent_dc_words               0
sentence_dc_stword_count       0
percent_dc_stwords             0
syl_count                      0
FK_Read_Ease                   0
FK_Read_Grade                  0
AoA_NoMatch                    0
AoA_Freq_pm_median          6852
AoA_Freq_pm_mean            6852
AoA_Nletter_median          6839
AoA_Nletter_mean            6839
AoA_Nphon_median            6839
AoA_Nphon_mean              6839
AoA_Nsyll_median            6839
AoA_Nsyll_mean              6839
AoA_Pct_known_lem_median    6839
AoA_Pct_known_lem_mean      6839
AoA_Pct_known_median        9807
AoA_Pct_known_mean          9807
Conc_M__median              9244
Conc_M__mean                9244
Conc_SD_median              9244
Conc_SD_mean                9244
Conc_Uknown_median          9244
Conc_Uknown_mean            9244
Conc_Pct_known_median       9244
Conc_Pct_known_mean         9244
Conc_SUBTLEX_median         9244
Conc_SUBTL

How to impute?
- We could drop nans
- Could impute with column mean
- Could use sklearn imputer with rfregressor
- Need to inspect rows with nans

In [54]:
zero_words_mask = X['sentence_word_count']==0
X.loc[zero_words_mask] = X.loc[zero_words_mask].fillna(0)

In [55]:
X.isna().sum()

sentence_word_count            0
sentence_dc_word_count         0
percent_dc_words               0
sentence_dc_stword_count       0
percent_dc_stwords             0
syl_count                      0
FK_Read_Ease                   0
FK_Read_Grade                  0
AoA_NoMatch                    0
AoA_Freq_pm_median          6614
AoA_Freq_pm_mean            6614
AoA_Nletter_median          6601
AoA_Nletter_mean            6601
AoA_Nphon_median            6601
AoA_Nphon_mean              6601
AoA_Nsyll_median            6601
AoA_Nsyll_mean              6601
AoA_Pct_known_lem_median    6601
AoA_Pct_known_lem_mean      6601
AoA_Pct_known_median        9569
AoA_Pct_known_mean          9569
Conc_M__median              9006
Conc_M__mean                9006
Conc_SD_median              9006
Conc_SD_mean                9006
Conc_Uknown_median          9006
Conc_Uknown_mean            9006
Conc_Pct_known_median       9006
Conc_Pct_known_mean         9006
Conc_SUBTLEX_median         9006
Conc_SUBTL

In [57]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=10, random_state=RANDOM_SEED)

imp.fit(X)




array([[19.        , 13.        ,  0.68      , ...,  1.37312922,
         1.36842105,  0.48237639],
       [10.        ,  4.        ,  0.4       , ...,  2.17638889,
         1.8       ,  0.74833148],
       [51.        , 29.        ,  0.57      , ...,  1.71411184,
         1.57142857,  0.82065181],
       ...,
       [22.        , 16.        ,  0.73      , ...,  1.30747518,
         1.15789474,  0.36464228],
       [ 0.        ,  0.        , -1.        , ...,  0.        ,
         0.        ,  0.        ],
       [17.        , 10.        ,  0.59      , ...,  1.42713855,
         1.41176471,  0.69102001]])

In [59]:
imp.set_output(transform='pandas')

In [61]:
X_imp = imp.transform(X)

## Get DBSCAN Params

In [125]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=3)
samp = X_imp.sample(frac=0.1, random_state=RANDOM_SEED)

neigh.fit(samp)

In [127]:
np.median(neigh.kneighbors(samp)[:1])

544.9467059789703

## Do DBSCAN

In [133]:
from collections import Counter
from sklearn.cluster import DBSCAN

In [138]:
dbscan = DBSCAN(eps=1000, min_samples=2)
cls = dbscan.fit_predict(X_imp)
print(f'Cluster membership values: \n{cls}')

Cluster membership values: 
[    0     1     0 ... 17609     0     6]


In [139]:
Counter(cls)

Counter({0: 144359,
         1: 80,
         2: 9555,
         3: 39,
         4: 2,
         -1: 38631,
         5: 855,
         6: 10583,
         7: 985,
         8: 3266,
         9: 4428,
         10: 2,
         11: 4,
         12: 10,
         13: 2844,
         14: 148,
         15: 3,
         16: 6,
         17: 2,
         18: 50,
         19: 3,
         20: 5,
         21: 2,
         22: 64,
         23: 350,
         24: 11,
         25: 4,
         26: 341,
         27: 2,
         28: 2,
         29: 7,
         30: 3,
         31: 20,
         32: 2,
         33: 18,
         34: 111,
         35: 5,
         36: 2,
         37: 4603,
         38: 2,
         39: 29,
         40: 2,
         41: 17,
         42: 43,
         43: 13,
         44: 2,
         45: 65,
         46: 94,
         47: 10,
         48: 2,
         49: 57,
         50: 1153,
         51: 40,
         52: 103,
         53: 4,
         54: 5,
         55: 81,
         56: 2,
         57: 110,
 