# Imports

In [1]:
import pandas as pd
import nltk
import re, string
import multiprocessing
import numpy as np
import gensim.downloader as api

from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from pymilvus import connections
from pymilvus import CollectionSchema, FieldSchema, DataType
from pymilvus import Collection
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from joblib import dump, load

# Register tqdm's pandas integration; later cells call Series.progress_apply
tqdm.pandas()
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

# Report the detected core count
print(f'Total {cores} CPU Cores')

Total 16 CPU Cores


# Load Dataset

In [2]:
# Only these CSV columns are loaded from each split
columns = ['ID', 'TITLE', 'ABSTRACT']

# Read both splits and lower-case the column names in the same expression
df_train = pd.read_csv('inputs/train_tm/train.csv', sep=',', usecols=columns).rename(columns=str.lower)
df_test = pd.read_csv('inputs/test_tm/test.csv', sep=',', usecols=columns).rename(columns=str.lower)

df_train.head()

Unnamed: 0,id,title,abstract
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...


In [3]:
# Whitespace-delimited word count per abstract; the cell displays the mean
df_train['abstract_word_count'] = df_train['abstract'].map(lambda abstract: len(abstract.split()))
df_train['abstract_word_count'].mean()

148.40487316421897

In [4]:
# Character length of each abstract's string form; the cell displays the mean
df_train['abstract_char_count'] = df_train['abstract'].map(lambda abstract: len(str(abstract)))
df_train['abstract_char_count'].mean()

1009.1033759298111

In [5]:
# Rows whose abstract repeats an earlier row's abstract
df_train.loc[df_train['abstract'].duplicated()]

Unnamed: 0,id,title,abstract,abstract_word_count,abstract_char_count


In [6]:
# Rows whose title repeats an earlier row's title
df_train.loc[df_train['title'].duplicated()]

Unnamed: 0,id,title,abstract,abstract_word_count,abstract_char_count


# Preprocessing

In [7]:
# Fetch the NLTK resources this notebook requests, in a fixed order
for package in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger', 'omw-1.4'):
  nltk.download(package)

print('nltk package is ready...')

nltk package is ready...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\myxzlpltk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\myxzlpltk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\myxzlpltk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\myxzlpltk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\myxzlpltk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
# Sentence tokenizer loaded from the NLTK 'punkt' resource
sentences_splitter = nltk.data.load('tokenizers/punkt/english.pickle')

# One list of sentences per abstract, then one row per sentence
df_train['text'] = df_train['abstract'].progress_apply(sentences_splitter.tokenize)
df_train = df_train.explode('text', ignore_index=True)
df_train.head()

100%|██████████| 20972/20972 [00:02<00:00, 10230.22it/s]


Unnamed: 0,id,title,abstract,abstract_word_count,abstract_char_count,text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,Predictive models allow subject-specific inf...
1,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,"Given a subject's data, inference can\nbe made..."
2,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,identifiying condition presence for the\nsubje...
3,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,detecting condition effect on each individual\...
4,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,"While global inference is widely\nused, local ..."


In [9]:
# Whitespace-delimited word count per sentence; the cell displays the mean
df_train['word_count'] = df_train['text'].map(lambda sentence: len(sentence.split()))
df_train['word_count'].mean()

23.471694129180648

In [10]:
# Character length of each sentence's string form; the cell displays the mean
df_train['char_count'] = df_train['text'].map(lambda sentence: len(str(sentence)))
df_train['char_count'].mean()

158.59453263451604

In [11]:
# Keep only sentences with more than 5 words
df_train = df_train[df_train['word_count'].gt(5)]

In [12]:
# Cleaning
# Patterns are compiled once and written as raw strings so that sequences
# such as \s and \d are never interpreted as (invalid) string escapes.
_HTML_TAG_RE = re.compile(r'<.*?>')
_CITATION_RE = re.compile(r'\[[0-9]*\]')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')

def cleaning(s):
  """Normalise a sentence to lower-case words separated by single spaces.

  Steps, in order: lower-case and trim; drop ``<...>`` spans; replace
  bracketed numeric citations such as ``[12]`` with a space; replace ASCII
  punctuation with a space; delete any remaining character that is neither
  a word character nor whitespace; replace digits with a space; collapse
  runs of whitespace and trim the result.

  Args:
    s: The raw sentence (a ``str``).

  Returns:
    The cleaned sentence, without leading or trailing whitespace.
  """
  s = s.lower().strip()
  s = _HTML_TAG_RE.sub('', s)
  # Must run before punctuation stripping, otherwise the brackets are already gone
  s = _CITATION_RE.sub(' ', s)
  s = _PUNCTUATION_RE.sub(' ', s)
  s = _NON_WORD_RE.sub('', s)
  s = _DIGIT_RE.sub(' ', s)
  # Digit removal can leave padding at either end, so trim after collapsing
  return _WHITESPACE_RE.sub(' ', s).strip()

# Remove stopword
# Lazily filled cache so the stop-word list is loaded once rather than once per token.
_stopword_cache = {}

def _english_stopwords():
  """Return the English stop words as a frozenset, loading them on first use."""
  if 'english' not in _stopword_cache:
    _stopword_cache['english'] = frozenset(stopwords.words('english'))
  return _stopword_cache['english']

def stopword(s):
  """Drop English stop words from a whitespace-separated string.

  Args:
    s: Text whose tokens are separated by whitespace.

  Returns:
    The remaining tokens, in their original order, joined by single spaces.
  """
  stop_set = _english_stopwords()
  return ' '.join(token for token in s.split() if token not in stop_set)

# Lemmatization
# Shared WordNetLemmatizer instance, used by lemmatizer() below
wl = WordNetLemmatizer()

# Helper that maps an NLTK part-of-speech tag to a WordNet POS constant
def get_wordnet_pos(tag):
  """Translate a POS tag into the matching ``wordnet`` constant.

  Tags starting with J, V, N or R map to ADJ, VERB, NOUN and ADV
  respectively; any other tag falls back to NOUN.
  """
  prefix_to_pos = (
    ('J', wordnet.ADJ),
    ('V', wordnet.VERB),
    ('N', wordnet.NOUN),
    ('R', wordnet.ADV),
  )
  for prefix, pos in prefix_to_pos:
    if tag.startswith(prefix):
      return pos
  return wordnet.NOUN

# Tokenize, POS-tag and lemmatize the sentence
def lemmatizer(s):
  """Return ``s`` with every token replaced by its lemma, joined by spaces."""
  tagged_tokens = nltk.pos_tag(word_tokenize(s)) # (token, POS tag) pairs
  lemmas = []
  for token, pos_tag in tagged_tokens:
    # Lemmatize each token using the WordNet POS derived from its tag
    lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
  return " ".join(lemmas)

# Preprocessing
def preprocess(s):
  """Run the full text pipeline: cleaning, then stop-word removal, then lemmatization."""
  return lemmatizer(stopword(cleaning(s)))

In [13]:
# Run the preprocessing pipeline over every sentence, with a progress bar
df_train['clean_text'] = df_train['text'].progress_apply(preprocess)
df_train.head()

100%|██████████| 131034/131034 [08:42<00:00, 250.96it/s]


Unnamed: 0,id,title,abstract,abstract_word_count,abstract_char_count,text,word_count,char_count,clean_text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,Predictive models allow subject-specific inf...,13,117,predictive model allow subject specific infere...
1,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,"Given a subject's data, inference can\nbe made...",13,73,give subject data inference make two level glo...
2,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,identifiying condition presence for the\nsubje...,9,64,identifiying condition presence subject local e
3,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,detecting condition effect on each individual\...,12,92,detect condition effect individual measurement...
4,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,265,1912,"While global inference is widely\nused, local ...",32,219,global inference widely use local inference us...


In [14]:
# Write the id, title, raw sentence and cleaned sentence columns to CSV, without the index
df_train[['id', 'title', 'text', 'clean_text']].to_csv('outputs/data.csv', index=False)

In [15]:
# Reload the exported CSV; from here on df_train holds only the four saved columns
df_train = pd.read_csv('outputs/data.csv')
df_train.head()

Unnamed: 0,id,title,text,clean_text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,predictive model allow subject specific infere...
1,1,Reconstructing Subject-Specific Effect Maps,"Given a subject's data, inference can\nbe made...",give subject data inference make two level glo...
2,1,Reconstructing Subject-Specific Effect Maps,identifiying condition presence for the\nsubje...,identifiying condition presence subject local e
3,1,Reconstructing Subject-Specific Effect Maps,detecting condition effect on each individual\...,detect condition effect individual measurement...
4,1,Reconstructing Subject-Specific Effect Maps,"While global inference is widely\nused, local ...",global inference widely use local inference us...


# Feature Extraction

In [16]:
# Load the 'word2vec-google-news-300' vectors through gensim.downloader (imported as `api`)
wv = api.load('word2vec-google-news-300')

In [17]:
# Set of the model's keys, passed to avg_feature_vector as `index` for membership tests
index2word_set = set(wv.index_to_key)

def avg_feature_vector(sentence, model, num_features, index):
  """Average the embeddings of the words of ``sentence`` that appear in ``index``.

  Args:
    sentence: Whitespace-separated text.
    model: Mapping-like object; ``model[word]`` yields that word's vector.
    num_features: Length of the returned vector.
    index: Container of known words, used for membership tests.

  Returns:
    The element-wise mean of the known words' vectors, or a float32 zero
    vector of length ``num_features`` when no word is in ``index``.
  """
  known_words = [token for token in sentence.split() if token in index]
  total = np.zeros((num_features, ), dtype='float32')
  for token in known_words:
    total = np.add(total, model[token])
  if known_words:
    total = np.divide(total, len(known_words))
  return total

In [18]:
# Embed every cleaned sentence as the mean of its word vectors.
# clean_text was reloaded from CSV, where an empty string is read back as NaN;
# map those to '' so avg_feature_vector can still call .split() on them.
clean_sentences = df_train['clean_text'].fillna('')

vectors = []
for text in tqdm(clean_sentences):
  vec = avg_feature_vector(text, model=wv, num_features=300, index=index2word_set)
  vectors.append(vec)

# Standardise each feature, then squash it into [-1, 1]
scaler = Pipeline(steps=[
  ('std', StandardScaler()),
  ('minmax', MinMaxScaler((-1, 1)))
])

vectors = scaler.fit_transform(vectors)

# L2-normalise every row in one vectorised step; a zero row is left as zeros
# instead of being divided by 0.
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
norms[norms == 0] = 1.0
vectors = vectors / norms
print(np.min(vectors), np.max(vectors))

100%|██████████| 131034/131034 [00:04<00:00, 29908.20it/s]


-0.2636920722395563 0.2584988769652656


In [19]:
# Export vectors: save the normalised sentence vectors as a .npy file
np.save('outputs/vectors.npy', vectors)

In [20]:
# Export scaler: persist the fitted StandardScaler + MinMaxScaler pipeline with joblib
dump(scaler, 'models/scaler.joblib')

['models/scaler.joblib']

# Export data

In [21]:
# Open the 'default' Milvus connection used by the Collection created in a later cell
connections.connect(alias="default", host='localhost', port='19530')

In [22]:
# Field definitions for the Milvus collection.
# Primary key, generated automatically (auto_id=True)
document_id = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
# 300-dimensional float embedding
document_vector = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=300)
# Title and text are stored as strings of at most 2048 (max_length)
document_title = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=2048)
document_text = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048)
# INT64 column named "hash"
document_hash = FieldSchema(name="hash", dtype=DataType.INT64)

schema = CollectionSchema(
  fields=[document_id, document_vector, document_title, document_text, document_hash],
  description="All Documents",
)

# Bind the schema to the "documents" collection on the 'default' connection
collection_name = "documents"
collection = Collection(name=collection_name, schema=schema, using='default', shards_num=2)

In [23]:
# Insert rows in batches of `step`; column order is vector, title, text, then the integer id
step = 10 ** 4
for start in tqdm(range(0, vectors.shape[0], step)):
  end = start + step
  batch = df_train.iloc[start:end]
  data = [
    vectors[start:end].tolist(),
    batch['title'].str.strip().tolist(),
    batch['text'].str.strip().tolist(),
    batch['id'].astype(np.int64).tolist(),
  ]
  mr = collection.insert(data)

100%|██████████| 14/14 [00:12<00:00,  1.08it/s]


In [25]:
# IVF_FLAT index on the "vector" field with the inner-product (IP) metric and nlist=1024
index_params = {"index_type": "IVF_FLAT", "metric_type": "IP", "params": {"nlist": 1024}}

collection.create_index(field_name="vector", index_params=index_params, index_name="vector_index")
# Load the collection once the index has been created
collection.load()