In [None]:
!pip install email
!pip install catboost
!pip install python-Levenshtein

In [25]:
import re
import pandas as pd
from Levenshtein import ratio as levenshtein_ratio
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
def create_training_data(df_annots):
  training_data = []
  for i in df_annots.index:
    row = df_annots.loc[i]
    if pd.isnull(row['email_text']) or pd.isnull(row['Event Name']):
      continue
    text = row['email_text'].replace('\n', ' ').replace('\t', ' ')
    event_name = row['Event Name']
    if event_name not in text:
      print(event_name, row['ID'])
      continue
    event_name = event_name.strip()
    start = text.index(event_name)
    end = start + len(event_name)
    spacy_row = (text, [(start, end, "EVENT")])
    training_data.append(spacy_row)
  return training_data

In [14]:
# Initial Clean of Data
df_annots = pd.read_csv('reannotated_emails.csv')
print(len(df_annots))
df_annots = df_annots.dropna(subset=['Event Name'])
print(len(df_annots))
df_annots['email_text'] = df_annots['email_text'].str.replace("(?<=[a-z])(?=[A-Z0-9])", " ", regex=True).str.replace("[\{\(\[].*?[\)\]\}]", "", regex=True).str.replace('\n', ' ').str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.replace('Conf.', 'Conf').str.replace('Symp.', 'Symp').str.strip()
df_annots['Event Name'] = df_annots['Event Name'].str.replace("(?<=[a-z])(?=[A-Z0-9])", " ", regex=True).str.replace("[\{\(\[].*?[\)\]\}]", "", regex=True).str.replace('\n', ' ').str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.replace('Conf.', 'Conf').str.replace('Symp.', 'Symp').str.strip()
all_data = create_training_data(df_annots)

440
425
Global Journal of Engineering Science and Research Management (GJESRM 24
The 2015 International Confrence on Health Informatics and Medical Systems 105
The 2015 International Congress on Scientific Computing 144
The 2015 Confrence on Artificial Intelligence ICAI'15 153
The 2015 International Confrence on Modeling, Simulation and Visualizsation Methods MSV'15 155
The 2015 World Congress in Computer Science, Computer Engineering, and Applied Computing WORLDCOMP'15 156
9th INTERNATIONAL CONFERENCE ON COMMUNICATION AND SIGNAL PROCESSING 339
International Journal of Information Research And Review 375
INTERNATIONAL JOURNAL OF ENGINEERING SCIENCES AND RESEARCH TECHNOLOGY 488
The 14th IEEE International Confrence on Machine Learning And Applications 406
International Journal of Emerging Technology & Advanced Engineering 419
International Journal of Emerging Technology & Advanced Engineering 420
International Journal of Emerging Technology & Advanced Engineering 421
The Second Internat

In [17]:
# Lemmatize, tokenize, and label sentences
lemmatizer = WordNetLemmatizer()
all_sents = []
for data in all_data:
  text = data[0]
  name = text[data[1][0][0]:data[1][0][1]]
  sents = sent_tokenize(text)
  for sent in sents:
    if name in sent:
      name_in_sent = 1
    else:
      name_in_sent = 0
    words = [lemmatizer.lemmatize(w) for w in sent.split()]
    sent = " ".join(words)
    all_sents.append((sent, name_in_sent))
df_all_sents = pd.DataFrame(all_sents, columns=['sent', 'name_in_sent'])
df_all_sents

Unnamed: 0,sent,name_in_sent
0,:: CFP:: ICCTIM2014:: Dubai- UAE.,0
1,Please Accept our Apology for Multiple Posting.,0
2,THANK YOU.,0
3,==============================================...,1
4,The conference welcome paper on the following ...,0
...,...,...
15938,Data Center and related issue O. Visualization...,0
15939,Big Data Analytics MEASURABLE SCIENTIFIC IMPAC...,0
15940,Citation data is obtained from Microsoft Acade...,0
15941,The citation data doe not even include more th...,0


In [18]:
# Create tfidf vector matrix
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2)
tfidf_wm = vectorizer.fit_transform(df_all_sents['sent'].to_list())
columns = vectorizer.get_feature_names_out()
stop_i = 0
for i in range(len(columns)):
  colname = columns[i]
  if colname[0] == 'a':
    stop_i = i
    break

df_sent_vecs = pd.DataFrame(tfidf_wm.toarray(), columns=columns).iloc[:, stop_i:]
df_sent_vecs

Unnamed: 0,a4,aa,aaai,aadl,abda,abell,abet,abhishek,abidin,ability,able,about,above,abroad,abstract,abstracts,abuse,ac,academia,academic,academician,academicians,academy,accelerator,accenture,accept,acceptable,acceptance,accepted,access,accessable,accessed,accessibility,accessible,accessing,accordance,according,account,accounting,accreditation,...,wsn,www,xhost,xia,xml,xpl,xplore,xplorehttp,xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yahoo,yang,year,yearly,years,yen,yes,yet,yfantis,york,you,young,your,yours,yourself,yuan,yuanyuan,ywfk,zadeh,zainal,zakopane,zero,zhou,zig,zone,zurada,zurich,宋体,電子豹,页眉,页脚
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.467347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.522858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.125214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393967,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Train test split
from sklearn.model_selection import train_test_split
X = df_sent_vecs
y = df_all_sents['name_in_sent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15)
len(X_train), len(X_val), len(X_test)

(10840, 1914, 3189)

In [20]:
# Train the catboost classifier
from catboost import CatBoostClassifier

clf = CatBoostClassifier(
    iterations=100,
    random_seed=42,
    learning_rate=0.50,
    custom_loss=['AUC', 'Accuracy']
)

clf.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
)

0:	learn: 0.1645867	test: 0.1621014	best: 0.1621014 (0)	total: 174ms	remaining: 17.2s
1:	learn: 0.0876947	test: 0.0863984	best: 0.0863984 (1)	total: 264ms	remaining: 13s
2:	learn: 0.0669227	test: 0.0678145	best: 0.0678145 (2)	total: 353ms	remaining: 11.4s
3:	learn: 0.0599360	test: 0.0645074	best: 0.0645074 (3)	total: 447ms	remaining: 10.7s
4:	learn: 0.0550917	test: 0.0639670	best: 0.0639670 (4)	total: 538ms	remaining: 10.2s
5:	learn: 0.0513026	test: 0.0626780	best: 0.0626780 (5)	total: 624ms	remaining: 9.78s
6:	learn: 0.0463298	test: 0.0583523	best: 0.0583523 (6)	total: 715ms	remaining: 9.5s
7:	learn: 0.0440081	test: 0.0564321	best: 0.0564321 (7)	total: 802ms	remaining: 9.22s
8:	learn: 0.0422963	test: 0.0568263	best: 0.0564321 (7)	total: 887ms	remaining: 8.97s
9:	learn: 0.0410012	test: 0.0562867	best: 0.0562867 (9)	total: 986ms	remaining: 8.87s
10:	learn: 0.0387568	test: 0.0545898	best: 0.0545898 (10)	total: 1.07s	remaining: 8.69s
11:	learn: 0.0376395	test: 0.0539270	best: 0.0539270 (1

<catboost.core.CatBoostClassifier at 0x7f8777bd2f50>

In [21]:
# Score the classifier
from sklearn.metrics import precision_recall_fscore_support
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='macro')

(0.9239324586004274, 0.9104526059456722, 0.9170736046861743, None)

In [22]:
# Gets the list of potenial names for a given sentence
stop_words = stopwords.words('english')
def get_consecutive_words(sent):
  months = {'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'}
  sent = sent.replace('-', '')
  sent = sent.replace('/', ' ')
  words = nltk.word_tokenize(sent)
  phrases = []
  current_phrase = []
  for word in words:
    if bool(re.match("[A-Z]+'\d+", word)):
      if len(current_phrase) > 0:
          # remove extra stopwords
          i = len(current_phrase) - 1
          while (current_phrase[i] in stop_words):
            del current_phrase[i]
            i -= 1
          phrases.append(' '.join(current_phrase))
          current_phrase = []
      phrases.append(word)
    else:
      if (bool(re.match(r'\w*[A-Z]\w*', word)) or bool(re.search(r'\d', word))) and word.lower() not in months:
        current_phrase.append(word)
      else:
        if word in stop_words and len(current_phrase) > 0:
          current_phrase.append(word)
        else:
          if len(current_phrase) > 0:
            # remove extra stopwords
            i = len(current_phrase) - 1
            while (current_phrase[i] in stop_words):
              del current_phrase[i]
              i -= 1
            phrases.append(' '.join(current_phrase))
            current_phrase = []
  better_phrases = []
  for phrase in phrases:
    if len(phrase.split()) > 1:
      better_phrases.append(phrase)
  if len(better_phrases) == 0:
    return phrases
  return better_phrases

In [23]:
# Selects the most likely potential name
def select_conference(phrases):
  if len(phrases) == 0:
    return ""
  for phrase in phrases:
    if 'conference' in phrase.lower() or 'confrence' in phrase.lower():
      return phrase
  for phrase in phrases:
    if 'conf' in phrase.lower():
      return phrase
  for phrase in phrases:
    if 'event' in phrase.lower():
      return phrase
  for phrase in phrases:
    if 'congress' in phrase.lower():
      return phrase
  return max(phrases, key=len)

In [24]:
# Run the whole model on all data
df_results = pd.DataFrame()
i = 0
for text, annotations in all_data:
  name = text[annotations[0][0]:annotations[0][1]]
  sents = sent_tokenize(text)
  lemmatized_sents = []
  for sent in sents:
    words = [lemmatizer.lemmatize(w) for w in sent.split()]
    sent = " ".join(words)
    lemmatized_sents.append(sent)
  sents_wm = vectorizer.transform(lemmatized_sents)
  df_sents_vecs = pd.DataFrame(sents_wm.toarray(), columns=columns).iloc[:, stop_i:]
  predictions = clf.predict_proba(df_sents_vecs)
  df_predictions = pd.DataFrame(predictions, columns=['not_in_sent', 'in_sent'])
  df_predictions['compound'] = df_predictions['in_sent'] - df_predictions['not_in_sent']
  best = list(df_predictions['compound'].sort_values(ascending=False).index)[0]
  best_sent = sents[best]
  if name in best_sent:
    name_in_best_sent = True
  else:
    name_in_best_sent = False
  pred_name = select_conference(get_consecutive_words(best_sent))
  df_results = df_results.append({'pred_name': pred_name, 'actual_name': name, 'name_in_best_sent': name_in_best_sent}, ignore_index=True)
  i += 1
df_results

Unnamed: 0,actual_name,name_in_best_sent,pred_name
0,The International Confrence on Computing,1.0,The International Confrence on Computing Techn...
1,The First International Confrence on Green Com...,1.0,The First International Confrence on Green Com...
2,2016 IEEE World Congress on Computational Inte...,1.0,2016 IEEE World Congress on Computational Inte...
3,Asia-Pacific Confrence on Engineering and Appl...,1.0,AsiaPacific Confrence on Engineering and Appli...
4,Global Engineering & Applied Science Confrence,1.0,Applied Science Confrence Secretariat
...,...,...,...
403,The 2015 International Confrence on Health Inf...,1.0,The 2015 International Confrence on Health Inf...
404,The 2015 International Confrence on Image Proc...,1.0,The 2015 International Confrence on Image Proc...
405,The 2015 International Confrence on Wireless N...,1.0,The 2015 International Confrence on Wireless N...
406,The 2015 International Confrence on Frontiers ...,1.0,The 2015 International Confrence on Frontiers ...


In [30]:
# Get the levenshtein ratio for each prediction
df_results['ratio'] = 0
for index, row in df_results.iterrows():
  ratio = levenshtein_ratio(row['actual_name'], row['pred_name'])
  df_results.loc[index, 'ratio'] = ratio
df_results

Unnamed: 0,actual_name,name_in_best_sent,pred_name,ratio
0,The International Confrence on Computing,1.0,The International Confrence on Computing Techn...,0.666667
1,The First International Confrence on Green Com...,1.0,The First International Confrence on Green Com...,0.793893
2,2016 IEEE World Congress on Computational Inte...,1.0,2016 IEEE World Congress on Computational Inte...,0.794118
3,Asia-Pacific Confrence on Engineering and Appl...,1.0,AsiaPacific Confrence on Engineering and Appli...,0.949153
4,Global Engineering & Applied Science Confrence,1.0,Applied Science Confrence Secretariat,0.602410
...,...,...,...,...
403,The 2015 International Confrence on Health Inf...,1.0,The 2015 International Confrence on Health Inf...,1.000000
404,The 2015 International Confrence on Image Proc...,1.0,The 2015 International Confrence on Image Proc...,0.712329
405,The 2015 International Confrence on Wireless N...,1.0,The 2015 International Confrence on Wireless N...,1.000000
406,The 2015 International Confrence on Frontiers ...,1.0,The 2015 International Confrence on Frontiers ...,0.729560


In [31]:
# Average levenshtein ratio -- effectively the accuracy
df_results['ratio'].mean()

0.8005023294484871