In [41]:
import numpy as np
import pandas as pd
import os

import gensim

import lightgbm as lgb

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from time import time

from sklearn import feature_selection

In [2]:
df = []
  
for f in os.listdir('dataset'):
  file1 = open(f'dataset/{f}', 'r')
  lines = file1.readlines()

  # Strips the newline character
  for line in lines:
      line = line.strip()
      if(line[:4] == "MISC"):
        df.append([line[5:], 'MISC'])
      elif(line[:4] == 'CONT'):
        df.append([line[5:], 'CONT'])
      elif(line[:4] == 'AIMX'):
        df.append([line[5:], 'AIMX'])
      elif(line[:4] == 'OWNX'):
        df.append([line[5:], 'OWNX'])
      elif(line[:4] == 'BASE'):
        df.append([line[5:], 'BASE'])
    # count += 1
    # print("Line{}: {}".format(count, line.strip()))

In [3]:
df = pd.DataFrame(df, columns = ['text', 'label'])

In [4]:
from tqdm import tqdm

In [5]:
labels = {
    'MISC': 0,
    'CONT': 1,
    'AIMX': 2, 
    'OWNX': 3,
    'BASE': 4
}

In [28]:
from sklearn import feature_extraction
import re, nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [25]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text

Obtained from https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
'''
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [26]:
stopwords = nltk.corpus.stopwords.words("english")

In [29]:
df["text_clean"] = df["text"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=stopwords))
df.head()

Unnamed: 0,text,label,text_clean
0,defensive forecasting is a method of transform...,MISC,defensive forecasting method transforming law ...
1,there are two known varieties of defensive for...,MISC,two known variety defensive forecasting contin...
2,this note shows that the randomized variety ca...,MISC,note show randomized variety obtained continuo...
3,new as compared to version NUMBER NUMBER ...,OWNX,new compared version number number august numb...
4,in the case where is finite it is shown tha...,OWNX,case finite shown forecaster choose randomized...


In [30]:
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [48]:
np.random.seed(40)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

2493 312 312


In [32]:
corpus = df_train["text_clean"]
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [42]:
y = df_train["label"]
X_names = vectorizer.get_feature_names()
p_value_limit = 0.95
dtf_features = pd.DataFrame()
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    dtf_features = dtf_features.append(pd.DataFrame(
                   {"feature":X_names, "score":1-p, "y":cat}))
    dtf_features = dtf_features.sort_values(["y","score"], 
                    ascending=[True,False])
    dtf_features = dtf_features[dtf_features["score"]>p_value_limit]
X_names = dtf_features["feature"].unique().tolist()



In [43]:
for cat in np.unique(y):
   print("# {}:".format(cat))
   print("  . selected features:",
         len(dtf_features[dtf_features["y"]==cat]))
   print("  . top features:", ",".join(
dtf_features[dtf_features["y"]==cat]["feature"].values[:10]))
   print(" ")

# AIMX:
  . selected features: 483
  . top features: paper,continuous variable,describe,present new,current study,introduce,approach based,based machine,introduce radically,new approach
 
# BASE:
  . selected features: 316
  . top features: extend,retracted,retracted second,reuse,reuse extend,aspect theorem,calibration result,citation noticed,important aspect,little randomization
 
# CONT:
  . selected features: 351
  . top features: substantial,study induction,clinical use,variability,library,currently,laboratory,early study,insufficient,lasted
 
# MISC:
  . selected features: 40
  . top features: citation,section,paper,show,channel,symbol mixing,mixing,proof,dendritic,result
 
# OWNX:
  . selected features: 57
  . top features: section,citation,finally,proof,exon,classify,result,loss rank,fitted,internal face
 


In [44]:
vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [51]:
X_train, y_train = X_train, df_train.label

X_test, y_test = df_test.text, df_test.label
X_test = vectorizer.transform(X_test)

In [52]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

In [53]:
y_pred=clf.predict(X_test)

In [54]:
# view accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

LightGBM Model accuracy score: 0.6987


In [55]:
from sklearn.metrics import f1_score, classification_report

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        AIMX       0.83      0.62      0.71        24
        BASE       1.00      0.20      0.33         5
        CONT       0.14      0.07      0.10        14
        MISC       0.71      0.95      0.81       186
        OWNX       0.68      0.30      0.42        83

    accuracy                           0.70       312
   macro avg       0.67      0.43      0.47       312
weighted avg       0.69      0.70      0.66       312

