In [1]:
import pandas as pd

In [2]:
# Load the dataset from the Excel workbook (path is relative to the notebook's working directory).
df = pd.read_excel('./data/wos.xlsx')

In [3]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


**Goal:** Predict the domain of a scientific article from the text of its abstract.

In [4]:
# Distinct target classes. Note in the output that the raw labels carry trailing whitespace.
set(df['Domain'])

{'CS ', 'Civil ', 'ECE ', 'MAE ', 'Medical ', 'Psychology  ', 'biochemistry '}

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer 

In [9]:
# Bag-of-words vectorizer using the built-in English stop-word list.
# A custom list of stop words can be passed instead of the string 'english'.
vec = CountVectorizer(stop_words='english')

In [10]:
# Exploration only: learn the vocabulary from ALL abstracts to inspect it.
# The classifier section further down refits `vec` on the training split.
vec.fit(df['Abstract'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [11]:
# Mapping from each learned token to its column index in the count matrix.
vec.vocabulary_

{'dimensional': 35622,
 'non': 78779,
 'linear': 66174,
 'optical': 82286,
 'waves': 121975,
 'coherently': 27018,
 'excited': 43064,
 'resonant': 97851,
 'medium': 70129,
 'doped': 37289,
 'erbium': 41871,
 'atoms': 15302,
 'described': 34287,
 'schrodinger': 102412,
 'equation': 41764,
 'coupled': 30048,
 'self': 103240,
 'induced': 58670,
 'transparency': 115645,
 'equations': 41768,
 'hirota': 53971,
 'method': 71056,
 'symbolic': 111003,
 'computation': 27888,
 'forms': 46661,
 'soliton': 106528,
 'solutions': 106558,
 'obtained': 81050,
 'asymptotic': 15093,
 'analysis': 12134,
 'conducted': 28492,
 'suggests': 109942,
 'interaction': 59746,
 'solitons': 106530,
 'elastic': 39767,
 'bright': 20582,
 'fields': 45179,
 'dark': 32521,
 'ones': 81922,
 'field': 45173,
 'electric': 39842,
 'polarization': 89540,
 'population': 90119,
 'inversion': 60691,
 'profile': 92127,
 'dopant': 37284,
 'head': 52717,
 'bidirectional': 18308,
 'overtaking': 83573,
 'unidirectional': 118390,
 'see

In [12]:
# Vocabulary size, i.e. the number of feature columns the vectorizer produces.
len(vec.vocabulary_)

124529

In [14]:
# Number of documents (rows), for comparison with the vocabulary size above.
df.shape

(46985, 7)

In [33]:
# Toy abstract used to illustrate how the fitted vectorizer encodes a new piece of text.
abstract = 'In this paper we do computing. It is very quantum hybrid computing'

In [34]:
# (row, column) positions of the non-zero counts for the toy abstract.
# Only one document is transformed, so every row index is 0.
vec.transform([abstract]).todense().nonzero()

(array([0, 0, 0, 0], dtype=int64),
 array([27906, 55573, 84515, 94614], dtype=int64))

In [37]:
# Count stored in column 27906 for the toy abstract; 27906 is the index that
# vec.vocabulary_ reports for 'computing', which appears twice in the text.
vec.transform([abstract]).todense()[0,27906]

2

In [36]:
# Column index assigned to the token 'computing'.
vec.vocabulary_['computing']

27906

In [22]:
# Encode every abstract as a row of token counts (exploration only; the model
# below is trained on X_train, which comes from a vectorizer refit on the training split).
X = vec.transform(df['Abstract'])

In [23]:
# (documents, vocabulary size)
X.shape

(46985, 124529)

In [24]:
# Sanity check: the row count should match X, one encoded row per abstract.
df.shape

(46985, 7)

# Building a classifier

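**Important!!!** The vectorizer must be fit on the training set only, and then applied (with `transform`, not `fit_transform`) to the test set. Fitting it on the full dataset would leak information from the test set into the features.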

In [40]:
import numpy as np

In [41]:
# Fix NumPy's global random seed so that the random train/test split below is repeatable.
np.random.seed(123)

In [56]:
# Draw 70% of the row labels at random for the training set.
# replace=False guarantees each label is picked at most once, so no abstract is
# duplicated in training and the remaining 30% is left over for testing.
train_idxs = np.random.choice(
    df.index.values,
    size=int(df.shape[0] * 0.7),
    replace=False,
)

In [57]:
# Every row label that was not sampled for training goes to the test set.
# Membership is tested against a set (O(1) per lookup) rather than the NumPy
# array (a full scan per lookup); the resulting list and its order are the same.
train_idx_set = set(train_idxs)
test_idxs = [ix for ix in df.index.values if ix not in train_idx_set]

In [58]:
# train_idxs / test_idxs hold index *labels* (they were taken from df.index.values),
# so select with the label-based .loc rather than the positional .iloc.
# For a default RangeIndex both give the same rows; .loc stays correct if the
# frame is ever filtered or re-indexed before this point.
train_df = df.loc[train_idxs, :]
test_df = df.loc[test_idxs, :]

In [59]:
# Check which domains are present in the training split.
set(train_df['Domain'])


{'CS ', 'Civil ', 'ECE ', 'MAE ', 'Medical ', 'Psychology  ', 'biochemistry '}

In [60]:
# Check which domains are present in the test split (should match the training split).
set(test_df['Domain'])

{'CS ', 'Civil ', 'ECE ', 'MAE ', 'Medical ', 'Psychology  ', 'biochemistry '}

**Now we can fit the vectorizer on the training set**

In [61]:
X_train = vec.fit_transform(train_df['Abstract']) # Learn the vocabulary from the training abstracts and encode them
X_test = vec.transform(test_df['Abstract']) # Use the freshly fit vectorizer with transform only

**We also need to encode the string labels as integers**

In [62]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the Domain strings into integer class ids.
le = LabelEncoder()

In [63]:
# Fit the label mapping on the training labels, then reuse the same mapping for
# the test labels so both splits share identical integer codes.
# NOTE(review): the raw Domain values carry trailing spaces (e.g. 'CS ');
# consider stripping them before encoding so decoded labels are clean.
y_train = le.fit_transform(train_df['Domain'])
y_test = le.transform(test_df['Domain'])

In [64]:
# Integer class ids present in the training labels.
set(y_train)

{0, 1, 2, 3, 4, 5, 6}

In [65]:
# Integer class ids present in the test labels (should match the training set).
set(y_test)

{0, 1, 2, 3, 4, 5, 6}

**Apply SVC:** SVC works well with fewer than 100k observations and can handle a large number of features (hundreds of thousands).

In [66]:
from sklearn.svm import SVC

In [68]:
# Support vector classifier with all default hyperparameters (none are passed here).
svc = SVC()

In [None]:
# Train on the count-encoded training abstracts and their integer labels.
# NOTE(review): this cell has no recorded execution count or output — confirm it was run to completion.
svc.fit(X_train, y_train)



In [None]:
# Re-using the model / Use in production
import pickle

from sklearn.pipeline import Pipeline

# Bundle the vectorizer and the classifier so that raw text goes in and a
# prediction comes out. Because the pipeline owns its own CountVectorizer, it
# must be fitted on the RAW abstracts, not on the already-vectorized X_train.
# NOTE(review): unlike `vec` above, this CountVectorizer does not use
# stop_words='english' — confirm whether that difference is intended.
pipe = Pipeline([('vec', CountVectorizer()), ('svm', SVC())])
pipe.fit(train_df['Abstract'], y_train)

# Save the fitted pipeline for persistence; the context manager closes the file.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

# To call from API
# Load the pickled object.
# Security: unpickling can execute arbitrary code — only load files you trust.
with open('model.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)

# Predictions are the integer codes produced by the label encoder;
# le.inverse_transform(...) maps them back to the domain names.
pipe.predict(['Here is my new abstract'])