In [4]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import accuracy_score,hamming_loss,classification_report,f1_score
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import BinaryRelevance,ClassifierChain,LabelPowerset
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import re

df = pd.read_csv('./Train.csv')
test = pd.read_csv('./Test.csv')
df

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics,Analysis of PDEs,Applications,Artificial Intelligence,Astrophysics of Galaxies,...,Methodology,Number Theory,Optimization and Control,Representation Theory,Robotics,Social and Information Networks,Statistics Theory,Strongly Correlated Electrons,Superconductivity,Systems and Control
0,1824,a ever-growing datasets inside observational a...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3094,we propose the framework considering optimal $...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8463,nanostructures with open shell transition meta...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2082,stars are self-gravitating fluids inside which...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8687,deep neural perception and control networks ar...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13999,8699,a methodology of automatic detection of a even...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
14000,11912,we consider a case inside which the robot has ...,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
14001,4842,despite being usually considered two competing...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
14002,12507,we present the framework and its implementatio...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df.loc[:,['ABSTRACT']]

Unnamed: 0,ABSTRACT
0,a ever-growing datasets inside observational a...
1,we propose the framework considering optimal $...
2,nanostructures with open shell transition meta...
3,stars are self-gravitating fluids inside which...
4,deep neural perception and control networks ar...
...,...
13999,a methodology of automatic detection of a even...
14000,we consider a case inside which the robot has ...
14001,despite being usually considered two competing...
14002,we present the framework and its implementatio...


In [6]:
target = ['Analysis of PDEs', 'Applications','Artificial Intelligence', 'Astrophysics of Galaxies','Computation and Language', 'Computer Vision and Pattern Recognition',
'Cosmology and Nongalactic Astrophysics','Data Structures and Algorithms', 'Differential Geometry','Earth and Planetary Astrophysics','Fluid Dynamics',
       'Information Theory', 'Instrumentation and Methods for Astrophysics','Machine Learning', 'Materials Science','Methodology','Number Theory',
       'Optimization and Control', 'Representation Theory', 'Robotics','Social and Information Networks', 'Statistics Theory',
       'Strongly Correlated Electrons', 'Superconductivity','Systems and Control']
topic_col = ['Computer Science','Mathematics', 'Physics','Statistics']

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14004 entries, 0 to 14003
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   id                                            14004 non-null  int64 
 1   ABSTRACT                                      14004 non-null  object
 2   Computer Science                              14004 non-null  int64 
 3   Mathematics                                   14004 non-null  int64 
 4   Physics                                       14004 non-null  int64 
 5   Statistics                                    14004 non-null  int64 
 6   Analysis of PDEs                              14004 non-null  int64 
 7   Applications                                  14004 non-null  int64 
 8   Artificial Intelligence                       14004 non-null  int64 
 9   Astrophysics of Galaxies                      14004 non-null  int64 
 10

In [8]:
df[target].sum(axis = 1).max() # Maximum categories an article belongs to

4

In [9]:
100*(df[target].sum(axis = 0)/df.shape[0]).sort_values(ascending=False)

Machine Learning                                27.313625
Artificial Intelligence                          9.825764
Robotics                                         6.812339
Computer Vision and Pattern Recognition          6.705227
Strongly Correlated Electrons                    6.376750
Materials Science                                5.534133
Computation and Language                         4.605827
Cosmology and Nongalactic Astrophysics           4.520137
Optimization and Control                         4.434447
Social and Information Networks                  4.420166
Analysis of PDEs                                 4.334476
Applications                                     4.227364
Astrophysics of Galaxies                         4.098829
Methodology                                      4.098829
Systems and Control                              4.041702
Differential Geometry                            3.963153
Superconductivity                                3.870323
Statistics The

In [10]:
df.nunique()

id                                              14004
ABSTRACT                                        14004
Computer Science                                    2
Mathematics                                         2
Physics                                             2
Statistics                                          2
Analysis of PDEs                                    2
Applications                                        2
Artificial Intelligence                             2
Astrophysics of Galaxies                            2
Computation and Language                            2
Computer Vision and Pattern Recognition             2
Cosmology and Nongalactic Astrophysics              2
Data Structures and Algorithms                      2
Differential Geometry                               2
Earth and Planetary Astrophysics                    2
Fluid Dynamics                                      2
Information Theory                                  2
Instrumentation and Methods 

In [11]:
null_values_per_variable = 100 * (df.isnull().sum()/df.shape[0]).round(3)#.reset_index()
null_values_per_variable.sort_values(ascending=False)

id                                              0.0
Fluid Dynamics                                  0.0
Superconductivity                               0.0
Strongly Correlated Electrons                   0.0
Statistics Theory                               0.0
Social and Information Networks                 0.0
Robotics                                        0.0
Representation Theory                           0.0
Optimization and Control                        0.0
Number Theory                                   0.0
Methodology                                     0.0
Materials Science                               0.0
Machine Learning                                0.0
Instrumentation and Methods for Astrophysics    0.0
Information Theory                              0.0
Earth and Planetary Astrophysics                0.0
ABSTRACT                                        0.0
Differential Geometry                           0.0
Data Structures and Algorithms                  0.0
Cosmology an

In [9]:
vec = CountVectorizer(min_df = 3)
combined = list(df['ABSTRACT']) + list(test['ABSTRACT'])
vec.fit(combined)

CountVectorizer(max_features=10000)

In [10]:
trn, val = train_test_split(df, test_size=0.2, random_state=2021)

trn_abs = vec.transform(trn['ABSTRACT'])
val_abs = vec.transform(val['ABSTRACT'])
tst_abs = vec.transform(test['ABSTRACT'])

In [11]:
trn_abs.toarray().shape,val_abs.toarray().shape,tst_abs.toarray().shape

((11203, 10000), (2801, 10000), (6002, 10000))

In [17]:
from scipy.sparse import csr_matrix
vec = CountVectorizer(max_features = 10000)
combined = list(df['ABSTRACT']) + list(test['ABSTRACT'])
vec.fit(combined)
    
trn, val = train_test_split(df, test_size=0.2, random_state=2021)

trn_abs = vec.transform(trn['ABSTRACT'])
val_abs = vec.transform(val['ABSTRACT'])
tst_abs = vec.transform(test['ABSTRACT'])
    
trn2 = np.hstack((trn_abs.toarray(), trn[topic_col]))
val2 = np.hstack((val_abs.toarray(), val[topic_col]))
tst2 = np.hstack((tst_abs.toarray(), test[topic_col]))
    
trn2 = csr_matrix(trn2.astype('int16'))
val2 = csr_matrix(val2.astype('int16'))
tst2 = csr_matrix(tst2.astype('int16'))

In [12]:
def corpus_formation(vectorizer,df,test,val,max_feat,max_=None):
    vec = vectorizer(max_features = max_feat,max_df=max_)
    combined = list(df['ABSTRACT']) + list(test['ABSTRACT'])
    vec.fit(combined)
    
    trn, val = train_test_split(df, test_size=0.2, random_state=2021)

    trn_abs = vec.transform(trn['ABSTRACT'])
    val_abs = vec.transform(val['ABSTRACT'])
    tst_abs = vec.transform(test['ABSTRACT'])
    
    trn2 = np.hstack((trn_abs.toarray(), trn[topic_col]))
    val2 = np.hstack((val_abs.toarray(), val[topic_col]))
    tst2 = np.hstack((tst_abs.toarray(), test[topic_col]))
    
    trn2 = csr_matrix(trn2.astype('int16'))
    val2 = csr_matrix(val2.astype('int16'))
    tst2 = csr_matrix(tst2.astype('int16'))
    
    return trn2,val2,tst2

In [13]:
x,y,z = corpus_formation(CountVectorizer,df,test,val,10000)

TypeError: '<' not supported between instances of 'NoneType' and 'int'

In [None]:
x.shape,y.shape,z.shape

In [None]:
x.shape,y.shape,z.shape

In [None]:
%%time

from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(LogisticRegression(C = 10, n_jobs=-1))
clf.fit(trn_abs, trn[target])

In [None]:
val_preds = clf.predict(val_abs)
f1_score(val[target], val_preds, average='micro')

In [None]:
vec = TfidfVectorizer(max_features=10000)
_ = vec.fit(list(df['ABSTRACT']) + list(test['ABSTRACT']))

trn_abs = vec.transform(trn['ABSTRACT'])
val_abs = vec.transform(val['ABSTRACT'])
tst_abs = vec.transform(test['ABSTRACT'])

clf = OneVsRestClassifier(LogisticRegression(C = 10, n_jobs=-1))
_ = clf.fit(trn_abs, trn[target])

val_preds = clf.predict(val_abs)
f1_score(val[target], val_preds, average='micro')

In [None]:
vec = CountVectorizer(max_features=10000)
_ = vec.fit(list(df['ABSTRACT']) + list(test['ABSTRACT']))

trn_abs = vec.transform(trn['ABSTRACT'])
val_abs = vec.transform(val['ABSTRACT'])
tst_abs = vec.transform(test['ABSTRACT'])
print(trn_abs.shape, val_abs.shape, tst_abs.shape)

trn2 = np.hstack((trn_abs.toarray(), trn[topic_col]))
val2 = np.hstack((val_abs.toarray(), val[topic_col]))
tst2 = np.hstack((tst_abs.toarray(), test[topic_col]))

print(trn2.shape, val2.shape, tst2.shape)

In [None]:
from scipy.sparse import csr_matrix

trn2 = csr_matrix(trn2.astype('int16'))
val2 = csr_matrix(val2.astype('int16'))
tst2 = csr_matrix(tst2.astype('int16'))

In [None]:
clf = OneVsRestClassifier(LogisticRegression(C = 10, n_jobs=-1))
_  = clf.fit(trn2, trn[target])

val_preds = clf.predict(val2)
f1_score(val[target], val_preds, average='micro')

In [None]:
vec = TfidfVectorizer(max_features=12500)
_ = vec.fit(list(df['ABSTRACT']) + list(test['ABSTRACT']))

trn_abs = vec.transform(trn['ABSTRACT'])
val_abs = vec.transform(val['ABSTRACT'])
tst_abs = vec.transform(test['ABSTRACT'])
print(trn_abs.shape, val_abs.shape, tst_abs.shape)

trn2 = np.hstack((trn_abs.toarray(), trn[topic_col]))
val2 = np.hstack((val_abs.toarray(), val[topic_col]))
tst2 = np.hstack((tst_abs.toarray(), test[topic_col]))

print(trn2.shape, val2.shape, tst2.shape)

In [None]:
trn2 = csr_matrix(trn2.astype('int16'))
val2 = csr_matrix(val2.astype('int16'))
tst2 = csr_matrix(tst2.astype('int16'))

In [None]:
clf = OneVsRestClassifier(LogisticRegression(C = 10, n_jobs=-1))
_  = clf.fit(trn2, trn[target])

val_preds = clf.predict(val2)
f1_score(val[target], val_preds, average='micro')

In [None]:
val[target]

In [None]:
pd.DataFrame(val_preds,columns=val[target].columns)

In [None]:
clf = OneVsRestClassifier(MultinomialNB(alpha = 0.5))
_  = clf.fit(trn2, trn[target])

val_preds = clf.predict(val2)
f1_score(val[target], val_preds, average='micro')