In [7]:
import numpy as np
import pandas as pd

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, OneClassSVM

In [4]:
# Load the hand-labelled documents and preview the first five rows.
labelled_csv = 'data/0a_labelled_documents.csv'
seen_df = pd.read_csv(labelled_csv)
seen_df.head(5)

Unnamed: 0,id,content,title,PY,wosarticle__de,wosarticle__wc,relevant,seen,1 - Adaptation,1 - Impact,1 - Mitigation
0,282602,Background: Among the many challenges faced by...,Health coping strategies of the people vulnera...,2013.0,Health coping strategies; Choice of care; Unqu...,"['Public, Environmental & Occupational Health']",1.0,1.0,1.0,0.0,0.0
1,773200,The projected rise in food-related greenhouse ...,Mitigation potential and global health impacts...,2017.0,0,['Environmental Sciences; Environmental Studie...,1.0,1.0,0.0,0.0,1.0
2,3446429,Background: Dengue fever control in the tropic...,Spatial and temporal variation of dengue incid...,2019.0,Dengue; Bali; Spatial analysis; Conditional au...,"['Public, Environmental & Occupational Health;...",1.0,1.0,0.0,1.0,0.0
3,1672609,Vector-borne infectious diseases continue to b...,Gene drives as a response to infection and res...,2019.0,CRISPR; gene drive; gene editing; vector-borne...,['Infectious Diseases; Pharmacology & Pharmacy'],0.0,1.0,0.0,0.0,0.0
4,3393270,A chronic symptomatic acromioclavicular joint ...,Arthroscopically assisted stabilization of chr...,2013.0,Chronic acromioclavicular joint dislocation; A...,['Orthopedics; Surgery'],0.0,1.0,0.0,0.0,0.0


In [14]:
# Load the documents that have not been labelled yet.
unlabelled_csv = 'data/0b_unlabelled_documents.csv'
unseen_df = pd.read_csv(unlabelled_csv)

In [17]:
# Stack labelled and unlabelled documents into one frame. Sorting by id
# before the seeded shuffle makes the final row order independent of the
# order in which the two source files were concatenated.
df = pd.concat([seen_df, unseen_df]).sort_values('id')
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

# Row positions (in the reset 0..n-1 index) of the labelled documents.
seen_index = df.index[df['seen'].eq(1)]
print(df.shape)
df.head()

(699525, 14)


Unnamed: 0,id,content,title,PY,wosarticle__de,wosarticle__wc,relevant,seen,1 - Adaptation,1 - Impact,1 - Mitigation,wosarticle__dt,tslug,UT__UT
0,2734050,An Ejector heat pump-boosted District Heating ...,Study of an innovative ejector heat pump-boost...,2013.0,,,0.0,0.0,,,,Article,studyofaninnovativeejectorheatpumpboosteddistr...,WOS:000322051300010
1,4964870,Cervical spine spondylosis is so prevalent in ...,Cervical spinal canal body ratio in normal ind...,2020.0,,,0.0,0.0,,,,Article,cervicalspinalcanalbodyratioinnormalindividual...,2-s2.0-85092231046
2,908214,Considering the effect of ash and moisture on ...,Modified respiratory quotient to evaluate the ...,2018.0,,,0.0,0.0,,,,Article,modifiedrespiratoryquotienttoevaluatetheenviro...,WOS:000427373200007
3,4871627,Recent Al Qaeda threats and related jihadi pro...,The oForest Jihado,2009.0,,,0.0,0.0,,,,Article,theoforestjihado,WOS:000269100200003
4,3736036,This paper aims to improve medical strategies ...,Single step synthesis of glutamic/tartaric aci...,2020.0,,,0.0,0.0,,,,Article,singlestepsynthesisofglutamictartaricacidstabi...,2-s2.0-85081347753


In [18]:
# TF-IDF features over unigrams and bigrams of the document text.
# The boolean options are passed as real booleans: recent scikit-learn
# releases validate parameter types and reject integers such as `use_idf=1`.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=4,
    max_df=0.8,
    strip_accents='unicode',
    max_features=10000,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words="english",
)
# The vocabulary and IDF weights are learned from the labelled rows only;
# every document (labelled and unlabelled) is then projected into that space.
vec.fit(df.loc[seen_index, 'content'])
ab_X = vec.transform(df['content'])

In [19]:
# Binary relevance classifier trained on the labelled rows.
# probability=True fits an internal cross-validated calibration whose data
# shuffling is random; fixing random_state makes the predicted probabilities
# reproducible across kernel restarts.
y = df['relevant']
clf = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=1)
clf.fit(ab_X[seen_index], y[seen_index])

SVC(class_weight='balanced', probability=True)

In [21]:
# Row positions of the unlabelled documents, then class probabilities for
# each of them from the classifier currently bound to `clf`.
# NOTE(review): `clf` is rebound to a different model in a later cell, so
# this cell must run before that one.
unseen_index = df.index[df['seen'].eq(0)]
y_pred = clf.predict_proba(ab_X[unseen_index])

In [22]:
# Persist the relevance probabilities (np.save appends the .npy suffix).
# Rows are in the same order as `unseen_index`; no document ids are stored
# alongside, so the frame must be rebuilt with the same shuffle seed to
# map rows back to documents.
# numpy is imported in the notebook's top import cell.
np.save("data/predictions", y_pred)

In [20]:
# 'prediction' starts as the known label and is overwritten, for unlabelled
# rows, with column 1 of the relevance classifier's probabilities
# (presumably the relevant == 1 class; verify against the fitted classes_ order).
df['prediction'] = df['relevant']
df.loc[unseen_index, "prediction"] = y_pred[:, 1]

# Rows labelled as relevant: the training set for the category classifier.
rel_index = df.index[df['relevant'] == 1]
# Unlabelled rows predicted relevant. Selecting from unseen_index directly
# avoids relying on a "< 1" sentinel to tell predictions apart from labels.
pred_rel_index = unseen_index[y_pred[:, 1] >= 0.5]

# One binary linear SVC per category; random_state fixes the shuffling used
# for probability calibration so results are reproducible.
clf = OneVsRestClassifier(
    SVC(kernel='linear', class_weight="balanced", probability=True, random_state=1)
)
classes = ["1 - Mitigation", "1 - Adaptation", "1 - Impact"]
# Plain 2-D ndarray of the category indicator columns (one row per document).
# np.matrix is discouraged by NumPy and rejected by scikit-learn's input checks.
y = df[classes].to_numpy()

In [21]:
# Fit the category classifier on the rows labelled as relevant.
# np.asarray guarantees a plain ndarray even if `y` was built as np.matrix,
# which scikit-learn's input validation does not accept.
y_rel = np.asarray(y[rel_index])
clf.fit(ab_X[rel_index], y_rel)

# Category probabilities for the unlabelled rows predicted to be relevant.
# predict_proba cannot be called on zero rows, so fall back to an empty
# (0, n_categories) array when nothing was predicted relevant.
if len(pred_rel_index) > 0:
    m_y_pred = clf.predict_proba(ab_X[pred_rel_index])
else:
    m_y_pred = np.empty((0, y_rel.shape[1]))

# Rows are in the same order as `pred_rel_index` (np.save appends .npy).
np.save("data/m_predictions", m_y_pred)