# GtR Topic Classifier

## Preamble

In [None]:
%run notebook_preamble.ipy

# Show up to 99 columns when displaying wide DataFrames. The fully qualified
# option key is used because pandas resolves abbreviated keys by pattern
# matching and rejects a pattern that matches more than one option.
pd.set_option('display.max_columns', 99)

In [None]:
import ast
import seaborn as sns
from itertools import chain
from collections import Counter, defaultdict
import itertools
import re

from eu_funding.visualization.visualize import pdf_cdf
from eu_funding.utils.nlp_utils import remove_markup, normalise_digits, lemmatize, bigram, stringify_docs
# from src.visualization.visualize import pdf_cdf

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.feature_selection import chi2

import networkx as nx
import community

import warnings

warnings.simplefilter('ignore', UserWarning)

In [None]:
from nesta.packages.nlp_utils import preprocess

In [None]:
# Columns whose CSV cells are parsed with ast.literal_eval when loading.
list_cols = ['research_topics', 'research_subjects']

gtr_projects_path = os.path.join(ext_data_path, 'gtr', 'gtr_projects.csv')
list_converters = dict.fromkeys(list_cols, ast.literal_eval)
gtr_projects_df = pd.read_csv(gtr_projects_path, converters=list_converters)

In [None]:
gtr_projects_df.head()

In [None]:
# Frequency of every subject / topic label over all projects.
research_subject_counter = Counter(
    chain.from_iterable(gtr_projects_df['research_subjects'])
)
research_topic_counter = Counter(
    chain.from_iterable(gtr_projects_df['research_topics'])
)

In [None]:
print('There are {} unique research subjects in the GtR projects dataset.'.format(len(research_subject_counter)))
print('There are {} unique research topics in the GtR projects dataset.'.format(len(research_topic_counter)))

In [None]:
research_subject_counter.most_common(40)

### Field Definition Through Community Detection

In [None]:
# Every unordered pair of topics that appear together on a project.
# Each project's topics are sorted *before* pairing so a pair is always
# emitted as (a, b) with a <= b. Sorting the list of pairs afterwards (as
# before) left (a, b) and (b, a) as two distinct edges whenever two projects
# listed the same topics in a different order.
combos = list(chain.from_iterable(
    itertools.combinations(sorted(topics), 2)
    for topics in gtr_projects_df['research_topics']
))

In [None]:
research_topic_edge_counter = Counter(combos)

In [None]:
# Total number of topic assignments (repeats included) over all projects.
total_research_topics = sum(
    len(topics) for topics in gtr_projects_df['research_topics']
)

In [None]:
def association_strength(combo, occurrences, cooccurrences, total):
    """Return the association strength of a pair of items.

    The value is ``2 * total * c_ij / (o_i * o_j)``, where ``c_ij`` is the
    pair's co-occurrence count and ``o_i``, ``o_j`` are the occurrence counts
    of its two members.

    Args:
        combo: Two-element sequence identifying the pair; it is also the key
            used to look the pair up in ``cooccurrences``.
        occurrences: Mapping from item to its occurrence count.
        cooccurrences: Mapping from pair to its co-occurrence count.
        total: Scaling constant applied to the numerator.

    Returns:
        The association strength as a float.

    Raises:
        ZeroDivisionError: If either member has an occurrence count of 0.
    """
    first, second = combo[0], combo[1]
    numerator = 2 * total * cooccurrences[combo]
    denominator = occurrences[first] * occurrences[second]
    return numerator / denominator

In [None]:
edges = set(combos)

In [None]:
assoc_strengths = [association_strength(
    edge,
    research_topic_counter, 
    research_topic_edge_counter, 
    total_research_topics) for edge in edges]

In [None]:
plt.hist(np.log10(assoc_strengths), bins=100)
plt.show()

In [None]:
# One row per topic pair, weighted by log10 of its association strength.
# `edges` is iterated in the same order that produced `assoc_strengths`.
# NOTE(review): log10 is negative for strengths below 1 - confirm that the
# downstream community detection is meant to receive negative edge weights.
edge_list = list(edges)
edge_df = pd.DataFrame({
    'source': [source for source, _ in edge_list],
    'target': [target for _, target in edge_list],
})
edge_df['weight'] = np.log10(assoc_strengths)
g = nx.from_pandas_edgelist(edge_df, edge_attr='weight')

In [None]:
class CommunityPartition:
    """Aggregate repeated community-detection runs on a single graph."""

    def __init__(self, graph):
        """Store the graph to partition.

        Args:
            graph: Graph object passed unchanged to
                ``community.best_partition``.
        """
        self.graph = graph

    def edgelist_to_cooccurrence(self, repeats, **best_partition_kwargs):
        """Count how often each pair of nodes lands in the same community.

        Runs ``community.best_partition`` ``repeats`` times and tallies, for
        every pair of nodes, the number of runs in which both were assigned
        to the same community.

        Args:
            repeats: Number of partitioning runs.
            **best_partition_kwargs: Forwarded to
                ``community.best_partition``.

        Returns:
            An ``nx.Graph`` with one edge per co-assigned pair whose
            ``weight`` is the number of runs in which the pair shared a
            community. Nodes that never share a community with another node
            do not appear in the graph.
        """
        edge_counter = Counter()
        for _ in range(repeats):
            partition = community.best_partition(self.graph, **best_partition_kwargs)
            edge_counter.update(self.partition_to_edgelist(partition))

        g = nx.Graph()
        g.add_weighted_edges_from(
            [(source, target, count) for (source, target), count in edge_counter.items()]
        )
        return g

    def partition_to_edgelist(self, partition):
        """List every pair of nodes that share a community.

        Args:
            partition: Mapping from node to community id.

        Returns:
            A list of 2-tuples, each sorted so that a pair always has the
            same representation regardless of node order in ``partition``.
        """
        edgelist = []
        # Only the member lists are needed; the loop variable is deliberately
        # not called `community` so it cannot be confused with the module.
        for members in self.reverse_index_partition(partition).values():
            edgelist.extend(
                tuple(sorted(pair)) for pair in itertools.combinations(members, 2)
            )
        return edgelist

    def reverse_index_partition(self, partition):
        """Invert a node -> community mapping.

        Args:
            partition: Mapping from node to community id.

        Returns:
            A ``defaultdict(list)`` mapping each community id to the list of
            its nodes, in the iteration order of ``partition``.
        """
        partition_reverse_mapping = defaultdict(list)
        for node, community_id in partition.items():
            partition_reverse_mapping[community_id].append(node)
        return partition_reverse_mapping

In [None]:
cp = CommunityPartition(g)

In [None]:
co = cp.edgelist_to_cooccurrence(3, resolution=.4)

In [None]:
nx.draw(co)

In [None]:
#Extract the best partition
part = community.best_partition(g, resolution=0.4, random_state=0, weight='weight')

In [None]:
set(part.values())

In [None]:
# Draw the co-occurrence graph, giving the nodes of each community in `part`
# their own grayscale shade (count / size, passed as a string).
community_ids = set(part.values())
size = float(len(community_ids))
pos = nx.spring_layout(co)
count = 0.
for com in community_ids:
    count += 1.
    list_nodes = [node for node, node_com in part.items() if node_com == com]
    shade = str(count / size)
    nx.draw_networkx_nodes(co, pos, list_nodes, node_size=20, node_color=shade)

nx.draw_networkx_edges(co, pos, alpha=0.5)
plt.show()

In [None]:
pd.Series(part).reset_index(drop=False).groupby(0)['index'].apply(lambda x: print(', '.join(list(x))+'\n'))

In [None]:
import pickle

In [None]:
# Manually specified discipline name for each community id in `part`.
# Several community ids share a name, so they collapse into one discipline.
# NOTE(review): the ids presumably only line up with the partition computed
# above (resolution=0.4, random_state=0) - re-check this table whenever the
# partition is regenerated.
category_name_lookup = {
    0: 'social_sciences',
    1: 'arts_linguistics',
    2: 'chem_mater_phys_eng',
    3: 'social_sciences',
    4: 'maths_computing_ee',
    5: 'social_sciences',
    6: 'social_sciences',
    7: 'biological_sciences',
    8: 'social_sciences',
    9: 'humanities',
    10: 'arts_linguistics',
    11: 'humanities',
    12: 'social_sciences',
    13: 'physics',
    14: 'environmental_sciences',
    15: 'social_sciences',
    16: 'humanities'
}

# Map every topic in `part` to the discipline name of its community.
# Raises KeyError if `part` holds a community id that is not a key above.
topic_discipline_lookup = {top:category_name_lookup[disc] for top, disc in part.items()}

In [None]:
# Pickle the topic -> community-id partition and the community-id -> name
# table into `model_path`.
for filename, obj in (
    ('communities_partition.pkl', part),
    ('communities_partition_labels.pkl', category_name_lookup),
):
    with open(os.path.join(model_path, filename), 'wb') as f:
        pickle.dump(obj, f)

In [None]:
# Translate each project's topics into discipline names. Topics that are not
# in `topic_discipline_lookup` (i.e. were not assigned to any community in
# `part`) are skipped instead of raising KeyError.
gtr_projects_df['discipline'] = gtr_projects_df['research_topics'].apply(
    lambda x: [topic_discipline_lookup[val] for val in x if val in topic_discipline_lookup])

gtr_projects_df['discipline_sets'] = [set(x) for x in gtr_projects_df['discipline']]

# True for exactly one discipline, False for several, NaN when there are none.
gtr_projects_df['single_disc'] = [
    len(x) == 1 if x else np.nan for x in gtr_projects_df['discipline_sets']]

gtr_projects_df['single_disc'].mean()

In [None]:
# Projects whose funder is 'MRC' are labelled 'medical_sciences' only; every
# other project keeps its existing discipline set.
gtr_projects_df['discipline_sets'] = [
    {'medical_sciences'} if funder == 'MRC' else disciplines
    for funder, disciplines in zip(
        gtr_projects_df['funder_name'], gtr_projects_df['discipline_sets'])
]

In [None]:
def modal_value(l):
    """Return the most frequent element of ``l``.

    Args:
        l: Iterable of hashable values.

    Returns:
        The most common value (ties are resolved by ``Counter.most_common``),
        or ``np.nan`` when ``l`` is empty.
    """
    c = Counter(l)
    try:
        return c.most_common(1)[0][0]
    except IndexError:
        # most_common(1) is an empty list when there is nothing to count.
        return np.nan

gtr_projects_df['modal_discipline'] = [modal_value(d) for d in gtr_projects_df['discipline_sets']]

In [None]:
gtr_projects_df['modal_discipline'].value_counts()

In [None]:
Counter(chain(*gtr_projects_df['discipline_sets'])).most_common()

In [None]:
n_labels = [True if len(s) > 0 else False for s in gtr_projects_df['discipline_sets']]

In [None]:
# Keep only projects that have an abstract ...
has_abstract = gtr_projects_df['abstract_texts'].notnull()
gtr_projects_df = gtr_projects_df[has_abstract]
# ... whose abstract is longer than 250 characters ...
long_enough = gtr_projects_df['abstract_texts'].str.len() > 250
gtr_projects_df = gtr_projects_df[long_enough]
# ... and that carry at least one discipline label.
n_labels = [len(s) > 0 for s in gtr_projects_df['discipline_sets']]
gtr_projects_df = gtr_projects_df[n_labels]

### Text Preprocessing

In [None]:
import spacy
from gensim.models.phrases import Phraser

In [None]:
# Load the English spaCy model and drop the parser and NER components.
nlp = spacy.load('en')
for pipe_name in ('parser', 'ner'):
    nlp.remove_pipe(pipe_name)

# Read the extra stop-word list, one entry per line.
with open(os.path.join(raw_data_path, 'stopwords_en_long.txt'), 'r') as f:
    stopwords = f.read().splitlines()

# Flag the lower-case, upper-case and title-case form of every stop word.
for stopword in stopwords:
    for variant in (stopword.lower(), stopword.upper(), stopword.title()):
        nlp.vocab[variant].is_stop = True

In [None]:
abstracts = [remove_markup(a) for a in gtr_projects_df['abstract_texts']]
abstracts = [normalise_digits(a) for a in abstracts]
abstracts = lemmatize(abstracts, nlp)

In [None]:
bigrammer = Phraser.load(os.path.join(model_path, 'gtr_discipline_bigrammer.pkl'))
abstracts = bigram(abstracts, phraser=bigrammer)
abstracts_str = list(stringify_docs(abstracts))

In [None]:
gtr_projects_df['abstract_processed'] = abstracts_str

### Single Label

In [None]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

In [None]:
gtr_projects_df['n_disciplines'] = [len(a) for a in gtr_projects_df['discipline_sets']]
gtr_pure_df = gtr_projects_df[gtr_projects_df['n_disciplines'] == 1]

In [None]:
tfidf_single = TfidfVectorizer(
    max_df=0.4, 
    min_df=5,
    sublinear_tf=True, 
    norm='l2'
)
tfidf_pure_vecs = tfidf_single.fit_transform(gtr_pure_df['abstract_processed'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_pure_vecs, gtr_pure_df['modal_discipline'], train_size=0.8, test_size=0.2)

In [None]:
# Keep the 40% of TF-IDF features with the highest chi2 score. The selector is
# fitted on the training split only and then applied to the test split.
# `percentile` is passed by keyword because it is keyword-only in current
# scikit-learn releases.
sp = SelectPercentile(chi2, percentile=40)
sp_vecs = sp.fit_transform(X_train, y_train)
sp_vecs_test = sp.transform(X_test)

In [None]:
# Shape of the single-label TF-IDF matrix before feature selection
# (`tfidf_vecs` is only created later, in the Multilabel section).
tfidf_pure_vecs.shape

In [None]:
sp_vecs.shape

In [None]:
lr = LogisticRegression(C=10, solver='lbfgs', multi_class='auto')
mnb = MultinomialNB()
cmb = ComplementNB()
voter = VotingClassifier(estimators=[('lr', lr), ('cmb', cmb), ('mnb', mnb)])

In [None]:
pipe = Pipeline??

In [None]:
from sklearn.model_selection import GridSearchCV

#### Optimise

In [None]:
lr_params = {'C': [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100]}
lr_grid = GridSearchCV(lr, param_grid=lr_params, cv=5, verbose=2)

In [None]:
lr_grid.fit(sp_vecs, y_train)
lr_best = lr_grid.best_estimator_
print(classification_report(y_test, lr_best.predict(sp_vecs_test)))

In [None]:
# Label the confusion matrix with the fitted classifier's own classes, and pin
# the row/column order to the same sequence through `labels=`.
y_pred = lr_best.predict(sp_vecs_test)
class_labels = lr_best.classes_
sns.heatmap(
    pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=class_labels),
        columns=class_labels,
        index=class_labels
    ),
    annot=True,
    fmt='d'
)

#### Export Pipe

In [None]:
# Single-label export pipeline: TF-IDF -> chi2 feature selection (top 40%)
# -> logistic regression, all unfitted at this point.
tfidf_single = TfidfVectorizer(
    max_df=0.4, 
    min_df=5,
    sublinear_tf=True, 
    norm='l2'
)
sp = SelectPercentile(chi2, percentile=40)
lr = LogisticRegression(C=10, solver='lbfgs', multi_class='auto')
pipe = Pipeline(
    steps=[
        # Use the vectorizer configured in this cell, not the separate
        # `tfidf` object that belongs to the multilabel section.
        ('tfidf', tfidf_single),
        ('sp', sp),
        ('lr', lr)
    ]
)

In [None]:
pipe.fit(gtr_pure_df['abstract_processed'], gtr_pure_df['modal_discipline'])

In [None]:
joblib.dump(pipe, os.path.join(model_path, 'gtr_discipline_lvl9_lr_20190222.pkl'))

### Multilabel

In [None]:
tfidf = TfidfVectorizer(
    max_df=0.3, 
    min_df=5,
    sublinear_tf=True, 
    norm='l2'
)
tfidf_vecs = tfidf.fit_transform(abstracts_str)

In [None]:
# Sort the discipline names so the label-column order is identical in every
# session; iterating a bare set of strings gives an order that can change
# between interpreter runs, which would silently permute the outputs of a
# model saved in one session and loaded in another.
classes = sorted(set(chain.from_iterable(gtr_projects_df['discipline_sets'])))
mlb = MultiLabelBinarizer(classes=classes)
target_binarized = mlb.fit_transform(gtr_projects_df['discipline_sets'])
target_binarized_df = pd.DataFrame(target_binarized, columns=mlb.classes_)

In [None]:
# 80/20 split of the multilabel data, followed by chi2 feature selection
# (top 40%) fitted on the training split only.
X_train, X_test, y_train, y_test = train_test_split(tfidf_vecs, target_binarized_df, train_size=0.8, test_size=0.2)
# `percentile` is keyword-only in current scikit-learn releases.
sp = SelectPercentile(chi2, percentile=40)
sp_vecs = sp.fit_transform(X_train, y_train)
sp_vecs_test = sp.transform(X_test)

In [None]:
lr = LogisticRegression(solver='lbfgs', multi_class='auto')
mnb = MultinomialNB()
cmb = ComplementNB()
voter = VotingClassifier(estimators=[('lr', lr), ('cmb', cmb), ('mnb', mnb)])

In [None]:
for cls in mlb.classes_:
    print(cls)
    lr.fit(sp_vecs, y_train[cls])
    print(classification_report(y_test[cls], lr.predict(sp_vecs_test)))

In [None]:
# `voter` has not been fitted at this point and a VotingClassifier is trained
# on a single target column, so fit and evaluate it one discipline at a time,
# in the same way as the logistic-regression loop.
for cls in mlb.classes_:
    print(cls)
    voter.fit(sp_vecs, y_train[cls])
    print(classification_report(y_test[cls], voter.predict(sp_vecs_test)))

### Train RF

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
rf_params = {
     'bootstrap': [True, False],
     'max_depth': [10, 20, 50, 100, None],
     'max_features': ['auto', 'sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10],
     'n_estimators': [100, 200, 500]
}

In [None]:
rf = RandomForestClassifier(n_jobs=3)

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    n_iter=100,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=3
)

In [None]:
rf_random.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, rf_random.best_estimator_.predict(X_test)))

In [None]:
# Refit the best forest on the full multilabel TF-IDF matrix. This is the
# matrix produced by the `tfidf` vectorizer fitted in this section;
# `tfidf_vecs_filt` is not created until the "Alternative Feature Selection"
# section further down.
rf_random.best_estimator_.fit(tfidf_vecs, target_binarized)

In [None]:
from sklearn.externals import joblib

In [None]:
joblib.dump(rf_random.best_estimator_, os.path.join(model_path, 'gtr_discipline_classifier_rf_8_20192102.pkl'))
joblib.dump(tfidf, os.path.join(model_path, 'gtr_discipline_tfidf_20192102.pkl'))
bigrammer.save(os.path.join(model_path, 'gtr_discipline_bigrammer.pkl'))
nlp.vocab.to_disk(os.path.join(model_path, 'gtr_discipline_vocab'))

## Apply to CORDIS

In [None]:
cordis_projects_df = pd.read_csv(os.path.join(inter_data_path, 'fp7_h2020_projects.csv'))

In [None]:
cordis_abstracts = [remove_markup(a) for a in cordis_projects_df['objective'][:25]]
cordis_abstracts = [normalise_digits(a) for a in cordis_abstracts]
cordis_abstracts = lemmatize(cordis_abstracts, nlp)
cordis_abstracts = bigram(cordis_abstracts, phraser=bigrammer)
cordis_abstracts = list(stringify_docs(cordis_abstracts))

In [None]:
for abstract, pred in zip(cordis_projects_df['objective'][:25], pipe.predict(cordis_abstracts)):
    print(pred)
    print(abstract)
    print('\n==============')

In [None]:
cordis_tfidf_vecs = tfidf.transform(cordis_abstracts)

In [None]:
cordis_subject_probs = rf_random.best_estimator_.predict_proba(cordis_tfidf_vecs)
cordis_subjects = rf_random.best_estimator_.predict(cordis_tfidf_vecs)

In [None]:
subject_probs = np.zeros((len(cordis_projects_df), 8))

In [None]:
for i in range(8):
    subject_probs[:, i] = cordis_subject_probs[i][:, 0]

In [None]:
n = 101

In [None]:
cordis_projects_df['objective'][n]

In [None]:
pd.DataFrame(cordis_subjects, columns=mlb.classes_).sum()

## Alternative Feature Selection

In [None]:
# Collect, for every discipline, the vocabulary terms whose chi2 score is
# above that discipline's 90th percentile (NaN scores are ignored when the
# percentile is computed).
feature_terms = []
indices = np.arange(X_train.shape[1])
# The vocabulary does not change inside the loop, so build the array once.
feature_names = np.array(tfidf.get_feature_names())
for discipline in y_train.columns:
    features_chi2 = chi2(X_train, y_train[discipline])[0]
    threshold = np.percentile(features_chi2[~pd.isnull(features_chi2)], 90)
    discipline_indices = indices[features_chi2 > threshold]
    feature_terms.extend(feature_names[discipline_indices])

In [None]:
tfidf_stop_words = set(tfidf.get_feature_names()).difference(set(feature_terms))

In [None]:
# Re-fit the vectorizer with every non-selected term treated as a stop word.
# `stop_words` is documented as a list, so the set is converted explicitly.
tfidf = TfidfVectorizer(
#     max_df=0.5, 
    min_df=5, 
    sublinear_tf=True, 
    norm='l2',
    stop_words=list(tfidf_stop_words)
)
tfidf_vecs_filt = tfidf.fit_transform(abstracts_str)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_vecs_filt, target_binarized, train_size=0.9, test_size=0.1)