In [42]:
import os.path
import sys
import django
sys.path.append('/home/galm/software/django/tmv/BasicBrowser')
sys.path.append('/home/max/software/django-tmv/tmv_mcc-apsis/BasicBrowser')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()
import scoping
from scoping.models import *
from utils.text import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC

import pandas as pd
import numpy as np

import pickle
import scipy.sparse

import matplotlib.pyplot as plt

In [43]:
seen_df = pd.read_csv('../data/0_labelled_documents.csv')
relevance = pd.read_csv('../data/1_document_relevance.csv')

rel_ids = relevance.loc[relevance['0 - relevance - upper_pred']>0.5,'id']

unseen_df = pd.read_csv('../data/0_unlabelled_documents.csv')
unseen_df = unseen_df[unseen_df['id'].isin(rel_ids)]

df = (pd.concat([seen_df,unseen_df])
      .sort_values('id')
      .sample(frac=1, random_state=1)
      .reset_index(drop=True)
)
print(df.shape)

seen_index = df[df['seen']==1].index
unseen_index = df[df['seen']==0].index
new_index = df[(df['seen']==1) & (df['ar5']==0)].index
rel_index = df[df['relevant']==1].index
r_index = df[df["random_sample"]==1].index
physical_index = df[df['physical_tags']==1].index

(95916, 243)


In [44]:
from alterations import postfix_data
df = postfix_data(df)

In [45]:
revectorize = False

X_exists = os.path.isfile(f'../data/X_{df.shape[0]}.npz')

if revectorize is True or X_exists is False:
    print("running vectorisation again")
    vec = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=10, max_df=0.8, strip_accents='unicode', 
        max_features=20000,
        tokenizer=snowball_stemmer()
    )
    vec.fit(df.loc[seen_index,"content"].astype("str"))

    X = vec.transform(df['content'].astype("str"))   
    with open (f'../data/vec_{seen_df.shape[0]}.pickle','wb') as f:
        pickle.dump(vec, f)
    import scipy.sparse
    scipy.sparse.save_npz(f'../data/X_{df.shape[0]}.npz', X)
else:
    print("loading feature matrix")
    with open (f'../data/vec_{seen_df.shape[0]}.pickle','rb') as f:
        vec = pickle.load(f)
        X = scipy.sparse.load_npz(f'../data/X_{df.shape[0]}.npz')
        
X.shape

loading feature matrix


(95916, 7450)

In [46]:
broad_cats = [x for x in df.columns if "12 - " in x and "Physical systems" not in x and "prediction" not in x]
y = np.matrix(df[broad_cats])

In [47]:
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier

kf = KFold(n_splits=10)
kfs = kf.split(X[rel_index],y[rel_index])
y_preds = []
for k_train, k_test in kfs:
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight="balanced", probability=True))
    k_train = rel_index[k_train]
    clf.fit(X[k_train],y[k_train])
    y_preds.append(clf.predict(X[unseen_index]))
    
y_preds = np.array(y_preds)
np.save("../data/y_preds_broad_cats.npz",y_preds)

In [48]:
y_preds = np.load("../data/y_preds_broad_cats.npz.npy")
y_preds.shape

(10, 93604, 5)

In [49]:
for i, c in enumerate(broad_cats):
    mean_pred = np.mean(y_preds[:,:,i], axis=0)
    std_pred = np.std(y_preds[:,:,i], axis=0)
    preds_upper = np.minimum(mean_pred + std_pred, 1)
    preds_lower = np.maximum(mean_pred - std_pred, 0)
    
    print(c)
    print(np.where(mean_pred>=0.5,1,0).sum())


    df.loc[unseen_index,f'{c} - mean_prediction'] = mean_pred
    df.loc[unseen_index,f'{c} - std_prediction'] = std_pred
    df.loc[unseen_index,f'{c} - lower_pred'] = preds_lower
    df.loc[unseen_index,f'{c} - upper_pred'] = preds_upper

12 - Coastal and marine Ecosystems
12661
12 - Human and managed
9293
12 - Mountains, snow and ice
4630
12 - Rivers, lakes, and soil moisture
11872
12 - Terrestrial ES
30235


In [50]:
basic_cols = [x for x in df.columns if " - " not in x]
cat_cols = [x for x in df.columns if x not in basic_cols and "12 - " in x or "2 - " in x or "6 - " in x]


In [51]:
df[basic_cols+cat_cols].to_csv('../data/1_predicted_category_documents.csv')

## Drivers

In [53]:
driver_unseen = unseen_index.union(
    df[
        (df['seen']==1) & 
        (df['physical_tags']==0) & 
        (df['relevant']==1)
    ].index
)
print(len(driver_unseen), len(unseen_index))

94075 93604


In [54]:
## New summary cats
df['6 - Precipitation'] = 0
df.loc[(df['6 - 05 Changes in precipitation']==1) | (df['6 - 08 Changes in strong precipitation']==1),"6 - Precipitation"] = 1

df['Human drivers'] = 0
df.loc[
    (df['6 - 76 Human water use']==1) |
    (df['6 - 77 Land use change']==1),
    '6 - Human drivers' 
] = 1
 
df.loc[
    (df['6 - 03 Extreme temperature']==1) & 
    (df['6 - 16 Sea surface temperature']==0) & 
    (df['6 - 21 Water temperature (freshwater)']==0),
    '6 - 02 Air or land surface temperature changes'
] = 1

df['6 - Temperature'] = 0
df.loc[
    (df['6 - 03 Extreme temperature']==1) | 
    (df['6 - 16 Sea surface temperature']==1) | 
    (df['6 - 02 Air or land surface temperature changes']==1) |
    (df['6 - 21 Water temperature (freshwater)']==1),
    '6 - Temperature'
] = 1

df['6 - Water availability'] = 0
df.loc[
    (df['6 - 26 Drought frequency and intensity']==1) |
    (df['6 - 07 Aridity/dryness']==1) |
    (df['6 - 28 River runoff']==1),
    '6 - Water availability'
] = 1

df['6 - Other'] = 0
df.loc[
    (df['6 - Temperature']==0) & 
    (df['6 - Precipitation']==0),
    '6 - Other'
] = 1

new_rel_index = physical_index.intersection(rel_index)

driver_selection = [
    #'6 - 01 CO2 concentration',
    '6 - Temperature',
    '6 - Precipitation',#'6 - 16 Sea surface temperature',
    '6 - Other'
    #'6 - 09 Atmospheric/marine circulation or teleconnections',
    #'6 - 10 Wind speed','6 - 11 Storms','6 - 14 Sea level change',#'6 - 26 Drought frequency and intensity',
    #'6 - 04 Radiation',
    #'6 - 33 Sea ice retreat',
    #'6 - No drivers', '6 - 30 Snow', '6 - Human drivers', 
    #'6 - 12 Seasonality',
    #'6 - Water availability',
]

In [55]:
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier

new_rel_index = physical_index.intersection(rel_index)

y = np.matrix(df[driver_selection])

kf = KFold(n_splits=10)
kfs = kf.split(X[new_rel_index],y[new_rel_index])
y_preds = []
for k_train, k_test in kfs:
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight="balanced", probability=True))
    k_train = new_rel_index[k_train]
    clf.fit(X[k_train],y[k_train])
    y_preds.append(clf.predict(X[driver_unseen]))
    
y_preds = np.array(y_preds)
np.save("../data/y_preds_drivers.npz",y_preds)

In [56]:
for i, c in enumerate(driver_selection):
    mean_pred = np.mean(y_preds[:,:,i], axis=0)
    std_pred = np.std(y_preds[:,:,i], axis=0)
    preds_upper = np.minimum(mean_pred + std_pred, 1)
    preds_lower = np.maximum(mean_pred - std_pred, 0)


    df.loc[driver_unseen,f'{c} - mean_prediction'] = mean_pred
    df.loc[driver_unseen,f'{c} - std_prediction'] = std_pred
    df.loc[driver_unseen,f'{c} - lower_pred'] = preds_lower
    df.loc[driver_unseen,f'{c} - upper_pred'] = preds_upper

In [57]:
basic_cols = [x for x in df.columns if " - " not in x]
cat_cols = [x for x in df.columns if x not in basic_cols and "12 - " in x or "2 - " in x or "6 - " in x and "hidden" not in x]
print(cat_cols)

df.to_csv('../data/1_predicted_category_documents.csv')

['12 - Coastal and marine Ecosystems', '12 - Human and managed', '12 - Mountains, snow and ice', '12 - Rivers, lakes, and soil moisture', '12 - Terrestrial ES', '16 - Climate Impact attribution', '16 - Climate attribution', '16 - Climate event attribution', '16 - Experimental evidence', '16 - Future/modelled impacts', '16 - Impact event attribution', '16 - No Climate Impact attribution', '16 - Unclear', '16 - Weather sensitivity', '2 - 2.1. Climate change attribution', '2 - 2.2 Trend attribution', '2 - 2.3. Attribution to extreme event', '2 - 2.3. Impact trend attribution<hidden>', '2 - 2.4. Sensitivity', '2 - 2.5. Detection of a regional climate trend (no attribution)', '2 - 2.6. Null results', '6 - 01 CO2 concentration', '6 - 02 Air or land surface temperature changes', '6 - 03 Extreme temperature', '6 - 04 Radiation', '6 - 05 Changes in precipitation', '6 - 06 Humidity', '6 - 07 Aridity/dryness', '6 - 08 Changes in strong precipitation', '6 - 09 Atmospheric/marine circulation or tel

## Attribution classes

In [58]:
df['2 - Trend or climate change attribution'] = 0
df.loc[
    (df['2 - 2.2 Trend attribution']==1) | (df['2 - 2.1. Climate change attribution']==1),
    '2 - Trend or climate change attribution'
] = 1

attribution_classes =  [
 '2 - Trend or climate change attribution',
 '2 - 2.4. Sensitivity',
 '2 - 2.5. Detection of a regional climate trend (no attribution)'
]

y = np.matrix(df[attribution_classes])

In [59]:
kf = KFold(n_splits=10)
kfs = kf.split(X[new_rel_index],y[new_rel_index])
y_preds = []
for k_train, k_test in kfs:
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight="balanced", probability=True))
    k_train = new_rel_index[k_train]
    clf.fit(X[k_train],y[k_train])
    y_preds.append(clf.predict(X[driver_unseen]))
    
y_preds = np.array(y_preds)
np.save("../data/y_preds_attribution.npz",y_preds)

In [60]:
for i, c in enumerate(attribution_classes):
    mean_pred = np.mean(y_preds[:,:,i], axis=0)
    std_pred = np.std(y_preds[:,:,i], axis=0)
    preds_upper = np.minimum(mean_pred + std_pred, 1)
    preds_lower = np.maximum(mean_pred - std_pred, 0)


    df.loc[driver_unseen,f'{c} - mean_prediction'] = mean_pred
    df.loc[driver_unseen,f'{c} - std_prediction'] = std_pred
    df.loc[driver_unseen,f'{c} - lower_pred'] = preds_lower
    df.loc[driver_unseen,f'{c} - upper_pred'] = preds_upper

In [61]:
basic_cols = [x for x in df.columns if " - " not in x]
cat_cols = [x for x in df.columns if x not in basic_cols and "12 - " in x or "2 - " in x or "6 - " in x and "hidden" not in x]
print(cat_cols)

df.to_csv('../data/1_predicted_category_documents.csv')

['12 - Coastal and marine Ecosystems', '12 - Human and managed', '12 - Mountains, snow and ice', '12 - Rivers, lakes, and soil moisture', '12 - Terrestrial ES', '16 - Climate Impact attribution', '16 - Climate attribution', '16 - Climate event attribution', '16 - Experimental evidence', '16 - Future/modelled impacts', '16 - Impact event attribution', '16 - No Climate Impact attribution', '16 - Unclear', '16 - Weather sensitivity', '2 - 2.1. Climate change attribution', '2 - 2.2 Trend attribution', '2 - 2.3. Attribution to extreme event', '2 - 2.3. Impact trend attribution<hidden>', '2 - 2.4. Sensitivity', '2 - 2.5. Detection of a regional climate trend (no attribution)', '2 - 2.6. Null results', '6 - 01 CO2 concentration', '6 - 02 Air or land surface temperature changes', '6 - 03 Extreme temperature', '6 - 04 Radiation', '6 - 05 Changes in precipitation', '6 - 06 Humidity', '6 - 07 Aridity/dryness', '6 - 08 Changes in strong precipitation', '6 - 09 Atmospheric/marine circulation or tel