## Modelling Scratchpad

This is a scratchpad for modelling. code present in this notebook should not be assumed to be production grade.
refer to the models package of this project for the production version of this model

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../") 
import models.utility as utils


In [2]:
import warnings
warnings.filterwarnings('once')

import pandas as pd
import numpy as np
from sklearn import preprocessing as preprocessing
import sklearn.linear_model as linear_model
import sklearn.neighbors as neighbors
import sklearn.svm as svm
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics
from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV

In [3]:
raw_articles, raw_annotations, raw_locations = utils.load_dataset()

articles count: 9959
annotations count: 3221
locations count: 3562


In [4]:
uri_index = utils.get_annotation_uri_index(raw_annotations, raw_locations)
location_index = utils.get_location_index(raw_locations)
article_to_loc_index = utils.get_article_to_loc_index(raw_articles, raw_locations, uri_index)

In [5]:

uri_index['be48009bf7ec91d46983f1044db47f76']['locations']

[{'iid': '7fa1c86a-591d-46ea-a1f7-fec79e667b22',
  'council_annotation_uri': '0280893123d7785441eff9fa1641170b',
  'county': 'Derbyshire',
  'local_government_area': 'City of Derby',
  'nuts_region': 'East Midlands',
  'lng': -1.44377,
  'lat': 52.91562,
  'country': 'England',
  'name': 'Pride Park',
  'lastModifiedTime': 1516791066877,
  'origin_type': 'Other Settlement',
  'annotation_uri': ['be48009bf7ec91d46983f1044db47f76'],
  'id': 'ec79cba6-d6a6-485d-ab33-65ac0412f54b',
  'type': 'locations',
  'postcode_sector': 'DE24 8'}]

In [6]:
locations = pd.DataFrame(raw_locations)[["id", "lat", "lng", "nuts_region"]]
locations.set_index("id", inplace=True)

In [7]:
locations.head()

Unnamed: 0_level_0,lat,lng,nuts_region
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ec79cba6-d6a6-485d-ab33-65ac0412f54b,52.91562,-1.44377,East Midlands
64be7f94-cfcc-49c9-88f0-0dcace732a0f,51.44012,-2.61655,South West
5ceeb5cc-e1e4-4011-a3d8-3f874b793915,53.72422,-2.48623,North West
eeb33821-a955-403b-b732-5f5c169370eb,52.46249,-1.85391,West Midlands
19ee6c1b-75bb-4c85-bd39-22367e9fb9d3,53.65412,-1.75504,Yorkshire and the Humber


In [8]:
locations = utils.get_dummies(locations, "nuts_region")

In [9]:
def form_feature_set(raw_articles, uri_index):
    pub_feature_set = []
    publishers = []
    for article in raw_articles:
        for annotation in article["annotation_uri"]:
            if annotation in uri_index.keys():
                locs = uri_index[annotation]["locations"]
                for loc in locs:
                    publishers.append(utils.clean_publisher(article["publisher"]))
                    pub_feature_set.append(locations.loc[loc["id"], :])
                    
    pub_feature_set = pd.DataFrame(pub_feature_set)
    pub_feature_set["publisher"] = publishers
    return pub_feature_set

In [10]:
pub_feature_set = form_feature_set(raw_articles, uri_index)
pub_feature_set.head(2)

Unnamed: 0,lat,lng,East Midlands,Eastern,Isle of Man,London,North East,North West,Northern Ireland,Scotland,South East,South West,Wales,West Midlands,Yorkshire and the Humber,publisher
bd5a1b69-a99a-4934-9441-98fad30ec993,51.51437,-0.09229,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nwemail
eeb33821-a955-403b-b732-5f5c169370eb,52.46249,-1.85391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,rotherhamadvertiser


In [None]:
[a for a in raw_articles if a["publisher"] == "birminghammail"]

In [21]:
pub_feature_set[pub_feature_set["publisher"] == "birminghammail"].head()

Unnamed: 0,lat,lng,East Midlands,Eastern,Isle of Man,London,North East,North West,Northern Ireland,Scotland,South East,South West,Wales,West Midlands,Yorkshire and the Humber,publisher
2e1d9ff5-cbde-4eae-a311-ec4def1ec76e,51.56452,-0.10892,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,birminghammail
a4a85890-6f23-463a-afee-7356110f900c,52.43788,-1.78288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,birminghammail
a52b540f-765d-4696-831a-4fe12bf587b2,52.41275,-1.77755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,birminghammail
fbe7bb85-5ee9-4172-955c-ec2f9b9ac3b4,52.47928,-1.90294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,birminghammail
ae979cef-5667-44d0-b283-d811f8b42714,51.51279,-0.2057,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,birminghammail


In [11]:
n = 100
publishers = pub_feature_set["publisher"].unique()
publishers.sort()
large_publishers = publishers[pub_feature_set.groupby("publisher")["publisher"].count() > n]
print(large_publishers)

['belfastlive' 'birminghammail' 'blackpoolgazette' 'bournemouthecho'
 'chroniclelive' 'coventrytelegraph' 'dailyrecord' 'devonlive'
 'eastangliandailytimes' 'easterndailypress' 'gazettelive' 'getsurrey'
 'glasgowlive' 'inyourarea' 'lep' 'liverpoolecho' 'manchestereveningnews'
 'mirror' 'plymouthherald' 'somersetlive' 'theargus' 'thenorthernecho'
 'thestar' 'thisislocallondon' 'walesonline' 'yorkshirepost']


In [12]:
pub_feature_set.columns

Index(['lat', 'lng', 'East Midlands', 'Eastern', 'Isle of Man', 'London',
       'North East', 'North West', 'Northern Ireland', 'Scotland',
       'South East', 'South West', 'Wales', 'West Midlands',
       'Yorkshire and the Humber', 'publisher'],
      dtype='object')

In [13]:
def form_sample(dataset, locations):
    known_ids = set(dataset.index.values)
    dataset.drop("publisher", axis=1, inplace=True)
    dataset.reset_index(inplace=True, drop=True)
    
    available_location = locations.loc[~locations.index.isin(known_ids)]
    sample_size = dataset.shape[0]
    negative_sample = available_location.sample(sample_size).copy()
    negative_sample = negative_sample.reset_index(drop=True)
    negative_sample["Y"] = 0
    
    positive_sample = dataset.copy()
    positive_sample["Y"] = 1
    
    sample = pd.concat((positive_sample, negative_sample))
    return sample

In [14]:
for publisher in large_publishers:
    print("\n-------------------------------------------------")
    print(f"Publisher: {publisher}\n")
    dataset = pub_feature_set[pub_feature_set["publisher"] == publisher]
    sample = form_sample(dataset, locations)
    clf = svm.SVC()
    clf_sigmoid = CalibratedClassifierCV(clf, method='sigmoid')
    train, test = model_selection.train_test_split(sample, test_size=0.3)
    
    model = clf_sigmoid.fit(train.drop("Y", axis=1), y=train["Y"])
    y_hat = model.predict(test.drop("Y", axis=1))
    y_hat_prob = model.predict_proba(test.drop("Y", axis=1))
    print(pd.DataFrame(metrics.confusion_matrix(test["Y"], y_hat), 
                       index=["actual neg", "actual pos"], 
                       columns=["pred neg", "pred pos"]))
    
    print(metrics.classification_report(test["Y"], y_hat))


-------------------------------------------------
Publisher: belfastlive

            pred neg  pred pos
actual neg        31         2
actual pos         2        31
             precision    recall  f1-score   support

          0       0.94      0.94      0.94        33
          1       0.94      0.94      0.94        33

avg / total       0.94      0.94      0.94        66


-------------------------------------------------
Publisher: birminghammail

            pred neg  pred pos
actual neg       108         4
actual pos        22       112
             precision    recall  f1-score   support

          0       0.83      0.96      0.89       112
          1       0.97      0.84      0.90       134

avg / total       0.90      0.89      0.89       246


-------------------------------------------------
Publisher: blackpoolgazette



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        27         6
actual pos         4        41
             precision    recall  f1-score   support

          0       0.87      0.82      0.84        33
          1       0.87      0.91      0.89        45

avg / total       0.87      0.87      0.87        78


-------------------------------------------------
Publisher: bournemouthecho

            pred neg  pred pos
actual neg        64         2
actual pos         7        61
             precision    recall  f1-score   support

          0       0.90      0.97      0.93        66
          1       0.97      0.90      0.93        68

avg / total       0.94      0.93      0.93       134


-------------------------------------------------
Publisher: chroniclelive

            pred neg  pred pos
actual neg        59         1
actual pos         6        65
             precision    recall  f1-score   support

          0       0.91      0.98      0.94        60
          1       0.98      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        77         3
actual pos        18        72
             precision    recall  f1-score   support

          0       0.81      0.96      0.88        80
          1       0.96      0.80      0.87        90

avg / total       0.89      0.88      0.88       170


-------------------------------------------------
Publisher: dailyrecord

            pred neg  pred pos
actual neg        44         4
actual pos        10        43
             precision    recall  f1-score   support

          0       0.81      0.92      0.86        48
          1       0.91      0.81      0.86        53

avg / total       0.87      0.86      0.86       101


-------------------------------------------------
Publisher: devonlive

            pred neg  pred pos
actual neg        44         4
actual pos         3        47
             precision    recall  f1-score   support

          0       0.94      0.92      0.93        48
          1       0.92      0.94    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        33         2
actual pos         0        38
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        35
          1       0.95      1.00      0.97        38

avg / total       0.97      0.97      0.97        73


-------------------------------------------------
Publisher: easterndailypress

            pred neg  pred pos
actual neg        58         2
actual pos         2        52
             precision    recall  f1-score   support

          0       0.97      0.97      0.97        60
          1       0.96      0.96      0.96        54

avg / total       0.96      0.96      0.96       114


-------------------------------------------------
Publisher: gazettelive

            pred neg  pred pos
actual neg        31         3
actual pos         3        41
             precision    recall  f1-score   support

          0       0.91      0.91      0.91        34
          1       0.93      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        39        11
actual pos         4        57
             precision    recall  f1-score   support

          0       0.91      0.78      0.84        50
          1       0.84      0.93      0.88        61

avg / total       0.87      0.86      0.86       111


-------------------------------------------------
Publisher: glasgowlive

            pred neg  pred pos
actual neg        27         1
actual pos         0        35
             precision    recall  f1-score   support

          0       1.00      0.96      0.98        28
          1       0.97      1.00      0.99        35

avg / total       0.98      0.98      0.98        63


-------------------------------------------------
Publisher: inyourarea

            pred neg  pred pos
actual neg        36         5
actual pos         0        37
             precision    recall  f1-score   support

          0       1.00      0.88      0.94        41
          1       0.88      1.00   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        36         4
actual pos         3        37
             precision    recall  f1-score   support

          0       0.92      0.90      0.91        40
          1       0.90      0.93      0.91        40

avg / total       0.91      0.91      0.91        80


-------------------------------------------------
Publisher: liverpoolecho

            pred neg  pred pos
actual neg        41         7
actual pos         1        62
             precision    recall  f1-score   support

          0       0.98      0.85      0.91        48
          1       0.90      0.98      0.94        63

avg / total       0.93      0.93      0.93       111


-------------------------------------------------
Publisher: manchestereveningnews

            pred neg  pred pos
actual neg        35         4
actual pos         0        36
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        39
          1       0.90

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        42        15
actual pos        30        33
             precision    recall  f1-score   support

          0       0.58      0.74      0.65        57
          1       0.69      0.52      0.59        63

avg / total       0.64      0.62      0.62       120


-------------------------------------------------
Publisher: plymouthherald

            pred neg  pred pos
actual neg        36         2
actual pos         2        44
             precision    recall  f1-score   support

          0       0.95      0.95      0.95        38
          1       0.96      0.96      0.96        46

avg / total       0.95      0.95      0.95        84


-------------------------------------------------
Publisher: somersetlive

            pred neg  pred pos
actual neg        48         4
actual pos         4        52
             precision    recall  f1-score   support

          0       0.92      0.92      0.92        52
          1       0.93      0.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        32         7
actual pos         5        48
             precision    recall  f1-score   support

          0       0.86      0.82      0.84        39
          1       0.87      0.91      0.89        53

avg / total       0.87      0.87      0.87        92


-------------------------------------------------
Publisher: thenorthernecho

            pred neg  pred pos
actual neg        57         9
actual pos         0        66
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        66
          1       0.88      1.00      0.94        66

avg / total       0.94      0.93      0.93       132


-------------------------------------------------
Publisher: thestar

            pred neg  pred pos
actual neg        44         3
actual pos         0        52
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        47
          1       0.95      1.00  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


            pred neg  pred pos
actual neg        53         2
actual pos         4        36
             precision    recall  f1-score   support

          0       0.93      0.96      0.95        55
          1       0.95      0.90      0.92        40

avg / total       0.94      0.94      0.94        95


-------------------------------------------------
Publisher: walesonline

            pred neg  pred pos
actual neg        93         2
actual pos         5        98
             precision    recall  f1-score   support

          0       0.95      0.98      0.96        95
          1       0.98      0.95      0.97       103

avg / total       0.97      0.96      0.96       198


-------------------------------------------------
Publisher: yorkshirepost

            pred neg  pred pos
actual neg        39         0
actual pos         1        38
             precision    recall  f1-score   support

          0       0.97      1.00      0.99        39
          1       1.00      0.97

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [15]:
stds = pub_feature_set.groupby("publisher")[["lat", "lng"]].std()

In [16]:
stds.loc[large_publishers]

Unnamed: 0_level_0,lat,lng
publisher,Unnamed: 1_level_1,Unnamed: 2_level_1
belfastlive,0.801781,1.394171
birminghammail,0.395649,0.514844
blackpoolgazette,0.816504,0.495305
bournemouthecho,0.355959,0.285121
chroniclelive,0.734752,0.375422
coventrytelegraph,0.613991,0.565776
dailyrecord,1.239655,1.10484
devonlive,0.602221,0.684825
eastangliandailytimes,0.195936,0.598938
easterndailypress,0.289984,0.712664


In [17]:
(sample["Y"] == 0).sum()

129

In [18]:
(test["Y"] == 1).sum()

39

In [19]:
sample.shape[0]

258