# Using SKLL to Test Models/Generate Predictions

In [1]:
import skll

In [2]:
# Let's find out where we are, first, and then see what models we have
# (the cells below derive the project root, and its "models"
# directory, from this working directory)
from os import getcwd, listdir
getcwd()

'/home/mulhollandm2/reviews_project/reviewer_experience_prediction/ipython_notebooks'

In [3]:
# List the contents of the project root, i.e., the parent of the
# directory this notebook is running in
from os.path import dirname, join
project_root = dirname(getcwd())
listdir(project_root)

['apln_paper',
 'working',
 'results',
 'conda_requirements.txt',
 'models',
 '.gitignore',
 'arff_files_original_values',
 'data',
 'reports',
 'config',
 'logs',
 'predictions',
 '.git',
 '.ipynb_checkpoints',
 'README.md',
 'ipython_notebooks',
 'src',
 'util',
 'arff_files_collapsed_values',
 'Makefile']

In [4]:
# There is a "models" directory that contains model files that we can
# try to open with SKLL and use here
# Now let's see what models we have (because models consist of many
# files, let's just get the ones with ".model" as a suffix)
from os.path import join
# A plain loop is used instead of a list comprehension: print()
# returns None, so a comprehension would also echo a list of Nones as
# the cell's output
models_dir = join(dirname(getcwd()), 'models')
for model_file in listdir(models_dir):
    if model_file.endswith('.model'):
        print(model_file)

Arma_3_first500.train_Arma_3_first500_RescaledSVR.model
Football_Manager_2015.train_Football_Manager_2015_RescaledSVR.model
Football_Manager_2015.RSVRquadr.train_Football_Manager_2015_RescaledSVR.model


[None, None, None]

In [5]:
# Load one of the saved models into a SKLL Learner; the
# Arma_3_first500.train_Arma_3_first500_RescaledSVR model is picked
# just for fun
arma_model_path = join(dirname(getcwd()),
                       'models',
                       'Arma_3_first500.train_Arma_3_first500_RescaledSVR.model')
l = skll.Learner.from_file(arma_model_path)

In [6]:
# According to the documentation for skll.Learner objects, they have
# a 'predict' method that requires an argument that is an object of
# type FeatureSet, so we'll try to do that next
l.predict?

In [7]:
# We are going to need to get some example data at some point,
# so let's just do it right now
# Connect to the MongoDB server on localhost and grab the "reviews"
# collection of the "reviews_project" database
import pymongo
connection_string = 'mongodb://localhost:27017'
connection = pymongo.MongoClient(connection_string)
db = connection['reviews_project']
reviewdb = db['reviews']
# NOTE(review): sets the "w" entry of the collection's write concern
# to 0; the cells visible in this notebook only read from the
# collection, so this looks unnecessary -- also confirm that the
# installed pymongo version supports item assignment on write_concern
reviewdb.write_concern['w'] = 0

In [8]:
# Fetch a single review from the Football_Manager_2015 test partition
# and print the raw document
test_review_query = {'game': 'Football_Manager_2015',
                     'partition': 'test'}
r1 = reviewdb.find_one(test_review_query)
print(r1)

{'_id': ObjectId('5554f294c134cf3ebe2f5009'), 'review': 'LOVING THE BETA CANT WAIT FOR FULL GAME ON THE 7TH', 'appid': '295270', 'hours_bin': 5, 'hours': 440.0, 'partition': 'test', 'game': 'Football_Manager_2015'}


In [9]:
# So we have an example test review and now we can work on
# getting it into an object that can be passed to the prediction
# function
# But, first we will need to calculate all the features
import sys
# Put the project root (the parent of the directory this notebook runs
# in) on sys.path rather than a hard-coded absolute path, so that the
# import below also works from other checkouts of the project; the
# membership test keeps re-runs of this cell from appending duplicates
project_root = dirname(getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)
from src.feature_extraction import Review, extract_features_from_review

In [10]:
# Prints out the documentation for this class that is part of our
# code
r1_review_obj = Review?

In [11]:
from spacy.en import English
spaCy_nlp = English()
# The hours played value is passed in its binned (i.e., collapsed)
# form, converted to a float
r1_hours = float(r1['hours_bin'])
r1_review_obj = Review(r1['review'], r1_hours, r1['game'], r1['appid'],
                       spaCy_nlp)

In [12]:
# Let's take a look at the object
print(r1_review_obj.length)

6


In [13]:
# The attributes (variables or methods) found on a Review object,
# i.e., every name reported by dir() that does not start with a
# double underscore
list(filter(lambda name: not name.startswith('__'), dir(r1_review_obj)))

['appid',
 'get_entities_from_spaCy',
 'get_token_features_from_spaCy',
 'hours_played',
 'length',
 'lower',
 'norm',
 'normalize',
 'orig',
 'spaCy_annotations',
 'spaCy_sents',
 'tags',
 'tokens']

In [14]:
print(r1_review_obj.tokens)

[['loving', 'the', 'beta', 'can', 'not', 'wait', 'for', 'full', 'game', 'on', 'the', '7th']]


In [92]:
# With the processed representations of the review text, now we
# can proceed to extract features
# Let's see the documentation first
r1_features = extract_features_from_review?

In [93]:
# NOTE(review): this binds the function object itself (it is not
# called) to r1_features; the name is re-assigned with the function's
# return value in the following cell, so this cell looks like a
# leftover that can be dropped
r1_features = extract_features_from_review

In [16]:
r1_features = extract_features_from_review(r1_review_obj)
r1_features

{' 7': 1,
 ' 7T': 1,
 ' 7TH': 1,
 ' B': 1,
 ' BE': 1,
 ' BET': 1,
 ' BETA': 1,
 ' C': 1,
 ' CA': 1,
 ' CAN': 1,
 ' CANT': 1,
 ' F': 2,
 ' FO': 1,
 ' FOR': 1,
 ' FOR ': 1,
 ' FU': 1,
 ' FUL': 1,
 ' FULL': 1,
 ' G': 1,
 ' GA': 1,
 ' GAM': 1,
 ' GAME': 1,
 ' O': 1,
 ' ON': 1,
 ' ON ': 1,
 ' ON T': 1,
 ' T': 2,
 ' TH': 2,
 ' THE': 2,
 ' THE ': 2,
 ' W': 1,
 ' WA': 1,
 ' WAI': 1,
 ' WAIT': 1,
 '6': 1,
 '7T': 1,
 '7TH': 1,
 '7th': 1,
 '7th:the': 1,
 'A ': 1,
 'A C': 1,
 'A CA': 1,
 'A CAN': 1,
 'AI': 1,
 'AIT': 1,
 'AIT ': 1,
 'AIT F': 1,
 'AM': 1,
 'AME': 1,
 'AME ': 1,
 'AME O': 1,
 'AN': 1,
 'ANT': 1,
 'ANT ': 1,
 'ANT W': 1,
 'BE': 1,
 'BET': 1,
 'BETA': 1,
 'BETA ': 1,
 'CA': 1,
 'CAN': 1,
 'CANT': 1,
 'CANT ': 1,
 'E ': 3,
 'E 7': 1,
 'E 7T': 1,
 'E 7TH': 1,
 'E B': 1,
 'E BE': 1,
 'E BET': 1,
 'E O': 1,
 'E ON': 1,
 'E ON ': 1,
 'ET': 1,
 'ETA': 1,
 'ETA ': 1,
 'ETA C': 1,
 'FO': 1,
 'FOR': 1,
 'FOR ': 1,
 'FOR F': 1,
 'FU': 1,
 'FUL': 1,
 'FULL': 1,
 'FULL ': 1,
 'G ': 1,
 'G T': 1,


In [17]:
print(len(r1_features))

210


In [18]:
# As you can see, even for a short review, there are 210
# features present
# Now, how do we put this in an object that we can pass to the
# prediction function?

In [19]:
# For one thing, I just want to re-check the version of SKLL
# being used
print(skll.version.VERSION)

(1, 0, 1)


In [20]:
# Alright, great, we are using version 1.0.1, as I thought

In [21]:
from skll.data.featureset import FeatureSet
import numpy as np

In [22]:
# Wrap the single review in a FeatureSet: one ID (the stringified
# "_id"), one label (the binned hours value) and one feature dictionary
r1_ids = np.array([str(r1['_id'])], dtype=np.chararray)
r1_labels = np.array([float(r1['hours_bin'])], dtype=np.float32)
r1_fs = FeatureSet('single_review_Football_Manager_2015',
                   r1_ids,
                   r1_labels,
                   [r1_features])

In [23]:
# Let's see what r1_fs is
r1_fs.features

<1x210 sparse matrix of type '<class 'numpy.float64'>'
	with 210 stored elements in Compressed Sparse Row format>

In [24]:
r1_fs.ids

array(['5554f294c134cf3ebe2f5009'], dtype=object)

In [25]:
r1_fs.name

'single_review_Football_Manager_2015'

In [26]:
# Ok, let's try to predict the label with our learner
l.predict(r1_fs)



array([ 1.91173608])

In [27]:
# How does the prediction compare with the actual value? Let's
# see...
r1_review_obj.hours_played

5.0

In [28]:
# So, it is off by a few. Remember, though, that the model was trained on
# a 5-point scale while the game's test data hours played values are on a
# 10-point scale, so the difference isn't quite as drastic as it might
# initially appear, but it is quite different

In [46]:
# Let's find a group of 100 test set reviews
# find() returns a cursor, which can be iterated over only once, so
# the documents are read into a list right away; the later cells that
# call list(_100rs) then keep getting the same reviews instead of
# nothing after the first pass
# (skip is applied before limit whichever of the two is chained first)
_100rs = list(reviewdb.find({'game': 'Football_Manager_2015',
                             'partition': 'test'}).skip(1).limit(100))

In [30]:
# Here's an example review from the database
list(_100rs)[61]

{'_id': ObjectId('5554f294c134cf3ebe2f5047'),
 'appid': '295270',
 'game': 'Football_Manager_2015',
 'hours': 944.6,
 'hours_bin': 10,
 'partition': 'test',
 'review': "I've bought every Football manager game after the big split and a fair few champ man ones before that. Have to say I'm slightly disappointed by this years offering. Very little of interest has been added. Sure more conversations to be had but the quickly boil down to the same six or seven answers your're always giving. The management style is fun but quickly seems pointless and gaining the licenses if little more that a tick box exercise. The engine does look better and although far from perfect the board interaction is more detailed but with the bugs still there. What has to be the most annoying part for me though is the amount of time the game wants you to do basic HR work. From the exciting world or contact negotiations to micro managing every action your apparently half witted staff carry out is long winded. It's ma

In [34]:
def _review_from_doc(doc):
    """Build a Review object from one review document, passing the
    binned hours value as a float."""
    return Review(doc['review'],
                  float(doc['hours_bin']),
                  doc['game'],
                  doc['appid'],
                  spaCy_nlp)

_100rs_review_objs = [_review_from_doc(doc) for doc in list(_100rs)]

In [35]:
# We can take a look at an example review before feature extraction
sample_r = _100rs_review_objs[61]

In [36]:
sample_r.norm

"i've bought every football manager game after the big split and a fair few champ man ones before that. have to say i'm slightly disappointed by this years offering. very little of interest has been added. sure more conversations to be had but the quickly boil down to the same six or seven answers your're always giving. the management style is fun but quickly seems pointless and gaining the licenses if little more that a tick box exercise. the engine does look better and although far from perfect the board interaction is more detailed but with the bugs still there. what has to be the most annoying part for me though is the amount of time the game wants you to do basic hr work. from the exciting world or contact negotiations to micro managing every action your apparently half witted staff carry out is long winded. it's made even more annoying by the fact your backroom staff rarely follow you and often the first few weeks at a new club (admittedly usually lower league for me) is spent wa

In [37]:
# Run feature extraction on every Review object, keeping their order
_100rs_features = list(map(extract_features_from_review,
                           _100rs_review_objs))

In [38]:
sample_r_features = _100rs_features[61]
sample_r_features

{'t w': 2,
 'or m': 2,
 'who is': 1,
 '...': 13,
 'rare': 2,
 'improve much': 1,
 'not solve': 1,
 'all black': 1,
 'liverpool': 1,
 'more effort': 1,
 'edition haha': 1,
 'ars': 1,
 '0/': 1,
 'for osx': 1,
 'has no': 2,
 'loving the': 1,
 'since i': 1,
 'choose to': 1,
 'ke': 2,
 'memorable': 1,
 'w ch': 1,
 'rarely': 2,
 'aspects at': 1,
 'sam': 2,
 'old way': 1,
 've fo': 1,
 'been great': 1,
 'a bit.......but': 1,
 'their game': 1,
 '10 .': 2,
 'the s': 2,
 'ents ': 1,
 '14 this': 1,
 'should put': 1,
 ' sa': 3,
 'dissapointed': 1,
 'gine': 1,
 'int': 8,
 'yosemite )': 1,
 'pa': 2,
 'my defenders': 1,
 'have champman': 1,
 'ven a': 1,
 'their power': 1,
 'corner kick': 1,
 '-player interaction': 1,
 'r me)': 1,
 'community that': 1,
 'leauge and': 1,
 'be ignored': 1,
 'corners ,': 1,
 'ount': 1,
 'fm05': 1,
 'on their': 2,
 't mo': 1,
 ' vot': 1,
 'ure': 3,
 'manager community': 1,
 'kroom': 2,
 'in exhaustion': 1,
 's sp': 1,
 'last year': 4,
 'is it': 2,
 ' W': 1,
 'though': 4,


In [40]:
# Ok, now we can make the FeatureSet objects usable in SKLL
# One single-example FeatureSet is built per review; enumerate() takes
# care of the 1-based number used in each FeatureSet's name
_100rs_docs = list(_100rs)
# zip() stops at the shorter input without complaint, so make sure
# every feature dictionary has its review document before pairing
assert len(_100rs_docs) == len(_100rs_features), \
    'Number of review documents and feature dictionaries differ'
_100rs_fs = []
for i, (r, r_features) in enumerate(zip(_100rs_docs, _100rs_features),
                                    start=1):
    _100rs_fs.append(FeatureSet('Football_Manager_2015_{}'.format(i),
                                np.array([str(r['_id'])],
                                         dtype=np.chararray),
                                np.array([float(r['hours_bin'])],
                                         dtype=np.float32),
                                [r_features]))

In [41]:
# Sample FeatureSet
_100rs_fs[61]

{'name': 'Football_Manager_2015_62', 'labels': array([ 10.], dtype=float32), 'vectorizer': DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True), 'ids': array(['5554f294c134cf3ebe2f5047'], dtype=object), 'features': <1x10640 sparse matrix of type '<class 'numpy.float64'>'
	with 10640 stored elements in Compressed Sparse Row format>}

In [42]:
# Now, let's try some predictions: each single-review FeatureSet is
# passed to the learner's predict method on its own
_100rs_preds = list(map(l.predict, _100rs_fs))



In [43]:
# Ok, we have predictions now!
_100rs_preds

[array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 3.29662265]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 2.52603847]),
 array([ 2.11593443]),
 array([ 2.8339743]),
 array([ 1.4825858]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.86768437]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.79209357]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.]),
 array([ 1.39128617]),
 array([

In [51]:
# Let's compare the predictions with the binned hours values stored
# with the reviews
import pandas as pd
predicted_values = [p[0] for p in _100rs_preds]
actual_hours_bins = [r['hours_bin'] for r in list(_100rs)]
preds_df = pd.DataFrame(dict(preds=predicted_values,
                             hours_played=actual_hours_bins))
preds_df

Unnamed: 0,hours_played,preds
0,1,1.000000
1,1,1.000000
2,2,1.000000
3,4,3.296623
4,1,1.000000
5,1,1.000000
6,1,1.000000
7,2,1.000000
8,3,1.000000
9,1,1.000000


In [52]:
# Of course, this comparison doesn't really make sense because the
# scales are different and the model was made with a data-set from
# a completely different game!