In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from py.utils import load_data
import pickle

In [2]:
# Shared experiment settings used by the model-fitting cells below.

# Accumulates results: (model label, dataset head) -> array of per-fold CV scores.
performances = dict()

# Number of folds, forwarded as `cv=` to cross_val_score.
n_cv = 5

# Passed as the second argument to load_data().
directory = '../data/'

# Dataset identifiers; each one is passed as the first argument to load_data().
heads = [
    'l30_r15',
    'l10_r10',
    'l5_r5',
]

## Logistic Regression + L2

In [3]:
# Sweep L2-regularised logistic regression over several values of C for every
# dataset: cross-validate, record the fold scores in `performances`, then fit
# on the full data and pickle the fitted model under ../models/.
for head in heads:
    print('## dataset = %s ##' % head)
    x, y, x_words, vocabs = load_data(head, directory)
    # With its default arguments normalize() rescales each row to unit L2 norm.
    x = normalize(x)
    for cost in [4.0, 1.0, 0.25, 100.0, 1000.0, 100000.0]:
        # Pin the solver explicitly. scikit-learn changed the default from
        # 'liblinear' to 'lbfgs' in 0.22; relying on the default makes a re-run
        # on a newer version optimise a slightly different objective (and
        # struggle to converge for very large C) than the run that produced
        # the saved scores.
        classifier = LogisticRegression(penalty='l2', C=cost, solver='liblinear')
        scores = cross_val_score(classifier, x, y, cv=n_cv)
        print('\ncost= %.2f: %s' % (cost, scores))

        performances[('Logistic + L2 (C=%.2f) norm' % cost, head)] = scores
        # cross_val_score fits clones of the estimator, so `classifier` itself
        # is still unfitted here; train it on all the data before saving.
        classifier.fit(x, y)
        model_name = 'Logistic + L2 (C=%.2f) norm ' % cost + head
        with open('../models/%s.pkl' % model_name, 'wb') as f:
            pickle.dump(classifier, f)

    print('-'*80)

## dataset = l30_r15 ##
x shape = (15166, 2617)
y shape = (15166,)
# features = 2617
# L words = 15166





cost= 4.00: [ 0.99241925  0.98944939  0.98944939  0.9901088   0.99109792]

cost= 1.00: [ 0.99143045  0.98911968  0.98911968  0.99043851  0.99076822]

cost= 0.25: [ 0.99110086  0.98747115  0.98813056  0.98878998  0.98944939]

cost= 100.00: [ 0.99208965  0.98911968  0.98780086  0.99076822  0.9901088 ]

cost= 1000.00: [ 0.99110086  0.98878998  0.98813056  0.99142763  0.98944939]

cost= 100000.00: [ 0.98681608  0.98351467  0.98780086  0.98944939  0.9848335 ]
--------------------------------------------------------------------------------
## dataset = l10_r10 ##
x shape = (31797, 3297)
y shape = (31797,)
# features = 3297
# L words = 31797





cost= 4.00: [ 0.98632075  0.98726415  0.98741941  0.98584683  0.98584683]

cost= 1.00: [ 0.98569182  0.98663522  0.98694763  0.98600409  0.98506054]

cost= 0.25: [ 0.9836478   0.98569182  0.98584683  0.98458877  0.98348797]

cost= 100.00: [ 0.98726415  0.98836478  0.98773392  0.98663312  0.98694763]

cost= 1000.00: [ 0.98553459  0.98773585  0.98616135  0.98553232  0.98616135]

cost= 100000.00: [ 0.98490566  0.98616352  0.98600409  0.98380248  0.98270168]
--------------------------------------------------------------------------------
## dataset = l5_r5 ##
x shape = (50764, 4995)
y shape = (50764,)
# features = 4995
# L words = 50764





cost= 4.00: [ 0.98384714  0.98325618  0.9844381   0.98355166  0.98256501]

cost= 1.00: [ 0.98236974  0.98236974  0.98404412  0.98236974  0.98118597]

cost= 0.25: [ 0.98049837  0.98030139  0.98187728  0.98089235  0.9784279 ]

cost= 100.00: [ 0.98532453  0.98266522  0.98384714  0.98266522  0.98335303]

cost= 1000.00: [ 0.98512755  0.98236974  0.98355166  0.98197577  0.98355004]

cost= 100000.00: [ 0.98335467  0.98118783  0.98355166  0.98168029  0.98246651]
--------------------------------------------------------------------------------


## Logistic Regression + L1

In [4]:
# Sweep L1-regularised logistic regression over several values of C for every
# dataset: cross-validate, record the fold scores in `performances`, fit on the
# full data, pickle the fitted model under ../models/, and checkpoint the
# accumulated `performances` dict to disk.
# (`pickle` is already imported in the notebook's import cell.)
for head in heads:
    print('## dataset = %s ##' % head)
    x, y, x_words, vocabs = load_data(head, directory)
    # With its default arguments normalize() rescales each row to unit L2 norm.
    x = normalize(x)
    for cost in [4, 1, 0.25]:
        # The L1 penalty needs a solver that supports it. Since scikit-learn
        # 0.22 the default solver is 'lbfgs', which rejects penalty='l1' with a
        # ValueError, so select 'liblinear' explicitly (it was the default when
        # this notebook was first run). `n_jobs` is dropped because liblinear
        # ignores it and newer versions only emit a warning about it.
        classifier = LogisticRegression(penalty='l1', C=cost, solver='liblinear')
        scores = cross_val_score(classifier, x, y, cv=n_cv)
        print('\ncost=%.2f: %s' % (cost, scores))

        performances[('Logistic + L1 (C=%.2f) norm' % cost, head)] = scores
        model_name = 'Logistic + L1 (C=%.2f) norm ' % cost + head
        # cross_val_score fits clones of the estimator, so `classifier` itself
        # is still unfitted here; train it on all the data before saving.
        classifier.fit(x, y)
        with open('../models/%s.pkl' % model_name, 'wb') as f:
            pickle.dump(classifier, f)
        # Re-written after every model, so the file on disk always holds the
        # scores gathered so far even if the run is interrupted.
        with open('performance_logistic_regression_norm.pkl', 'wb') as f:
            pickle.dump(performances, f)

    print('-'*80)

## dataset = l30_r15 ##
x shape = (15166, 2617)
y shape = (15166,)
# features = 2617
# L words = 15166





cost=4.00: [ 0.99176005  0.98911968  0.98780086  0.9897791   0.99043851]

cost=1.00: [ 0.99077126  0.98813056  0.98681174  0.98944939  0.99076822]

cost=0.25: [ 0.98912327  0.98582262  0.98648203  0.98911968  0.99076822]
--------------------------------------------------------------------------------
## dataset = l10_r10 ##
x shape = (31797, 3297)
y shape = (31797,)
# features = 3297
# L words = 31797





cost=4.00: [ 0.98647799  0.98663522  0.98663312  0.98694763  0.98663312]

cost=1.00: [ 0.98569182  0.98632075  0.9863186   0.98584683  0.98663312]

cost=0.25: [ 0.98459119  0.98537736  0.98506054  0.98490329  0.984117  ]
--------------------------------------------------------------------------------
## dataset = l5_r5 ##
x shape = (50764, 4995)
y shape = (50764,)
# features = 4995
# L words = 50764





cost=4.00: [ 0.98404412  0.98355166  0.98374865  0.98345317  0.982171  ]

cost=1.00: [ 0.98424111  0.98246824  0.98374865  0.98394563  0.98148148]

cost=0.25: [ 0.98325618  0.98049837  0.9829607   0.98325618  0.97970843]
--------------------------------------------------------------------------------
