In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from py.utils import load_data
import pickle

In [2]:
# Dataset identifiers; each one is passed to load_data() together with
# `directory` by the experiment cells below.
heads = [
    'l30_r15',
    'l10_r10',
    'l5_r5',
]
directory = '../data/'
# Value handed to cross_val_score(cv=...) in the experiment cells.
n_cv = 5
# Filled in by the experiment cells: (model label, dataset head) -> CV scores.
performances = dict()

## Logistic Regression + L2

In [3]:
# Sweep the C parameter of L2-penalized logistic regression over every dataset.
# For each (dataset, C) pair: cross-validate, record the fold scores in
# `performances`, then refit on the full data and pickle the fitted model.
l2_costs = [4.0, 1.0, 0.25, 100.0, 1000.0, 100000.0]

for head in heads:
    print('## dataset = %s ##' % head)
    x, y, x_words, vocabs = load_data(head, directory)
    x = normalize(x)

    for cost in l2_costs:
        classifier = LogisticRegression(penalty='l2', C=cost)
        scores = cross_val_score(classifier, x, y, cv=n_cv)
        print('\ncost= %.2f: %s' % (cost, scores))

        # Keyed by (model label, dataset head) so runs can be compared later.
        performances[('Logistic + L2 (C=%.2f) norm' % cost, head)] = scores

        # cross_val_score fits clones of the estimator, so `classifier` itself
        # is still unfitted here; fit it on the full data before saving it.
        classifier.fit(x, y)
        model_name = 'Logistic + L2 (C=%.2f) norm ' % cost + head
        model_path = '../models/%s.pkl' % model_name
        with open(model_path, 'wb') as f:
            pickle.dump(classifier, f)

    print('-' * 80)

## dataset = l30_r15 ##
x shape = (15105, 2770)
y shape = (15105,)
# features = 2770
# L words = 15105





cost= 4.00: [ 0.99636003  0.99668984  0.99735187  0.99602781  0.99569536]

cost= 1.00: [ 0.99636003  0.99635882  0.99668984  0.99569679  0.99403974]

cost= 0.25: [ 0.99569821  0.99569679  0.99503476  0.99404171  0.99337748]

cost= 100.00: [ 0.99602912  0.99635882  0.99702085  0.99635882  0.99536424]

cost= 1000.00: [ 0.99636003  0.99635882  0.99702085  0.99668984  0.99536424]

cost= 100000.00: [ 0.99470549  0.99503476  0.99470374  0.99470374  0.99337748]
--------------------------------------------------------------------------------
## dataset = l10_r10 ##
x shape = (31544, 3515)
y shape = (31544,)
# features = 3515
# L words = 31544





cost= 4.00: [ 0.9958789   0.99476938  0.99397686  0.99540339  0.99524413]

cost= 1.00: [ 0.9957204   0.99445237  0.99365985  0.99476938  0.99429296]

cost= 0.25: [ 0.99461087  0.99334284  0.99255032  0.99381835  0.99318326]

cost= 100.00: [ 0.99698843  0.99524489  0.99429387  0.9955619   0.99508561]

cost= 1000.00: [ 0.99730544  0.99524489  0.99429387  0.99508638  0.99508561]

cost= 100000.00: [ 0.99524489  0.99492788  0.99334284  0.99334284  0.99492708]
--------------------------------------------------------------------------------
## dataset = l5_r5 ##
x shape = (50229, 5361)
y shape = (50229,)
# features = 5361
# L words = 50229





cost= 4.00: [ 0.99462473  0.99343022  0.99452518  0.99402747  0.99442509]

cost= 1.00: [ 0.99402747  0.99293251  0.99362931  0.99392793  0.99352912]

cost= 0.25: [ 0.99323114  0.99104121  0.99263388  0.99253434  0.99203584]

cost= 100.00: [ 0.99512244  0.99303205  0.99482381  0.99442564  0.99362867]

cost= 1000.00: [ 0.99532152  0.9924348   0.99452518  0.99502289  0.99402688]

cost= 100000.00: [ 0.99482381  0.99263388  0.99372885  0.99412702  0.99352912]
--------------------------------------------------------------------------------


## Logistic Regression + L1

In [4]:
# Sweep the C parameter of L1-penalized logistic regression over every dataset.
# For each (dataset, C) pair: cross-validate, record the fold scores in
# `performances`, refit on the full data, and pickle the fitted model.
# `pickle` comes from the import cell at the top of the notebook.
for head in heads:
    print('## dataset = %s ##' % head)
    x, y, x_words, vocabs = load_data(head, directory)
    x = normalize(x)
    for cost in [4, 1, 0.25]:
        # Name the solver explicitly: the L1 penalty is only supported by some
        # solvers, and the default solver of recent scikit-learn releases
        # (lbfgs) rejects it with a ValueError. liblinear supports L1.
        classifier = LogisticRegression(penalty='l1', C=cost, solver='liblinear')
        # liblinear ignores the estimator-level n_jobs, so the parallelism is
        # requested here instead, where it spreads the CV folds across workers.
        scores = cross_val_score(classifier, x, y, cv=n_cv, n_jobs=4)
        print('\ncost=%.2f: %s' % (cost, scores))

        # Keyed by (model label, dataset head) so runs can be compared later.
        performances[('Logistic + L1 (C=%.2f) norm' % cost, head)] = scores
        model_name = 'Logistic + L1 (C=%.2f) norm ' % cost + head
        # cross_val_score fits clones of the estimator, so `classifier` itself
        # is still unfitted here; fit it on the full data before saving it.
        classifier.fit(x, y)
        with open('../models/%s.pkl' % model_name, 'wb') as f:
            pickle.dump(classifier, f)
        # Rewritten after every fit, so an interrupted run still leaves the
        # scores gathered so far on disk. The dict also holds whatever earlier
        # cells stored in `performances` during this kernel session.
        with open('performance_logistic_regression_norm.pkl', 'wb') as f:
            pickle.dump(performances, f)

    print('-'*80)

## dataset = l30_r15 ##
x shape = (15105, 2770)
y shape = (15105,)
# features = 2770
# L words = 15105





cost=4.00: [ 0.99636003  0.99702085  0.99702085  0.99569679  0.99271523]

cost=1.00: [ 0.99536731  0.99602781  0.99635882  0.99536577  0.99172185]

cost=0.25: [ 0.99470549  0.99470374  0.99536577  0.99404171  0.99072848]
--------------------------------------------------------------------------------
## dataset = l10_r10 ##
x shape = (31544, 3515)
y shape = (31544,)
# features = 3515
# L words = 31544





cost=4.00: [ 0.99667142  0.99492788  0.99492788  0.99635441  0.99508561]

cost=1.00: [ 0.99524489  0.99492788  0.99381835  0.99508638  0.99492708]

cost=0.25: [ 0.99365985  0.99365985  0.99096529  0.99381835  0.99445149]
--------------------------------------------------------------------------------
## dataset = l5_r5 ##
x shape = (50229, 5361)
y shape = (50229,)
# features = 5361
# L words = 50229





cost=4.00: [ 0.99442564  0.99253434  0.9943261   0.99422656  0.99342957]

cost=1.00: [ 0.99402747  0.99263388  0.99402747  0.99372885  0.99372822]

cost=0.25: [ 0.99283297  0.99114075  0.99283297  0.99263388  0.99283225]
--------------------------------------------------------------------------------
