In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from py.utils import load_data
import pickle

In [2]:
heads = ['l30_r15', 'l10_r10', 'l5_r5']
directory = '../data/'
n_cv = 5
performances = {}

## Logistic Regression  + L2

In [3]:
for head in heads:
    print('## dataset = %s ##' % head)
    x, y, x_words, vocabs = load_data(head, directory)
    x = normalize(x)
    for cost in [4.0, 1.0, 0.25]:
        classifier = LogisticRegression(penalty='l2', C=1)
        scores = cross_val_score(classifier, x, y, cv=n_cv)
        print('\ncost= %.2f: %s' % (cost, scores))

        performances[('Logistic + L2 (C=%.2f) norm' % cost, head)] = scores
        classifier.fit(x, y)
        model_name = 'Logistic + L2 (C=%.2f) norm ' % cost + head
        with open('../models/%s.pkl' % model_name, 'wb') as f:
            pickle.dump(classifier, f)
            
    print('-'*80)

## dataset = l30_r15 ##
x shape = (15715, 4551)
y shape = (15715,)
# features = 4551
# L words = 15715





cost= 4.00: [ 0.9971374   0.99300032  0.99586382  0.99331849  0.99554424]

cost= 1.00: [ 0.9971374   0.99300032  0.99586382  0.99331849  0.99554424]

cost= 0.25: [ 0.9971374   0.99300032  0.99586382  0.99331849  0.99554424]
--------------------------------------------------------------------------------
## dataset = l10_r10 ##
x shape = (31664, 5857)
y shape = (31664,)
# features = 5857
# L words = 31664





cost= 4.00: [ 0.99415759  0.99257856  0.99305227  0.99289436  0.99194567]

cost= 1.00: [ 0.99415759  0.99257856  0.99305227  0.99289436  0.99194567]

cost= 0.25: [ 0.99415759  0.99257856  0.99305227  0.99289436  0.99194567]
--------------------------------------------------------------------------------
## dataset = l5_r5 ##
x shape = (49547, 9307)
y shape = (49547,)
# features = 9307
# L words = 49547





cost= 4.00: [ 0.99192735  0.99122099  0.99172469  0.99152286  0.99051367]

cost= 1.00: [ 0.99192735  0.99122099  0.99172469  0.99152286  0.99051367]

cost= 0.25: [ 0.99192735  0.99122099  0.99172469  0.99152286  0.99051367]
--------------------------------------------------------------------------------


## Logistic Regression + L1

In [4]:
import pickle

for head in heads:    
    print('## dataset = %s ##' % head)
    x, y, x_words, vocabs = load_data(head, directory)
    x = normalize(x)
    for cost in [4, 1, 0.25]:
        classifier = LogisticRegression(penalty='l1', C=cost, n_jobs=4)
        scores = cross_val_score(classifier, x, y, cv=n_cv)
        print('\ncost=%.2f: %s' % (cost, scores))
        
        performances[('Logistic + L1 (C=%.2f) norm' % cost, head)] = scores        
        model_name = 'Logistic + L1 (C=%.2f) norm ' % cost + head
        classifier.fit(x, y)
        with open('../models/%s.pkl' % model_name, 'wb') as f:
            pickle.dump(classifier, f)   
        with open('performance_logistic_regression_norm.pkl', 'wb') as f:
            pickle.dump(performances, f)    
        
    print('-'*80)

## dataset = l30_r15 ##
x shape = (15715, 4551)
y shape = (15715,)
# features = 4551
# L words = 15715





cost=4.00: [ 0.9971374   0.99522749  0.99713649  0.99395482  0.99618078]

cost=1.00: [ 0.99618321  0.99331849  0.99586382  0.99300032  0.9949077 ]

cost=0.25: [ 0.99650127  0.99109131  0.99204582  0.99268215  0.99458943]
--------------------------------------------------------------------------------
## dataset = l10_r10 ##
x shape = (31664, 5857)
y shape = (31664,)
# features = 5857
# L words = 31664





cost=4.00: [ 0.99542081  0.99226275  0.99415759  0.99368388  0.9932091 ]

cost=1.00: [ 0.9946313   0.99099953  0.99352598  0.99273646  0.9921036 ]

cost=0.25: [ 0.99257856  0.98910469  0.99084162  0.99052582  0.99020846]
--------------------------------------------------------------------------------
## dataset = l5_r5 ##
x shape = (49547, 9307)
y shape = (49547,)
# features = 9307
# L words = 49547





cost=4.00: [ 0.99293643  0.99112008  0.9923302   0.99243112  0.99111918]

cost=1.00: [ 0.99192735  0.98990918  0.99132102  0.99182561  0.99051367]

cost=0.25: [ 0.98980827  0.98819374  0.98940357  0.98990816  0.98819255]
--------------------------------------------------------------------------------
