In [43]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV,ShuffleSplit

from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [44]:
data= pd.read_csv('data/news_dataset_v3_FE.csv')
train_x= pd.read_pickle('data/train.pickle')
train_y= pd.read_pickle('data/train_y.pickle')

val_x= pd.read_pickle('data/test.pickle')
val_y= pd.read_pickle('data/test_y.pickle')

##  Predicting Using Default parameters of Logistics Regression :

In [68]:
lr= LogisticRegression()

In [69]:
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [70]:
lr.fit(train_x,train_y)

  return f(**kwargs)


LogisticRegression()

In [71]:
answer1= lr.predict(train_x)
accuracy_score(answer1,train_y)*100

97.67318878900053

In [72]:
answer2= lr.predict(val_x)
accuracy_score(answer2,val_y)*100

94.61077844311377

In [10]:
print(classification_report(answer1,train_y))

               precision    recall  f1-score   support

     business       0.96      0.98      0.97       422
entertainment       0.95      0.99      0.97       323
     politics       0.98      0.95      0.97       355
        sport       1.00      0.99      0.99       443
         tech       0.99      0.97      0.98       348

     accuracy                           0.98      1891
    macro avg       0.98      0.98      0.98      1891
 weighted avg       0.98      0.98      0.98      1891



In [50]:
print(classification_report(answer2,val_y))

               precision    recall  f1-score   support

     business       0.98      0.89      0.93        89
entertainment       0.94      0.94      0.94        49
     politics       0.89      0.98      0.93        65
        sport       0.99      1.00      0.99        71
         tech       0.93      0.93      0.93        60

     accuracy                           0.95       334
    macro avg       0.94      0.95      0.95       334
 weighted avg       0.95      0.95      0.95       334



## Prediction by tuning some hyperparameters:

In [73]:
C = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)]
multi_class=['multinomial']
solver = ['newton-cg', 'sag', 'saga', 'lbfgs']
random_grid= {'C':C,
             'multi_class':multi_class,
             'solver':solver
             }
lr= LogisticRegression(random_state=9)

In [74]:
random_search= RandomizedSearchCV(
               estimator= lr,
               param_distributions= random_grid,
               scoring='accuracy')

In [75]:
random_search.fit(train_x,np.ravel(train_y))

RandomizedSearchCV(estimator=LogisticRegression(random_state=9),
                   param_distributions={'C': [0.1, 0.2, 0.30000000000000004,
                                              0.4, 0.5, 0.6, 0.7000000000000001,
                                              0.8, 0.9, 1.0],
                                        'multi_class': ['multinomial'],
                                        'solver': ['newton-cg', 'sag', 'saga',
                                                   'lbfgs']},
                   scoring='accuracy')

In [76]:
random_search.best_params_

{'solver': 'saga', 'multi_class': 'multinomial', 'C': 1.0}

In [77]:
random_search.best_score_

0.9523991009479136

In [78]:
model1= random_search.best_estimator_

answer1= model1.predict(train_x)
print('Training Score :',accuracy_score(answer1,train_y))

answer2= model1.predict(val_x)
print('Testing Score :',accuracy_score(answer2,val_y))

Training Score : 0.9767318878900053
Testing Score : 0.9461077844311377


##  More exhaustive search centered in those values:

In [79]:
C = [float(x) for x in np.linspace(start = 0.5, stop = 1.5, num = 10)]
multi_class=['multinomial']
solver = ['saga']
param_grid= {'C':C,
             'multi_class':multi_class,
             'solver':solver
             }
lr= LogisticRegression(random_state=9)

grid_search= GridSearchCV(
                            estimator= lr,
                            param_grid= param_grid,
                            scoring='accuracy',
                          )
grid_search.fit(train_x,np.ravel(train_y))

GridSearchCV(estimator=LogisticRegression(random_state=9),
             param_grid={'C': [0.5, 0.6111111111111112, 0.7222222222222222,
                               0.8333333333333333, 0.9444444444444444,
                               1.0555555555555556, 1.1666666666666665,
                               1.2777777777777777, 1.3888888888888888, 1.5],
                         'multi_class': ['multinomial'], 'solver': ['saga']},
             scoring='accuracy')

In [80]:
grid_search.best_score_

0.9523977049043012

In [81]:
grid_search.best_params_

{'C': 1.3888888888888888, 'multi_class': 'multinomial', 'solver': 'saga'}

In [82]:
model2= grid_search.best_estimator_

In [88]:
answer1= model2.predict(train_x)
answer2= model2.predict(val_x)

print(' Training error : ',accuracy_score(train_y,answer1)*100)
print(' Testing error : ',accuracy_score(val_y,answer2)*100)

 Training error :  98.09624537281861
 Testing error :  94.61077844311377


In [89]:
print(classification_report(answer1,train_y))

               precision    recall  f1-score   support

     business       0.97      0.98      0.97       422
entertainment       0.97      0.99      0.98       328
     politics       0.98      0.96      0.97       352
        sport       1.00      0.99      0.99       442
         tech       0.99      0.97      0.98       347

     accuracy                           0.98      1891
    macro avg       0.98      0.98      0.98      1891
 weighted avg       0.98      0.98      0.98      1891



In [90]:
print(classification_report(answer2,val_y))

               precision    recall  f1-score   support

     business       0.98      0.89      0.93        89
entertainment       0.94      0.94      0.94        49
     politics       0.89      0.98      0.93        65
        sport       0.99      1.00      0.99        71
         tech       0.93      0.93      0.93        60

     accuracy                           0.95       334
    macro avg       0.94      0.95      0.95       334
 weighted avg       0.95      0.95      0.95       334



In [98]:
d = {
     'Model': 'Logistic Regression',
     'Training Set Accuracy': accuracy_score(train_y,answer1)*100,
     'Test Set Accuracy': accuracy_score(val_y,answer2)*100
}

df_models_lrc = pd.DataFrame(d, index=[0])
df_models_lrc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Logistic Regression,98.096245,94.610778


In [102]:
import pickle
with open('Models/best_lrc.pickle', 'wb') as output:
    pickle.dump(model2, output)
    
with open('Models/df_models_lrc.pickle', 'wb') as output:
    pickle.dump(df_models_lrc, output)