In [1]:
from sklearn.datasets import load_files
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import numpy as np

In [2]:
dataset = load_files('Dataset/new_descriptions', shuffle=False, encoding='utf-8')

In [3]:
dataset['target_names']

['Bank_Of_America',
 'Bealls',
 'EBS_Minds_IT',
 'ICON_Technologies',
 'Lorhan',
 'Nordstrom']

In [4]:
data = pd.DataFrame({'data': dataset['data'], 'target': dataset['target']})

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.15,
    random_state=42
)

In [6]:
clfs = [
    KNeighborsClassifier(),
    MultinomialNB(),
    DecisionTreeClassifier(max_depth = 2, random_state=0),
    LogisticRegression(random_state=0),
    LinearSVC(random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(random_state=0),
]

In [7]:
vect = CountVectorizer(binary=True)

for clf in clfs:
    print(str(clf.__class__))
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', clf),
    ])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_train)
    print(metrics.classification_report(y_train, y_pred, target_names=dataset['target_names']))

    y_pred = pipeline.predict(X_test)
    print(metrics.classification_report(y_test, y_pred, target_names=dataset['target_names']))

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
                   precision    recall  f1-score   support

  Bank_Of_America       0.50      0.11      0.18        18
           Bealls       0.75      0.07      0.13        41
     EBS_Minds_IT       0.15      0.88      0.26        17
ICON_Technologies       0.63      0.92      0.75        26
           Lorhan       0.00      0.00      0.00        17
        Nordstrom       0.50      0.04      0.07        27

        micro avg       0.31      0.31      0.31       146
        macro avg       0.42      0.34      0.23       146
     weighted avg       0.50      0.31      0.24       146

                   precision    recall  f1-score   support

  Bank_Of_America       0.00      0.00      0.00         1
           Bealls       1.00      0.12      0.22         8
     EBS_Minds_IT       0.25      0.33      0.29         3
ICON_Technologies       0.19      1.00      0.32         4
           Lorhan       0.00      0.00      0.0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [8]:
param_grid = {
    'vect__binary': [True],
    'vect__lowercase': [True],
    'vect__sublinear_tf': [True, False],
    'vect__ngram_range': [(1, 3), (1, 4),(1, 7)],
    'vect__strip_accents': ['ascii'],
    'vect__analyzer': ['word'],
    'vect__min_df': [1, 2, 3],
    'vect__max_df': [1.],
    'clf__multi_class' : ['ovr', 'crammer_singer'],
    'clf__random_state': [0],
    'clf__fit_intercept':[True, False],
    'clf__loss':['hinge', 'squared_hinge'],
    'clf__C':[1.0, 0.1],
}

params_list = list(ParameterGrid(param_grid))

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

results = []
print(len(params_list))

for params in params_list:
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred, average='macro')
    result = {'acc': acc, 'f1': f1}
        
    results.append({
        **result,
        **params,
    })

288


  'precision', 'predicted', average, warn_for)


In [9]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__C,clf__fit_intercept,clf__loss,clf__multi_class,clf__random_state,f1,vect__analyzer,vect__binary,vect__lowercase,vect__max_df,vect__min_df,vect__ngram_range,vect__strip_accents,vect__sublinear_tf
24,0.576923,1.0,True,hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 3)",ascii,True
25,0.576923,1.0,True,hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 3)",ascii,False
26,0.576923,1.0,True,hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 4)",ascii,True
27,0.576923,1.0,True,hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 4)",ascii,False
28,0.576923,1.0,True,hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 7)",ascii,True
29,0.576923,1.0,True,hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 7)",ascii,False
60,0.576923,1.0,True,squared_hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 3)",ascii,True
61,0.576923,1.0,True,squared_hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 3)",ascii,False
62,0.576923,1.0,True,squared_hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 4)",ascii,True
63,0.576923,1.0,True,squared_hinge,crammer_singer,0,0.490842,word,True,True,1.0,2,"(1, 4)",ascii,False
