# Classifier Optimization
## Prepare the environment

In [None]:
# Notebook environment setup: plotting backend, shared settings, autoreload,
# logging and numeric display options.

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Execute the `settings` script in this namespace.
# NOTE(review): presumably this is where np, pd, plt and sns (used below
# without a visible import) come from - confirm against settings.
%run settings
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format = 'retina'

# Re-import changed modules automatically before each cell execution.
%reload_ext autoreload
%autoreload 2

import logging
# Configure the root logger's message format with an initial level of INFO.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

# Raise the root level to WARNING; the three calls below then show which
# levels are active (the INFO message is filtered out, the other two are not).
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger().info("Logging INFOS.")
logging.getLogger().warning("Logging WARNINGS.")
logging.getLogger().error("Logging ERRORS.")

# Display floats with three decimals in IPython output and keep numpy from
# switching to scientific notation when printing arrays.
%precision 3
np.set_printoptions(suppress=True)

import pickle
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

## Prepare the data
To create these files, follow the README and the notebook __DataSet__.

In [None]:
# Column holding the raw text and column holding the target class.
text_col = 'tweet'
label = 'class'

# Load the training set and the separate evaluation set.
training_df = pd.read_csv("../resources/training_data.csv")
evaluation_df = pd.read_csv("../resources/evaluation_data.csv")

# Uni- and bigram TF-IDF features on lower-cased text without stop-word
# removal; terms in fewer than 10 documents or in more than 30% of the
# documents are dropped.
tfidf_vect = TfidfVectorizer(
    lowercase=True,
    stop_words=None,
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.3,
)

# NOTE(review): the vectorizer is fitted on the complete training frame, i.e.
# before the hold-out split further down, so the vocabulary and IDF weights
# also see the rows that later become the test set.
X_tfidf = tfidf_vect.fit_transform(training_df[text_col])
X_tfidf.shape

Train Test Split

In [None]:
# Split the TF-IDF matrix and labels into a training part and a stratified
# hold-out part, and record for every row of training_df which part it is in.
X = X_tfidf
y = training_df[label]
# define holdout
test_size = 0.2

if test_size > 0.0:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=43,
    )
    test_index = y_test.index
else:
    # No hold-out requested: train on everything, nothing is marked as test.
    X_train, X_test, y_train, y_test = X, None, y, None
    test_index = []

# Build the marker as a plain list so the assignment is positional; wrapping
# it in a fresh pd.Series would re-align on a 0..n-1 index and mislabel rows
# whenever training_df carries a different index.
training_df['train_test'] = [
    'Test' if in_test else 'Train'
    for in_test in training_df.index.isin(test_index)
]
print("Trainigsmatrix:", X_train.shape)
# X_test is None when test_size is 0, so guard the shape lookup.
print("Testmatrix:    ", X_test.shape if X_test is not None else None)

## Train classifiers

In [None]:
# Fit one instance of every candidate model on the training split. Each
# estimator's fit() returns the estimator itself, so construction and
# fitting are chained.
print(f'Training on column {label}')

# Support vector machine with an RBF kernel.
clfSVC = SVC(kernel='rbf', random_state=912).fit(X_train, y_train)

# Linear support vector machine with a raised iteration cap.
clfSVCLin = LinearSVC(C=1.0, max_iter=10000).fit(X_train, y_train)

# Logistic regression.
clfLogRegr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Gradient-boosted trees.
clfXGB = xgb.XGBClassifier(seed=82).fit(X_train, y_train)

# Multinomial naive Bayes.
clfMulNB = MultinomialNB().fit(X_train, y_train)

print("Done.")

## Evaluation

In [None]:
def evaluate(clf, X_train, y_train, X_test, y_test):
    """Print accuracy figures and plot the test-set confusion matrix of ``clf``.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict``.
    X_train, y_train
        Training features and their true labels.
    X_test, y_test
        Hold-out features and their true labels. ``y_test`` must provide
        ``unique()`` (e.g. a pandas Series).

    Notes
    -----
    The "Overall" accuracy is pooled from the train and test predictions of
    the data that was passed in. It no longer reads the module-level ``X``
    and ``y``, which also saves a third full ``predict`` pass. The plot title
    still uses the module-level ``label``.
    """
    y_test_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train)

    # normalize=False yields the number of correct predictions, so the two
    # splits can be pooled without predicting on the full matrix again.
    n_correct = (accuracy_score(y_train, y_train_pred, normalize=False)
                 + accuracy_score(y_test, y_test_pred, normalize=False))
    overall_accuracy = n_correct / (len(y_train) + len(y_test))

    print(f"Classifier: {clf.__class__}\n")

    print('Accuracy Summary')
    print('================')

    print(f'Test:    {accuracy_score(y_test, y_test_pred)*100:6.2f}%')
    print(f'Train:   {accuracy_score(y_train, y_train_pred)*100:6.2f}%')
    print(f'Overall: {overall_accuracy*100:6.2f}%')

    print("Classification Report")
    print("=====================")
    print(classification_report(y_true=y_test, y_pred=y_test_pred))

    # --- plot confusion matrix ---

    # label names - specifies order in confusion matrix
    label_names = sorted(y_test.unique())

    # scale figure size depending on number of categories
    fsize = len(label_names)

    conf_mat = confusion_matrix(y_test, y_test_pred, labels=label_names)

    _, ax = plt.subplots(figsize=(fsize, fsize))
    # Draw on the axes created above instead of relying on the implicit
    # "current axes".
    sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=label_names, yticklabels=label_names, ax=ax)
    ax.set_ylabel("Actual")
    ax.set_xlabel("Predicted")
    ax.set_title(f"Confusion Matrix for {label}", fontsize=14)

In [None]:
# Report metrics for the RBF-kernel SVC.
print("clfSVC:")
evaluate(clf=clfSVC, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Report metrics for the linear SVC.
print("clfSVCLin:")
evaluate(clf=clfSVCLin, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Report metrics for the logistic regression model.
print("clfLogRegr:")
evaluate(clf=clfLogRegr, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Report metrics for the XGBoost classifier.
print("clfXGB:")
evaluate(clf=clfXGB, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Report metrics for the multinomial naive Bayes model.
print("clfMulNB:")
evaluate(clf=clfMulNB, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

## Hyperparameter Tuning

In [None]:
# Grid-search the SVC regularisation strength C with 5-fold cross-validation
# on the training split, scored by weighted F1.
#
# Only two C values are searched to keep the run short. Wider candidates
# noted for later runs: C in (10, 100, 1000), other kernels ('linear',
# 'poly', 'sigmoid') and gamma in ('scale', 'auto').
parameters = {'C': [0.1, 1]}

clf = SVC()
# f1_score only honours pos_label when average='binary'; with
# average='weighted' it is ignored and merely triggers a warning on every
# scoring call, so it is not passed here. The resulting scores are unchanged.
f1_scorer = make_scorer(f1_score, average='weighted')

grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator: the best parameter combination, refit on the whole
# training split (GridSearchCV refits by default).
clf = grid_obj.best_estimator_
print(clf)

Save the best classifier

In [None]:
# Persist the tuned classifier together with the fitted TF-IDF vectorizer;
# both are needed to score new text. The context managers make sure each file
# is flushed and closed even if pickling fails part-way.
with open('../resources/tuned_senti_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
with open('../resources/tuned_tfidf_vect.pkl', 'wb') as vect_file:
    pickle.dump(tfidf_vect, vect_file)