### Settings and imports! <div class="tocSkip">
    
`%run settings` executes `settings.py`. It contains most settings and imports.
    
For details on `autoreload`, refer to [this page](https://ipython.org/ipython-doc/stable/config/extensions/autoreload.html).

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Execute settings.py; the names it defines (presumably pd, np, plt, sns - TODO confirm)
# become available in this notebook.
%run settings
# Use the high-resolution 'retina' format for inline figures.
%config InlineBackend.figure_format = 'retina'

# Load (or reload) the autoreload extension and reload all modules before
# each cell execution (mode 2).
%reload_ext autoreload
%autoreload 2

In [None]:
# Display the value of every top-level expression in a cell instead of only
# the last one ("last_expr" is the IPython default).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import logging

# Install the default handler on the root logger with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s: %(levelname)s: %(message)s',
)

# Raise the root threshold to WARNING right away; of the three probe messages
# below only the WARNING and ERROR ones pass this threshold.
logging.root.setLevel(logging.WARNING)
logging.info("Logging INFOS.")
logging.warning("Logging WARNINGS.")
logging.error("Logging ERRORS.")

In [None]:
# Show floats with 3 decimal places in IPython output (used for similarity values).
%precision 3
# Print small NumPy floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# remaining imports
import pickle
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Automated Hate Speech Detection and the Problem of Offensive Language

In [None]:
# Read the labelled data set from CSV and show 10 randomly chosen rows.
df = pd.read_csv("../resources/auto_labled_data.csv")
df.sample(10)

#### Prepare data

In [None]:
def classification(x):
    """Translate a numeric class code into its label name.

    0 -> "hate_speech", 1 -> "offensive_language", 2 -> "neither".
    Any other value yields None.
    """
    code_names = (
        (0, "hate_speech"),
        (1, "offensive_language"),
        (2, "neither"),
    )
    # Compare with == in code order so that values which merely equal a code
    # (e.g. 1.0) are handled like the code itself.
    for code, name in code_names:
        if x == code:
            return name
    return None


# Replace the numeric codes in the 'class' column with their label names.
# NOTE: running this cell a second time maps the already-converted strings to
# None, because classification() only recognises 0, 1 and 2.
df['class'] = df['class'].apply(classification)

In [None]:
# Drop the columns that the rest of the notebook does not use
# (only 'tweet' and 'class' are read below).
df = df.drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])

In [None]:
# Display the prepared DataFrame.
df

In [None]:
# Name of the column holding the raw text that gets vectorized.
text_col = 'tweet'

# Name of the column holding the target label to predict.
label = 'class'

## Training a classifier for sentiment analysis


### Vectorizer


In [None]:
# Number of rows per class in the target column, shown as a one-column frame.
df[label].value_counts().to_frame()

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
# min_df=10 drops terms found in fewer than 10 documents; max_df=0.3 drops
# terms found in more than 30% of the documents. Text is lowercased and no
# stop-word list is applied.
tfidf_vect = TfidfVectorizer(ngram_range=(1, 2),
                             min_df=10,
                             max_df=0.3,
                             lowercase=True,
                             stop_words=None)

# Learn the vocabulary on the full text column and build the sparse
# document-term matrix; the next line shows its (documents, terms) shape.
X_tfidf = tfidf_vect.fit_transform(df[text_col])
X_tfidf.shape

### Train-Test-Split

In [None]:
# Feature matrix and target vector used for training and evaluation.
# (Alternative noted by the author: X = X_tf - X_tf is not defined in the
# visible part of this notebook.)
X = X_tfidf
y = df[label]

In [None]:
# Fraction of the samples held out for testing; 0.0 disables the holdout.
test_size = 0.2

if test_size > 0.0:
    # Stratify on y so that both partitions keep the class proportions; the
    # fixed random_state makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=43,
    )
else:
    # No holdout: train on everything and leave the test partition unset.
    X_train, X_test, y_train, y_test = X, None, y, None


print("Training matrix:", X_train.shape)
# X_test is None when no holdout is requested, so guard the attribute access
# instead of failing with AttributeError.
print("Test matrix:    ", X_test.shape if X_test is not None else None)

In [None]:
# Tag every row of df with the partition it landed in. The membership mask is
# computed against df.index and assigned positionally, so the result stays
# correct even if df does not carry the default 0..n-1 index (a freshly built
# pd.Series would be re-aligned by label on assignment and could produce NaN).
# NOTE: requires a holdout, i.e. y_test must not be None.
df['train_test'] = np.where(df.index.isin(y_test.index), 'Test', 'Train')

In [None]:
# Number of rows in each partition.
df['train_test'].value_counts()

### Training

In [None]:
print(f'Training on column {label}')

# Linear support vector classifier with C=1.0; max_iter is set to 10000
# (presumably to give the solver room to converge - TODO confirm).
clf = LinearSVC(C=1.0, max_iter=10000)

# Fit on the training partition only.
clf.fit(X_train, y_train);

print("Done.")

### Scoring

In [None]:
# Predictions for the holdout, the training partition and the full data set.
# y_test_pred is reused by the report and confusion-matrix cells below.
y_test_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X)

print(f"Classifier: {clf.__class__}\n")

print('Accuracy Summary')
print('================')

# One line per partition: padded caption, then the accuracy in percent.
for caption, y_actual, y_predicted in (
    ('Test:    ', y_test, y_test_pred),
    ('Train:   ', y_train, y_train_pred),
    ('Overall: ', y, y_pred),
):
    print(f'{caption}{accuracy_score(y_actual, y_predicted)*100:6.2f}%')

In [None]:
# Text report of the per-class metrics computed by scikit-learn's
# classification_report on the held-out test partition.
print("Classification Report")
print("=====================")
print(classification_report(y_true=y_test, y_pred=y_test_pred))

### Confusion Matrix

In [None]:
# Sorted class names found in the test labels; they fix the row/column order
# of the confusion matrix and are reused by the later plotting cells.
label_names = sorted(y_test.unique())

# Figure side length grows with the number of classes.
fsize = len(label_names)

conf_mat = confusion_matrix(y_true=y_test, y_pred=y_test_pred, labels=label_names)

# Draw on an explicit Axes; results are assigned to `_` so that the returned
# objects are not echoed in the cell output.
fig, ax = plt.subplots(figsize=(fsize, fsize))
_ = sns.heatmap(
    conf_mat,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=label_names,
    yticklabels=label_names,
)
_ = ax.set_ylabel("Actual")
_ = ax.set_xlabel("Predicted")
_ = ax.set_title(f"Confusion Matrix for {label}", fontsize=14)

## Dump model into a file and use it again

In [None]:
# Save the trained classifier to disk. The context manager closes the file
# handle deterministically instead of leaving that to garbage collection, so
# the pickle is fully written before it is read back.
with open('senti_svc_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Load the classifier back from disk.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('senti_svc_model.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)

The same goes for the vectorizer (transformation):

In [None]:
# Save the fitted vectorizer to disk. The context manager closes the file
# handle deterministically instead of leaving that to garbage collection.
with open('tfidf_vect.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vect, vectorizer_file)

# Load the vectorizer back from disk.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('tfidf_vect.pkl', 'rb') as vectorizer_file:
    tfidf_vect = pickle.load(vectorizer_file)

# Test this model with a modified fox_news dataset

In [None]:
# Read the semicolon-separated fox_news CSV and show 10 randomly chosen rows.
df_testset = pd.read_csv("../resources/fox_news.csv", sep=";")
df_testset.sample(10)

In [None]:
def classification_foxnews(x):
    """Translate a fox_news class code into the label names used above.

    0 -> "neither", 1 -> "offensive_language". Any other value yields None.
    """
    for code, name in ((0, "neither"), (1, "offensive_language")):
        if x == code:
            return name
    return None


# Replace the numeric codes in the 'class' column with label names.
# NOTE: running this cell a second time maps the already-converted strings to
# None, because classification_foxnews() only recognises 0 and 1.
df_testset['class'] = df_testset['class'].apply(classification_foxnews)
df_testset.sample(10)

In [None]:
# Vectorize the fox_news texts with the already fitted vectorizer
# (transform only, no re-fitting) and take the labels as ground truth.
x_foxnews = tfidf_vect.transform(df_testset['tweet'])
y_foxnews = df_testset['class']

In [None]:
# Predict labels for the fox_news texts.
# NOTE: this overwrites y_pred from the scoring cell above.
y_pred = clf.predict(x_foxnews)

In [None]:
# Accuracy of the classifier on the fox_news data, in percent.
print('Accuracy Summary')
print('================')

print(f'Test:    {accuracy_score(y_foxnews, y_pred)*100:6.2f}%')

In [None]:
# Confusion matrix for the fox_news predictions; label_names and fsize are
# taken from the earlier confusion-matrix cell.
conf_mat = confusion_matrix(y_true=y_foxnews, y_pred=y_pred, labels=label_names)

# Draw on an explicit Axes; results are assigned to `_` so that the returned
# objects are not echoed in the cell output.
fig, ax = plt.subplots(figsize=(fsize, fsize))
_ = sns.heatmap(
    conf_mat,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=label_names,
    yticklabels=label_names,
)
_ = ax.set_ylabel("Actual")
_ = ax.set_xlabel("Predicted")
_ = ax.set_title(f"Confusion Matrix for {label}", fontsize=14)

# Test an already existing model with some new data

Load the model and the vectorizer. (To obtain this model, the code currently has to be changed manually to use the fox_news dataset; this should be changed later.)

In [None]:
# Load the previously trained classifier and its matching vectorizer. The
# context managers close the file handles deterministically instead of
# leaving that to garbage collection.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('../sentimentDetector/fox_news_model.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)
with open('../sentimentDetector/fox_news_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vect = pickle.load(vectorizer_file)

Load the data and prepare it

In [None]:
# Reload the labelled data set from scratch, convert the numeric class codes
# to label names with classification() (defined earlier in this notebook) and
# drop the columns that are not used below.
df = pd.read_csv("../resources/auto_labled_data.csv")
df['class'] = df['class'].apply(classification)
df = df.drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])

Do the prediction

In [None]:
# Vectorize the texts with the loaded vectorizer (transform only), take the
# labels as ground truth and predict with the loaded classifier.
# NOTE: y and y_pred from the earlier sections are overwritten here.
x = tfidf_vect.transform(df['tweet'])
y = df['class']
y_pred = clf.predict(x)

Evaluate the prediction

In [None]:
# Accuracy of the loaded classifier on the reloaded data set, in percent.
print('Accuracy Summary')
print('================')

print(f'Test:    {accuracy_score(y, y_pred)*100:6.2f}%')

In [None]:
# Confusion matrix for the loaded model's predictions; label_names and fsize
# are taken from the earlier confusion-matrix cell.
conf_mat = confusion_matrix(y_true=y, y_pred=y_pred, labels=label_names)

# Draw on an explicit Axes; results are assigned to `_` so that the returned
# objects are not echoed in the cell output.
fig, ax = plt.subplots(figsize=(fsize, fsize))
_ = sns.heatmap(
    conf_mat,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=label_names,
    yticklabels=label_names,
)
_ = ax.set_ylabel("Actual")
_ = ax.set_xlabel("Predicted")
_ = ax.set_title(f"Confusion Matrix for {label}", fontsize=14)