In [None]:
# To install the required packages for the first time, run: !pip install <package name>
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import pickle
import scipy.sparse as sparse

#  Choose model to train
Choose between `svc` (support vector classifier), `rfc` (random forest classifier), `nb` (naive Bayes), and `lr` (logistic regression) by setting `model_name` in the next cell.

In [None]:
# Key of the classifier to train. The fitting cell accepts 'svc', 'rfc', 'nb',
# or 'lr' and raises for anything else; the same key prefixes every saved file.
model_name = 'lr'

# Load feature matrices and labels (used to train the model)

In [None]:
# Directory layout. All paths are relative, so they resolve against the
# notebook's current working directory.
SENTIMENT140_DATA_DIR = 'Sentiment140.data'  # Sentiment140 data set is stored here
DG_DATA_DIR = 'D_G data'  # D&G data set is stored here
OUTPUT_DIR = 'output'  # intermediate outputs, fitted models, and predictions are read from / written to here
FIGURES_DIR = 'figures'  # figures are saved here

In [None]:
def _load_sparse(stem):
    """Read the SciPy sparse matrix stored at OUTPUT_DIR/<stem>.npz."""
    return sparse.load_npz(os.path.join(OUTPUT_DIR, f'{stem}.npz'))


def _load_array(stem):
    """Read the NumPy array stored at OUTPUT_DIR/<stem>.npy (pickled objects allowed)."""
    return np.load(os.path.join(OUTPUT_DIR, f'{stem}.npy'), allow_pickle=True)


# Sparse matrices saved as word_counts_*.npz
word_counts_train = _load_sparse('word_counts_train')
word_counts_test = _load_sparse('word_counts_test')
word_counts_chopsticks = _load_sparse('word_counts_chopsticks')
word_counts_all = _load_sparse('word_counts_all')

# Sparse matrices saved as X_*.npz
X_140 = _load_sparse('X_140')
X_test = _load_sparse('X_test')
X_chopsticks = _load_sparse('X_chopsticks')
X_all = _load_sparse('X_all')

# Matrices used below for fitting (X_train) and validation (X_validation)
X_train = _load_sparse('X_train')
X_validation = _load_sparse('X_validation')

# Label arrays saved as y_*.npy
y_train = _load_array('y_train')
y_trainsmall = _load_array('y_trainsmall')
y_validation = _load_array('y_validation')
y_test = _load_array('y_test')

# Fit model

In [None]:
# Build the estimator selected by `model_name` (tuned parameters where noted), then fit it.
# Every entry is a zero-argument factory, so only the chosen estimator is instantiated.
model_factories = {
    'svc': lambda: SVC(kernel='linear', C=0.1, probability=True),
    'rfc': lambda: RandomForestClassifier(min_samples_split=90),
    'nb': lambda: MultinomialNB(),  # using default hyperparameters
    'lr': lambda: LogisticRegression(),  # using default hyperparameters
}

if model_name not in model_factories:
    raise Exception('unknown model')

model = model_factories[model_name]()

# NOTE(review): X_train is paired with y_trainsmall (not y_train) here and in the
# training report below — presumably y_trainsmall is the label vector aligned
# with X_train; confirm against the notebook that produced these files.
model.fit(X_train, y_trainsmall)

In [None]:
# Hard labels and class probabilities for the training matrix
y_train_pred, y_train_proba = model.predict(X_train), model.predict_proba(X_train)

# Hard labels and class probabilities for the validation matrix
y_validation_pred, y_validation_proba = (
    model.predict(X_validation),
    model.predict_proba(X_validation),
)

In [None]:
# Print one classification report per split, with a ruler line between them.
evaluation_splits = (
    ('Training performance', y_trainsmall, y_train_pred),
    ('Validation performance', y_validation, y_validation_pred),
)

for position, (heading, y_true, y_hat) in enumerate(evaluation_splits):
    if position > 0:
        # blank line, ruler, blank line — same output as three separate prints
        print('', '====================', '', sep='\n')
    print(heading)
    print(classification_report(y_true, y_hat))

In [None]:
# Histogram of column 1 of the predicted class probabilities on the training matrix.
fig, ax = plt.subplots()
ax.hist(y_train_proba[:, 1])
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Number of tweets')
ax.set_title('Predicted probability distribution training')
plt.show()

In [None]:
# Histogram of column 1 of the predicted class probabilities on the validation matrix.
fig, ax = plt.subplots()
ax.hist(y_validation_proba[:, 1])
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Number of tweets')
ax.set_title('Predicted probability distribution validation')
plt.show()

In [None]:
# Hard labels and class probabilities for the test matrix (run once hyperparameters are tuned)
y_test_pred, y_test_proba = model.predict(X_test), model.predict_proba(X_test)

In [None]:
# Histogram of column 1 of the predicted class probabilities on the test matrix.
fig, ax = plt.subplots()
ax.hist(y_test_proba[:, 1])
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Number of tweets')
ax.set_title('Predicted probability distribution test')
plt.show()

In [None]:
# Choose the negative and positive probability thresholds by exhaustive grid search.
#
# For a candidate pair (a, b), a sample whose column-1 probability is below `a`
# is labelled 0, one above `b` is labelled 4, and everything else is labelled 2.
# Every pair on a 0.01 grid (a = 0.00..0.99, b = 1.00..0.01) is scored with
# accuracy against y_test; the first pair reaching the highest accuracy is kept
# in top_a / top_b and its labels replace y_test_pred.
#
# NOTE(review): the thresholds are tuned on the same test set whose performance
# is reported afterwards, so that reported figure is optimistically biased.
topacc = 0  # maximum accuracy found so far
top_a = 0  # negative threshold achieving topacc
top_b = 0  # positive threshold achieving topacc

# The probability column never changes during the search, so extract it once.
positive_proba = np.asarray(y_test_proba)[:, 1]

for i in range(100):
    a = 0 + (i*0.01)
    # The negative mask depends only on `a`; compute it once per outer step.
    is_negative = positive_proba < a
    for j in range(100):
        b = 1 - (j*0.01)

        # Vectorized labelling. The negative test is applied first, so it takes
        # precedence over the positive test exactly like an if / elif chain.
        y_test_pred_binary = np.where(
            is_negative, 0, np.where(positive_proba > b, 4, 2)
        )

        accuracy = accuracy_score(y_test, y_test_pred_binary)
        # Strict '>' keeps the first best pair when several pairs tie.
        if accuracy > topacc:
            topacc = accuracy
            top_a = a
            top_b = b
            y_test_pred = y_test_pred_binary
            print("acc = " + str(accuracy) + " a = " + str(a) + " b = " + str(b))


print("overall top acc = " + str(topacc)+ " a = " + str(top_a)+" b = " + str(top_b))

In [None]:
# Header first, then the classification report for y_test_pred as left by the preceding cells.
print('Testing performance')
test_report = classification_report(y_test, y_test_pred)
print(test_report)

In [None]:
def _sentiment_from_probability(probability):
    """Map a probability to -1 (negative), 0 (neutral), or 1 (positive).

    Uses the thresholds chosen above: below top_a is negative, above top_b is
    positive, anything else is neutral. The negative test is applied first.
    """
    if probability < top_a:
        return -1
    if probability > top_b:
        return 1
    return 0


# prediction on chopsticks
y_chopsticks_proba = model.predict_proba(X_chopsticks)
y_chopsticks_pred = [_sentiment_from_probability(p) for p in y_chopsticks_proba[:, 1]]

# prediction on d&g overall
y_all_proba = model.predict_proba(X_all)
y_all_pred = [_sentiment_from_probability(p) for p in y_all_proba[:, 1]]

In [None]:
# Histogram of column 1 of the predicted class probabilities on the chopsticks matrix.
fig, ax = plt.subplots()
ax.hist(y_chopsticks_proba[:, 1])
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Number of tweets')
ax.set_title('Predicted probability distribution D & G chopsticks')
plt.show()

In [None]:
# Histogram of column 1 of the predicted class probabilities on the overall D & G matrix.
fig, ax = plt.subplots()
ax.hist(y_all_proba[:, 1])
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Number of tweets')
ax.set_title('Predicted probability distribution D & G all')
plt.show()

# Save model and predictions

In [None]:
# Pickle the fitted model to OUTPUT_DIR/<model_name>_model
model_path = os.path.join(OUTPUT_DIR, f'{model_name}_model')
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# Arrays to persist, as (file-name suffix, values) pairs. Each one is written to
# OUTPUT_DIR/<model_name>_<suffix>.npy, in the order listed.
arrays_to_save = (
    # sentiment 140 predictions
    ('y_train_pred', y_train_pred),
    ('y_train_proba', y_train_proba),
    ('y_validation_pred', y_validation_pred),
    ('y_validation_proba', y_validation_proba),
    ('y_test_pred', y_test_pred),
    ('y_test_proba', y_test_proba),
    # D&G predictions
    ('y_chopsticks_pred', y_chopsticks_pred),
    ('y_chopsticks_proba', y_chopsticks_proba),
    ('y_all_pred', y_all_pred),
    ('y_all_proba', y_all_proba),
)

for suffix, values in arrays_to_save:
    np.save(os.path.join(OUTPUT_DIR, f'{model_name}_{suffix}.npy'), values)

In [None]:
# Reload the fitted model and every saved prediction array from OUTPUT_DIR.
#
# SECURITY: pickle.load (and np.load with allow_pickle=True) can execute
# arbitrary code while deserializing. Only load files written by this notebook.
# The context manager guarantees the model file handle is closed after loading.
with open(os.path.join(OUTPUT_DIR, f'{model_name}_model'), 'rb') as f:
    model = pickle.load(f)

y_train_pred = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_train_pred.npy'), allow_pickle=True)
y_train_proba = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_train_proba.npy'), allow_pickle=True)
y_validation_pred = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_validation_pred.npy'), allow_pickle=True)
y_validation_proba = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_validation_proba.npy'), allow_pickle=True)

# Test-set arrays are read from their own *_y_test_*.npy files (the ones the
# save step writes), not from the validation prediction file.
y_test_proba = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_test_proba.npy'), allow_pickle=True)
y_test_pred = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_test_pred.npy'), allow_pickle=True)

y_chopsticks_pred = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_chopsticks_pred.npy'), allow_pickle=True)
y_chopsticks_proba = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_chopsticks_proba.npy'), allow_pickle=True)

y_all_pred = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_all_pred.npy'), allow_pickle=True)
y_all_proba = np.load(os.path.join(OUTPUT_DIR, f'{model_name}_y_all_proba.npy'), allow_pickle=True)
