# Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from textblob import TextBlob
import warnings

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier
# Performance metric
from sklearn.metrics import f1_score

pd.set_option('display.max_columns', None)

# Load Data

In [2]:
df_tweets = pd.read_csv('C:/Users/melin/Documents/Springboard Data Science Career Track/Capstone Projects/Capstone Project 2/df_tweets.csv')

# Create Sentiment Labels

In [3]:
# function for labeling sentiment words
def sentimentize(text):
    if TextBlob(str(text)).sentiment.subjectivity <= 0.05:
        label = 'neutral'
    elif TextBlob(str(text)).sentiment.polarity > 0.1:
        label = 'positive'
    else:
        label = 'negative'
    return label

In [4]:
#Apply sentimentize function to cleaned_text column in df_tweets dataframe
df_tweets['sentiment'] = df_tweets['cleaned_text'].apply(lambda x: sentimentize(x))

### Checking Counts of Sentiment Labels

In [5]:
#Checking the value counts of sentiment column in df_tweets
df_tweets['sentiment'].value_counts()

neutral     315501
negative    255354
positive    242523
Name: sentiment, dtype: int64

### Export Dataset with Sentiment Labels to csv

In [6]:
df_tweets.to_csv (r'C:/Users/melin/Documents/Springboard Data Science Career Track/Capstone Projects/Capstone Project 2/tweets_sentiment.csv', index = False, header=True)

# Convert Text to Features

In [7]:
#Remove rows containing missing values under the cleaned_text column 
df_tweets = df_tweets[df_tweets['cleaned_text'].notnull()]

##Format sentiment labels (target variable) for input into sklearn’s MultiLabelBinarizer( )
#Converting values in target variable into lists
y = df_tweets['sentiment'].str.split() 

In [8]:
#One hot encode the target variable by using sklearn’s MultiLabelBinarizer( )
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [9]:
multilabel_binarizer.classes_

array(['negative', 'neutral', 'positive'], dtype=object)

In [10]:
# transform target variable
y = multilabel_binarizer.transform(y)

In [11]:
# split dataset into training and validation set
xtrain, xval, ytrain, yval = train_test_split(df_tweets['cleaned_text'], y, test_size=0.2, random_state=42)

#### TF-IDF

In [12]:
#Using TF-IDF to extract features from the cleaned version of the text data
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000) # set 10,000 most frequent words in the data as features

In [13]:
#Create TF-IDF features
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# Build Sentiment Analysis Model

#### TF-IDF and Naive Bayes Classifier for Multinomial Models with OneVsRestClassifier

In [14]:
#Use sklearn’s OneVsRestClassifier class to solve the Naive Bayes Classifier model's problem as a Binary Relevance or one-vs-all problem
nb = MultinomialNB()
clf_nb = OneVsRestClassifier(nb)

In [15]:
#Fit model on TF-IDF train data
clf_nb.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None)

In [16]:
#Make predictions for validation set
y_pred_tfidf_nb = clf_nb.predict(xval_tfidf)

#### TF-IDF and Linear Support Vector Machine with OneVsRestClassifier

In [17]:
#Use sklearn’s OneVsRestClassifier class to solve the Linear Support Vector Machine model's problem as a Binary Relevance or one-vs-all problem
lsvm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, tol=None)
clf_lsvm = OneVsRestClassifier(lsvm)

In [18]:
#Fit model on TF-IDF train data
clf_lsvm.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=SGDClassifier(alpha=0.001, average=False,
                                            class_weight=None,
                                            early_stopping=False, epsilon=0.1,
                                            eta0=0.0, fit_intercept=True,
                                            l1_ratio=0.15,
                                            learning_rate='optimal',
                                            loss='hinge', max_iter=1000,
                                            n_iter_no_change=5, n_jobs=None,
                                            penalty='l2', power_t=0.5,
                                            random_state=42, shuffle=True,
                                            tol=None, validation_fraction=0.1,
                                            verbose=0, warm_start=False),
                    n_jobs=None)

In [19]:
#Make predictions for validation set
y_pred_tfidf_lsvm = clf_lsvm.predict(xval_tfidf)

#### TF-IDF and Logistic Regression with OneVsRestClassifier

In [20]:
#Use sklearn’s OneVsRestClassifier class to solve the Logistic Regression model's problem as a Binary Relevance or one-vs-all problem
lr = LogisticRegression(max_iter = 4000)
clf_lr = OneVsRestClassifier(lr)

In [21]:
#Fit model on TF-IDF train data
clf_lr.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=4000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [22]:
#Make predictions for validation set
y_pred_tfidf_lr = clf_lr.predict(xval_tfidf)

# Evaluate Model

#### TF-IDF and Naive Bayes Classifier for Multinomial Models with OneVsRestClassifier

In [26]:
##Evaluate TF-IDF and Naive Bayes Classifier for Multinomial Models with OneVsRestClassifier

#confusion matrix
confusion = confusion_matrix(yval.argmax(axis=1), y_pred_tfidf_nb.argmax(axis=1))
print('Confusion Matrix\n')
print(confusion)

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(yval, y_pred_tfidf_nb)))

print('Micro Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_nb, average='micro', labels=np.unique(y_pred_tfidf_nb))))
print('Micro Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_nb, average='micro', labels=np.unique(y_pred_tfidf_nb))))
print('Micro F1-score: {:.2f}\n'.format(f1_score(yval, y_pred_tfidf_nb, average='micro', labels=np.unique(y_pred_tfidf_nb))))

print('Macro Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_nb, average='macro', labels=np.unique(y_pred_tfidf_nb))))
print('Macro Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_nb, average='macro', labels=np.unique(y_pred_tfidf_nb))))
print('Macro F1-score: {:.2f}\n'.format(f1_score(yval, y_pred_tfidf_nb, average='macro', labels=np.unique(y_pred_tfidf_nb))))

print('Weighted Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_nb, average='weighted', labels=np.unique(y_pred_tfidf_nb))))
print('Weighted Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_nb, average='weighted', labels=np.unique(y_pred_tfidf_nb))))
print('Weighted F1-score: {:.2f}'.format(f1_score(yval, y_pred_tfidf_nb, average='weighted', labels=np.unique(y_pred_tfidf_nb))))

from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(yval, y_pred_tfidf_nb, target_names=['positive', 'neutral', 'negative']))

Confusion Matrix

[[49999   322   836]
 [31499 30901   549]
 [28604    73 19892]]

Accuracy: 0.45

Micro Precision: 0.97
Micro Recall: 0.47
Micro F1-score: 0.63

Macro Precision: 0.97
Macro Recall: 0.47
Macro F1-score: 0.63

Weighted Precision: 0.97
Weighted Recall: 0.47
Weighted F1-score: 0.63

Classification Report

              precision    recall  f1-score   support

    positive       0.96      0.44      0.61     51157
     neutral       0.99      0.49      0.66     62949
    negative       0.93      0.41      0.57     48569

   micro avg       0.96      0.45      0.62    162675
   macro avg       0.96      0.45      0.61    162675
weighted avg       0.96      0.45      0.61    162675
 samples avg       0.45      0.45      0.45    162675



#### TF-IDF and Linear Support Vector Machine with OneVsRestClassifier

In [24]:
##Evaluate TF-IDF and Linear Support Vector Machine with OneVsRestClassifier

#confusion matrix
confusion = confusion_matrix(yval.argmax(axis=1), y_pred_tfidf_lsvm.argmax(axis=1))
print('Confusion Matrix\n')
print(confusion)

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(yval, y_pred_tfidf_lsvm)))

print('Micro Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_lsvm, average='micro', labels=np.unique(y_pred_tfidf_lsvm))))
print('Micro Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_lsvm, average='micro', labels=np.unique(y_pred_tfidf_lsvm))))
print('Micro F1-score: {:.2f}\n'.format(f1_score(yval, y_pred_tfidf_lsvm, average='micro', labels=np.unique(y_pred_tfidf_lsvm))))

print('Macro Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_lsvm, average='macro', labels=np.unique(y_pred_tfidf_lsvm))))
print('Macro Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_lsvm, average='macro', labels=np.unique(y_pred_tfidf_lsvm))))
print('Macro F1-score: {:.2f}\n'.format(f1_score(yval, y_pred_tfidf_lsvm, average='macro', labels=np.unique(y_pred_tfidf_lsvm))))

print('Weighted Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_lsvm, average='weighted', labels=np.unique(y_pred_tfidf_lsvm))))
print('Weighted Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_lsvm, average='weighted', labels=np.unique(y_pred_tfidf_lsvm))))
print('Weighted F1-score: {:.2f}'.format(f1_score(yval, y_pred_tfidf_lsvm, average='weighted', labels=np.unique(y_pred_tfidf_lsvm))))

from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(yval, y_pred_tfidf_lsvm, target_names=['positive', 'neutral', 'negative']))

Confusion Matrix

[[50659   257   241]
 [56892  6056     1]
 [40072   155  8342]]

Accuracy: 0.11

Micro Precision: 0.94
Micro Recall: 0.08
Micro F1-score: 0.15

Macro Precision: 0.94
Macro Recall: 0.08
Macro F1-score: 0.14

Weighted Precision: 0.94
Weighted Recall: 0.08
Weighted F1-score: 0.15

Classification Report

              precision    recall  f1-score   support

    positive       0.94      0.06      0.11     51157
     neutral       0.94      0.10      0.17     62949
    negative       0.97      0.17      0.29     48569

   micro avg       0.95      0.11      0.19    162675
   macro avg       0.95      0.11      0.19    162675
weighted avg       0.95      0.11      0.19    162675
 samples avg       0.11      0.11      0.11    162675



#### TF-IDF and Logistic Regression with OneVsRestClassifier

In [25]:
##Evaluate TF-IDF and Logistic Regression with OneVsRestClassifier

#confusion matrix
confusion = confusion_matrix(yval.argmax(axis=1), y_pred_tfidf_lr.argmax(axis=1))
print('Confusion Matrix\n')
print(confusion)

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(yval, y_pred_tfidf_lr)))

print('Micro Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_lr, average='micro', labels=np.unique(y_pred_tfidf_lr))))
print('Micro Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_lr, average='micro', labels=np.unique(y_pred_tfidf_lr))))
print('Micro F1-score: {:.2f}\n'.format(f1_score(yval, y_pred_tfidf_lr, average='micro', labels=np.unique(y_pred_tfidf_lr))))

print('Macro Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_lr, average='macro', labels=np.unique(y_pred_tfidf_lr))))
print('Macro Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_lr, average='macro', labels=np.unique(y_pred_tfidf_lr))))
print('Macro F1-score: {:.2f}\n'.format(f1_score(yval, y_pred_tfidf_lr, average='macro', labels=np.unique(y_pred_tfidf_lr))))

print('Weighted Precision: {:.2f}'.format(precision_score(yval, y_pred_tfidf_lr, average='weighted', labels=np.unique(y_pred_tfidf_lr))))
print('Weighted Recall: {:.2f}'.format(recall_score(yval, y_pred_tfidf_lr, average='weighted', labels=np.unique(y_pred_tfidf_lr))))
print('Weighted F1-score: {:.2f}'.format(f1_score(yval, y_pred_tfidf_lr, average='weighted', labels=np.unique(y_pred_tfidf_lr))))

from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(yval, y_pred_tfidf_lr, target_names=['positive', 'neutral', 'negative']))

Confusion Matrix

[[50014   394   749]
 [  883 62032    34]
 [ 4440   205 43924]]

Accuracy: 0.93

Micro Precision: 0.97
Micro Recall: 0.95
Micro F1-score: 0.96

Macro Precision: 0.97
Macro Recall: 0.95
Macro F1-score: 0.96

Weighted Precision: 0.97
Weighted Recall: 0.95
Weighted F1-score: 0.96

Classification Report

              precision    recall  f1-score   support

    positive       0.94      0.91      0.93     51157
     neutral       0.99      0.99      0.99     62949
    negative       0.96      0.93      0.95     48569

   micro avg       0.97      0.95      0.96    162675
   macro avg       0.96      0.94      0.95    162675
weighted avg       0.97      0.95      0.96    162675
 samples avg       0.94      0.95      0.94    162675

