In [None]:
# !pip install spark-nlp -q
# !pip install pyspark -q

In [None]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC, SVC
from sklearn import metrics

from sklearn.utils import shuffle
import shutil

import sparknlp
spark = sparknlp.start()
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark import SparkContext, SQLContext

LABEL_IDS = {'FAVOUR': 0, 'AGAINST': 1, 'NEUTRAL': 2}
N_LABELS = len(LABEL_IDS)



In [None]:
stance_df = pd.read_csv('labelled_stance_data.csv')

In [None]:
stance_df.shape

(3015, 46)

### Hashing Vectorizer

In [None]:
train_df, test_df = train_test_split(stance_df, test_size=0.3)
print(train_df.shape, test_df.shape)

x_train = train_df['tweet']
y_train = train_df['stance']

x_test = test_df['tweet']
y_test = test_df['stance']

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(2110, 46) (905, 46)
(2110,) (2110,) (905,) (905,)


In [None]:
# Convert tweets into Vectors
hashingVectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
hashingVectorizer.fit(x_train)
x_train_new = hashingVectorizer.transform(x_train)
x_test_new = hashingVectorizer.transform(x_test)

In [None]:
# Oversampling
smote = SMOTE()
ov_train_x, ov_train_y = smote.fit_resample(x_train_new, y_train)
ov_test_x, ov_test_y = smote.fit_resample(x_test_new, y_test)



In [None]:
# Linear SVM
linearSVM = LinearSVC()
linearSVM.fit(x_train_new, y_train)
y_pred_lsvm = linearSVM.predict(x_test_new)

print('Classification Report for Linear SVM with Hashing Vectorizer: \n', metrics.classification_report(y_test, y_pred_lsvm))

Classification Report for Linear SVM with Hashing Vectorizer: 
               precision    recall  f1-score   support

     AGAINST       0.57      0.33      0.42        76
      FAVOUR       0.76      0.90      0.82       651
     NEUTRAL       0.30      0.16      0.21       178

    accuracy                           0.70       905
   macro avg       0.54      0.46      0.48       905
weighted avg       0.66      0.70      0.67       905



In [None]:
linearSVM.fit(ov_train_x, ov_train_y)
y_pred_lsvm = linearSVM.predict(ov_test_x)

print('Classification Report for Linear SVM with Hashing Vectorizer & Oversampling: \n', metrics.classification_report(ov_test_y, y_pred_lsvm))

Classification Report for Linear SVM with Hashing Vectorizer & Oversampling: 
               precision    recall  f1-score   support

     AGAINST       0.83      0.52      0.64       651
      FAVOUR       0.47      0.78      0.59       651
     NEUTRAL       0.44      0.31      0.36       651

    accuracy                           0.54      1953
   macro avg       0.58      0.54      0.53      1953
weighted avg       0.58      0.54      0.53      1953



In [None]:
# Linear SVM
svm = SVC()
svm.fit(x_train_new, y_train)
y_pred_lsvm = svm.predict(x_test_new)

print('Classification Report for SVM with Hashing Vectorizer: \n', metrics.classification_report(y_test, y_pred_lsvm))

Classification Report for SVM with Hashing Vectorizer: 
               precision    recall  f1-score   support

     AGAINST       0.68      0.20      0.31        76
      FAVOUR       0.73      0.99      0.84       651
     NEUTRAL       0.22      0.01      0.02       178

    accuracy                           0.73       905
   macro avg       0.55      0.40      0.39       905
weighted avg       0.63      0.73      0.64       905



In [None]:
svm.fit(ov_train_x, ov_train_y)
y_pred_lsvm = svm.predict(ov_test_x)

print('Classification Report for SVM with Hashing Vectorizer & Oversampling: \n', metrics.classification_report(ov_test_y, y_pred_lsvm))

Classification Report for SVM with Hashing Vectorizer & Oversampling: 
               precision    recall  f1-score   support

     AGAINST       0.93      0.53      0.68       651
      FAVOUR       0.58      0.98      0.73       651
     NEUTRAL       0.64      0.47      0.54       651

    accuracy                           0.66      1953
   macro avg       0.72      0.66      0.65      1953
weighted avg       0.72      0.66      0.65      1953



In [None]:
# Logistic Regression
logisticRegression = LogisticRegression()
logisticRegression.fit(x_train_new, y_train)
y_pred_lr = logisticRegression.predict(x_test_new)

print('Classification Report for Logistic Regression with Hashing Vectorizer: \n', metrics.classification_report(y_test, y_pred_lr))

Classification Report for Logistic Regression with Hashing Vectorizer: 
               precision    recall  f1-score   support

     AGAINST       0.72      0.24      0.36        76
      FAVOUR       0.74      0.97      0.84       651
     NEUTRAL       0.30      0.06      0.09       178

    accuracy                           0.73       905
   macro avg       0.59      0.42      0.43       905
weighted avg       0.66      0.73      0.65       905



In [None]:
# Logistic Regression
logisticRegression = LogisticRegression()
logisticRegression.fit(ov_train_x, ov_train_y)
y_pred_lr = logisticRegression.predict(ov_test_x)

print('Classification Report for Logistic Regression with Hashing Vectorizer & OverSampling: \n', metrics.classification_report(ov_test_y, y_pred_lr))

Classification Report for Logistic Regression with Hashing Vectorizer & OverSampling: 
               precision    recall  f1-score   support

     AGAINST       0.84      0.61      0.71       651
      FAVOUR       0.48      0.75      0.59       651
     NEUTRAL       0.46      0.34      0.39       651

    accuracy                           0.57      1953
   macro avg       0.60      0.57      0.56      1953
weighted avg       0.60      0.57      0.56      1953



### TF-IDF

In [None]:
tfidf_vec = TfidfVectorizer(max_features=300)
tfidf_vec.fit(x_train)
x_train_tfidf = tfidf_vec.transform(x_train)
x_test_tfidf = tfidf_vec.transform(x_test)

In [None]:
ov_train_x_tfidf, ov_train_y_tfidf = smote.fit_resample(x_train_tfidf, y_train)
ov_test_x_tfidf, ov_test_y_tfidf = smote.fit_resample(x_test_tfidf, y_test)



In [None]:
# Linear SVM
linearSVM = LinearSVC()
linearSVM.fit(x_train_tfidf, y_train)
y_pred_lsvm = linearSVM.predict(x_test_tfidf)

print('Classification Report for Linear SVM with TF-IDF: \n', metrics.classification_report(y_test, y_pred_lsvm))

Classification Report for Linear SVM with TF-IDF: 
               precision    recall  f1-score   support

     AGAINST       0.55      0.38      0.45        76
      FAVOUR       0.77      0.92      0.84       651
     NEUTRAL       0.36      0.14      0.20       178

    accuracy                           0.72       905
   macro avg       0.56      0.48      0.50       905
weighted avg       0.67      0.72      0.68       905



In [None]:
linearSVM.fit(ov_train_x_tfidf, ov_train_y_tfidf)
y_pred_lsvm = linearSVM.predict(ov_test_x_tfidf)

print('Classification Report for Linear SVM with TF-IDF & Oversampling: \n', metrics.classification_report(ov_test_y_tfidf, y_pred_lsvm))

Classification Report for Linear SVM with TF-IDF & Oversampling: 
               precision    recall  f1-score   support

     AGAINST       0.68      0.63      0.66       651
      FAVOUR       0.49      0.61      0.54       651
     NEUTRAL       0.47      0.39      0.42       651

    accuracy                           0.54      1953
   macro avg       0.55      0.54      0.54      1953
weighted avg       0.55      0.54      0.54      1953



In [None]:
# SVM
svm = SVC()
svm.fit(x_train_tfidf, y_train)
y_pred = svm.predict(x_test_tfidf)

print('Classification Report for SVM with TF-IDF: \n', metrics.classification_report(y_test, y_pred))

Classification Report for SVM with TF-IDF: 
               precision    recall  f1-score   support

     AGAINST       0.72      0.28      0.40        76
      FAVOUR       0.74      0.98      0.85       651
     NEUTRAL       0.31      0.02      0.04       178

    accuracy                           0.73       905
   macro avg       0.59      0.43      0.43       905
weighted avg       0.65      0.73      0.65       905



In [None]:
svm.fit(ov_train_x_tfidf, ov_train_y_tfidf)
y_pred = svm.predict(ov_test_x_tfidf)

print('Classification Report for SVM with TF-IDF & Oversampling: \n', metrics.classification_report(ov_test_y_tfidf, y_pred))

Classification Report for SVM with TF-IDF & Oversampling: 
               precision    recall  f1-score   support

     AGAINST       0.84      0.61      0.71       651
      FAVOUR       0.58      0.95      0.72       651
     NEUTRAL       0.63      0.41      0.49       651

    accuracy                           0.66      1953
   macro avg       0.69      0.66      0.64      1953
weighted avg       0.69      0.66      0.64      1953



In [None]:
# Logistic Regression
logisticRegression = LogisticRegression()
logisticRegression.fit(x_train_new, y_train)
y_pred_lr = logisticRegression.predict(x_test_new)

print('Classification Report for Logistic Regression with TF-IDF: \n', metrics.classification_report(y_test, y_pred_lr))

Classification Report for Logistic Regression with TF-IDF: 
               precision    recall  f1-score   support

     AGAINST       0.72      0.24      0.36        76
      FAVOUR       0.74      0.97      0.84       651
     NEUTRAL       0.30      0.06      0.09       178

    accuracy                           0.73       905
   macro avg       0.59      0.42      0.43       905
weighted avg       0.66      0.73      0.65       905



In [None]:
logisticRegression.fit(ov_train_x_tfidf, ov_train_y_tfidf)
y_pred_lr = logisticRegression.predict(ov_test_x_tfidf)

print('Classification Report for Logistic Regression with TF-IDF & Oversampling: \n', metrics.classification_report(ov_test_y_tfidf, y_pred_lr))

Classification Report for Logistic Regression with TF-IDF & Oversampling: 
               precision    recall  f1-score   support

     AGAINST       0.71      0.66      0.69       651
      FAVOUR       0.49      0.63      0.56       651
     NEUTRAL       0.48      0.38      0.43       651

    accuracy                           0.56      1953
   macro avg       0.56      0.56      0.56      1953
weighted avg       0.56      0.56      0.56      1953



### Spark NLP (Universal Sentence Encoder & Classifier DL)

In [None]:
sqlContext = SQLContext(SparkContext.getOrCreate())

# Auxiliar functions
def equivalent_type(f):
    if f == 'datetime64[ns]': return TimestampType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    elif f == 'float64': return FloatType()
    else: return StringType()

def define_structure(string, format_type):
    try: typo = equivalent_type(format_type)
    except: typo = StringType()
    return StructField(string, typo)

# Given pandas dataframe, it will return a spark's dataframe.
def pandas_to_spark(pandas_df):
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    for column, typo in zip(columns, types): 
      struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return sqlContext.createDataFrame(pandas_df, p_schema)

In [None]:
trainDataset = pandas_to_spark(train_df)
testDataset = pandas_to_spark(test_df)

In [None]:
# Get tweets
document = DocumentAssembler().setInputCol('tweet').setOutputCol('tweet')

# Download Universal Sentence Encoder
use = UniversalSentenceEncoder.pretrained().setInputCols(['tweet']).setOutputCol('tweet_embeddings')

# Classifier DL
classifier_dl = ClassifierDLApproach().setInputCols(['tweet_embeddings']).setOutputCol('class').setLabelColumn('stance').setMaxEpochs(5).setEnableOutputLogs(True)

use_clf_dl_pipeline = Pipeline(
    stages=[
            document,
            use, classifier_dl
    ]
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
use_pipeline_model = use_clf_dl_pipeline.fit(trainDataset)

In [None]:
preds = use_pipeline_model.transform(testDataset)

pred_df = use_pipeline_model.transform(testDataset).select('stance','tweet','class.result').toPandas()

pred_df['result'] = pred_df['result'].apply(lambda x:x[0])

print(metrics.classification_report(pred_df.stance, pred_df.result))

              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        76
      FAVOUR       0.72      1.00      0.84       651
     NEUTRAL       0.00      0.00      0.00       178

    accuracy                           0.72       905
   macro avg       0.24      0.33      0.28       905
weighted avg       0.52      0.72      0.60       905



  _warn_prf(average, modifier, msg_start, len(result))
