# Twitter Sentiment Analysis Pipeline #2
*Refer to `notebooks/README.md` for an explanation of the various pipelines*

## Import dependencies

In [1]:
# Built-in
import re
import json
from enum import Enum
from collections import defaultdict
from datetime import datetime

# Importing datasets
import opendatasets as od

# Data manipulation
import pandas as pd
import numpy as np
from joblib import load as jl_load
from joblib import dump as jl_dump

# Graphing/Visualizing
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from IPython.display import display

In [2]:
# ML
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [12]:
# User-defined
from pipeline_1 import pipeline1, formatParams

## Define parameters

In [4]:
# ---------------------------------------------------------------------------
# Dataset import settings
# ---------------------------------------------------------------------------
# Both path templates contain a `{version}` placeholder; no 'version' entry is
# supplied while the option below stays commented out.
DATA_IMPORT_PARAMS = dict(
    data_path='https://www.kaggle.com/datasets/towhidultonmoy/russia-vs-ukraine-tweets-datasetdaily-updated{version}',
    # version=1,
    local_path='../../data/russia_vs_ukraine_tweets{version}.csv',
    new_data=False,
)


# ---------------------------------------------------------------------------
# Pipeline 1 settings, grouped by stage (control / extract / transform)
# ---------------------------------------------------------------------------
PIPE1_PARAMS = dict(
    control=dict(
        build_models=True,
        # config='../pipeline_1/config/config.json',
        # nb_export='../pipeline_3/pipeline_2',
    ),
    extract=dict(
        clean_tweet=[],    # list of regex rules
        clean_hashtag=[],  # list of regex rules
        # NOTE(review): the last recorded run of this notebook ended with
        # KeyError "['user_name', 'retweets', 'text'] not in index" -- three of
        # the keys below -- so this mapping may not line up with the columns of
        # the downloaded dataset. Confirm against pipeline_1.
        column_mappings=dict(
            date='date',
            user_name='username',
            retweets='retweets',
            text='tweet',
            hashtags='hashtags',
        ),
        # filter_words=['ukraine', 'russia', 'zelensky'],  # Only process tweets containing these words
        save_transform='./data/transformed/russia_ukraine_sentiment_{timestamp}.csv',
    ),
    transform=dict(
        load_word_vec='',
        save_word_vec='./models/russia_ukraine_word_vec_{timestamp}.model',
        word_vec_args={},
        load_kmeans='',
        save_kmeans='./models/russia_ukraine_kmeans_{timestamp}.joblib',
        kmeans_args={},
        display_terms=25,
        load_embeddings='',
        save_embeddings='./data/embeddings/russia_ukraine_words_{timestamp}.csv',
        sentiment_threshold=0.15,
        # Integer keys, so this one has to stay a literal.
        sentiment_map={-1: "negative", 0: "neutral", 1: "positive"},
    ),
)

# Stand-alone transform arguments for Pipeline 1 (currently none).
PIPE1_TRANSFORM_ARGS = dict()

# Where Pipeline 1 loads/saves its vectorizer and classifiers.
PIPE1_MODEL_ARGS = dict(
    load_vectorizer='',
    save_vectorizer='./models/russia_ukraine_vectorizer_{timestamp}.joblib',
    vectorizer_args={},
    load_svc='',
    save_svc='./models/russia_ukraine_linearSVC_{timestamp}.joblib',
    load_nb='',
    save_nb='./models/russia_ukraine_multinomialNB_{timestamp}.joblib',
)


# ---------------------------------------------------------------------------
# Pipeline 2 settings
# ---------------------------------------------------------------------------
PIPE2_CONTROL_PARAMS = dict(config='./config/config_{timestamp}.json')

# Artefacts handed to `extract`, plus the text/label column names.
PIPE2_EXTRACT_ARGS = dict(
    load_transform='./data/transformed/russia_ukraine_sentiment.csv',
    load_svc='../pipeline_1/models/slava_linearSVC.joblib',
    load_nb='../pipeline_1/models/slava_multinomialNB.joblib',
    load_vectorizer='../pipeline_1/models/slava_vectorizer.joblib',
    x_col='clean_tweet',
    y_col='sentiment_val',
)

PIPE2_MODEL_ARGS = dict()

PIPE2_INSPECT_ARGS = dict()

## Build Pipeline 2

### Define `extract` function

In [5]:

def extract(tweets_df, x_col, y_col, **kwargs):
    """Vectorize the tweets and load the saved classifiers that will be evaluated.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must contain the ``x_col``, ``y_col`` and ``sentiment`` columns.
    x_col : str
        Column holding the text handed to the vectorizer.
    y_col : str
        Column holding the labels; stored as ``'y_test'`` for every model.
    **kwargs
        ``load_vectorizer`` (required), ``load_svc`` and ``load_nb`` (optional)
        are joblib file paths. Any other keyword is ignored.

    Returns
    -------
    tuple(dict, pandas.DataFrame)
        ``collected_models`` maps ``'linear_svc'`` / ``'multi_nb'`` to a dict
        with the keys ``model``, ``y_test``, ``x_test``, ``x_test_fit`` and
        ``features`` (only for the models whose path was given), and the rows
        of ``tweets_df`` whose ``sentiment`` is positive, negative or neutral,
        concatenated in that order.

    Raises
    ------
    ValueError
        If ``load_vectorizer`` is missing or empty. The previous behaviour was
        a bare ``return`` that made callers fail while unpacking ``None``.
    """
    vector_loc = kwargs.get('load_vectorizer')
    if not vector_loc:
        raise ValueError(
            "extract() needs 'load_vectorizer': the vectorizer associated with the saved models"
        )

    # joblib files are pickles: only load artefacts produced by a trusted run.
    vectorizer = jl_load(vector_loc)

    texts = tweets_df[x_col].reset_index()[x_col]
    y = tweets_df[y_col]

    # Transform text using the vectorizer as it was loaded
    X_test = vectorizer.transform(texts).toarray()
    # NOTE(review): fit_transform is called on the same loaded object, so
    # (presumably, for a scikit-learn vectorizer) `x_test_fit` and the feature
    # names below describe the re-fitted state, not the one `x_test` was built
    # with. Kept as-is; confirm this is intended.
    X_test_fit = vectorizer.fit_transform(texts).toarray()

    # Collect features
    feature_names = vectorizer.get_feature_names_out()

    # Keep the rows with a known sentiment, grouped positive -> negative -> neutral
    agg_sentiment_df = pd.concat(
        [tweets_df[tweets_df["sentiment"] == label]
         for label in ("positive", "negative", "neutral")]
    )

    # Load whichever classifiers were requested. Every entry carries the same
    # keys -- including 'y_test', which `inspect` reads for each model.
    collected_models = {}
    for model_key, path_key in (('linear_svc', 'load_svc'), ('multi_nb', 'load_nb')):
        model_loc = kwargs.get(path_key)
        if model_loc:
            collected_models[model_key] = {
                'model': jl_load(model_loc),
                'y_test': y,
                'x_test': X_test,
                'x_test_fit': X_test_fit,
                'features': feature_names
            }

    return collected_models, agg_sentiment_df

### Define `model` function

In [6]:
def model(models, tweets_df, config_path, **kwargs):
    """Run every loaded model on its test matrix and label the predictions.

    Parameters
    ----------
    models : dict
        ``{model_name: {'model': estimator, 'x_test': features, ...}}``; each
        estimator must expose ``predict``.
    tweets_df : pandas.DataFrame
        Source of the ``clean_tweet`` column copied into each prediction frame.
    config_path : str
        Path to Pipeline 1's JSON config. Its
        ``['sentiment_vals']['value_mapping']`` entry maps sentiment codes
        (JSON string keys, parsed with ``int``) to sentiment labels.
    **kwargs
        Accepted for call-site symmetry; unused.

    Returns
    -------
    tuple(dict, numpy.ndarray)
        ``model_predictions`` maps each model name to a DataFrame with the
        columns ``clean_tweet``, ``sentiment_val`` (predicted code) and
        ``sentiment`` (predicted label). ``mapping_arr`` is the lookup array in
        which the label of code ``c`` sits at index ``c - min(codes)``.
    """
    # Import pipeline 1's config
    with open(config_path, 'r') as f:
        p1_emotion = json.load(f)['sentiment_vals']['value_mapping']

    # Map sentiment encodings
    emot_codes = np.array([int(code) for code in p1_emotion.keys()])
    emot_labels = np.array(list(p1_emotion.values()))

    # Shift codes so that the smallest one lands on index 0. `-min` (rather
    # than `abs(min)`) also works when every code is positive, and sizing the
    # array by the code range tolerates non-contiguous codes.
    map_offset = -int(emot_codes.min())
    mapping_arr = np.zeros(int(emot_codes.max()) + map_offset + 1, dtype=emot_labels.dtype)
    mapping_arr[emot_codes + map_offset] = emot_labels

    model_predictions = {}
    for model_name, model_info in models.items():
        estimator = model_info['model']

        # Generate prediction
        predicted_codes = np.asarray(estimator.predict(model_info['x_test'])).astype(int)

        # Look labels up with the same offset that was used to build the
        # array; indexing with the raw codes would send -1 to the LAST entry.
        predicted_labels = mapping_arr[predicted_codes + map_offset]

        # Build df from predictions. 'sentiment_val' carries the predicted code
        # so that it matches the predicted label next to it (`inspect` reads
        # this column as y_pred).
        model_predictions[model_name] = pd.DataFrame(
            list(zip(tweets_df['clean_tweet'], predicted_codes, predicted_labels)),
            columns=['clean_tweet', 'sentiment_val', 'sentiment'])

    return model_predictions, mapping_arr

### Define `inspect` function
Used to calculate each model's performance

#### **Cluster** model validation
1. Internal validation
    - Typically will combine cohesion (within each cluster) and separation (between different clusters)
    - Computes a validation score for each cluster, then aggregates the scores with weights to produce a final score for the entire model

2. External validation
    - Necessary to have *true* cluster labels
    - Measure the statistical similarity between the *true* cluster labels and the cluster labels assigned by the model

#### **Classification** metrics
1. Classification Accuracy:
    - The ratio of correct predictions to the total number of predictions
    - Popular but flawed (often misused/misinterpreted); there are two criteria to meet for this calculation:
        1. Equal number of observations in all classes
        2. All predictions and prediction errors are equally important
2. Log Loss
    - Evaluates the predictions of probabilities of membership to a given class
    - Can be seen as a measure of confidence for a prediction algorithm
3. Area Under ROC Curve
    - Designed for binary classification problems
4. Confusion Matrix
    - Provides the accuracy of a model which has two or more classes
    - Presents the predictions for each class against the true classes, showing where the model is accurate and where it confuses classes
5. Classification Report
    - `scikit-learn`'s function to summarize a classification model

#### Small evaluation functions to be used by `inspect`

In [7]:

class MetricReports:
    """Namespace of report-style classification metrics.

    Every metric returns ``{'name': <display name>, 'result': <value>}``.
    The methods are static so they work when called on the class
    (``MetricReports.execute(...)``) as well as on an instance.
    """

    @staticmethod
    def getTests():
        """Return the available report functions keyed by their short name."""
        return {
            'cross_val': MetricReports.crossValidation,
            'confusion': MetricReports.confusionMatrix,
            'classification': MetricReports.classificationReport
        }

    @staticmethod
    def execute(model, features, y_test, y_pred, **kwargs):
        """Run every report and return them as ONE flat list of result dicts.

        ``scoring`` and ``kfold`` (if given and not ``None``) are forwarded to
        `crossValidation`; other keywords are ignored. The cross-validation
        entries (one per scorer) come first, followed by the confusion matrix
        and the classification report.
        """
        # Forward by keyword: handing the kwargs dict over positionally made
        # `crossValidation` iterate its KEYS ('scoring', 'kfold') as scorers.
        cv_options = {key: kwargs[key]
                      for key in ('scoring', 'kfold')
                      if kwargs.get(key) is not None}
        return [
            # Unpacked so callers can treat every element as a result dict.
            *MetricReports.crossValidation(model, features, y_pred, **cv_options),
            MetricReports.confusionMatrix(y_test, y_pred),
            MetricReports.classificationReport(y_test, y_pred)
        ]

    ## Metric Functions ##

    @staticmethod
    def crossValidation(model, features, y_pred, scoring=['accuracy'], kfold=5):
        """Cross-validate ``model`` once per scorer.

        ``y_pred`` is passed to ``cross_val_score`` as the target ``y`` and
        ``kfold`` as ``cv``. ``scoring`` may be a list of scorer names or a
        single name. Returns a list with one result dict per scorer, named
        ``'CV Classification - <scorer>'``.
        """
        if isinstance(scoring, str):
            # A bare string would otherwise be iterated character by character.
            scoring = [scoring]
        return [
            {
                'name': f'CV Classification - {score}',
                'result': cross_val_score(estimator=model,
                                          X=features,
                                          y=y_pred,
                                          scoring=score,
                                          cv=kfold)
            }
            for score in scoring
        ]

    @staticmethod
    def confusionMatrix(y_test, y_pred):
        """Confusion matrix of ``y_test`` (true) against ``y_pred``.

        FIXME (original author): this might not be a valid metric... not sure
        if it can handle unlabeled data.
        """
        return {
            'name': 'Confusion Matrix',
            'result': metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
        }

    @staticmethod
    def classificationReport(y_test, y_pred):
        """Classification report (as a dict) of ``y_test`` against ``y_pred``.

        FIXME (original author): this might not be a valid metric... not sure
        if it can handle unlabeled data.
        """
        return {
            'name': 'Classification Report',
            'result': metrics.classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
        }


## Best suited for unsupervised clustering algorithms ##

class MetricScores:
    """Namespace of single-number scores (mostly clustering-quality indices).

    Every metric returns ``{'name': <display name>, 'result': <value>}``.
    The methods are static so they work when called on the class
    (``MetricScores.execute(...)``) as well as on an instance.
    """

    @staticmethod
    def getTests():
        """Return the available score functions keyed by their short name."""
        return {
            'silhouette': MetricScores.silhouetteScore,
            'calinski_harabaz': MetricScores.calinskiHarabaz,
            'dabies_bouldin': MetricScores.dabiesBouldin,
            'mean_acc': MetricScores.meanAccuracy
        }

    @staticmethod
    def execute(model, features, y_pred):
        """Compute all four scores, in the order listed by `getTests`."""
        return [
            MetricScores.silhouetteScore(features, y_pred),
            MetricScores.calinskiHarabaz(features, y_pred),
            MetricScores.dabiesBouldin(features, y_pred),
            MetricScores.meanAccuracy(model, features, y_pred)
        ]

    ## Metric Functions ##

    @staticmethod
    def silhouetteScore(features, y_pred):
        """Silhouette score of ``features`` grouped by the labels ``y_pred``.

        Describes how similar each datapoint is to the other points in its own
        cluster relative to points outside it, aggregated over all datapoints,
        i.e. how distinct the clusters are in space. Bounded between -1 and 1:
        values near -1 suggest incorrect clustering, values near +1 dense,
        well-separated clusters.
        """
        return {
            'name': 'Silhouette Score',
            'result': metrics.silhouette_score(X=features, labels=y_pred)
        }

    @staticmethod
    def calinskiHarabaz(features, y_pred):
        """Calinski-Harabasz index of ``features`` grouped by ``y_pred``.

        A ratio of the dispersion between clusters to the dispersion within
        clusters. The score is not bounded.
        """
        return {
            'name': 'Calinski Harabaz Index',
            'result': metrics.calinski_harabasz_score(X=features, labels=y_pred)
        }

    @staticmethod
    def dabiesBouldin(features, y_pred):
        """Davies-Bouldin index of ``features`` grouped by ``y_pred``.

        The average similarity of each cluster with its most similar cluster,
        where similarity is the ratio of within-cluster to between-cluster
        distances. Clusters that are farther apart and less dispersed score
        better; the minimum is zero and lower is better.
        """
        return {
            'name': 'Davies-Bouldin Index',
            'result': metrics.davies_bouldin_score(X=features, labels=y_pred)
        }

    @staticmethod
    def meanAccuracy(model, features, y_pred):
        """Result of ``model.score(X=features, y=y_pred)``.

        Reported under the name 'Mean Accuracy'; what the number means depends
        on the estimator's own ``score`` implementation.
        """
        return {
            'name': 'Mean Accuracy',
            'result': model.score(X=features, y=y_pred)
        }

In [8]:

def inspect(models, prediction_dfs, tweets_df, **kwargs):
    """Score every model and collect its cross-validation folds.

    Parameters
    ----------
    models : dict
        ``{model_name: {'model': estimator, 'x_test': features, ...}}``. An
        optional ``'y_test'`` entry holds the true labels; when absent,
        ``tweets_df['sentiment_val']`` is used.
    prediction_dfs : dict
        ``{model_name: DataFrame}``; the ``sentiment_val`` column is used as
        ``y_pred``.
    tweets_df : pandas.DataFrame
        Needs the ``sentiment_val`` and ``sentiment`` columns.
    **kwargs
        ``cv_scores`` (list of scorer names, default ``['accuracy']``) and
        ``kfold`` (forwarded to the cross-validation report).

    Returns
    -------
    tuple(dict, dict)
        ``metric_results``: per model, the flat list of report and score
        dicts, plus a ``'sentiment_maps'`` entry. ``cv_dfs``: per model and
        scorer, a DataFrame with ``model_name``, ``fold_idx`` and the score.
    """
    # Set up variables
    cv_scores = kwargs.get('cv_scores', ['accuracy'])
    crossVal_accuracies = {score: [] for score in cv_scores}

    metric_results = {}
    cv_dfs = {name: {} for name in models.keys()}

    ## Begin evaluations

    for model_name, model_info in models.items():
        estimator = model_info['model']
        X_test = model_info['x_test']

        # Not every model entry is guaranteed to carry 'y_test'; fall back to
        # the labelled column of the source frame instead of raising KeyError.
        y_test = model_info['y_test'] if 'y_test' in model_info else tweets_df['sentiment_val']
        # NOTE(review): this assumes the 'sentiment_val' column of the
        # prediction frame holds the model's PREDICTED codes -- confirm against
        # the function that builds `prediction_dfs`.
        y_pred = prediction_dfs[model_name]['sentiment_val']

        raw_reports = MetricReports.execute(model=estimator,
                                            features=X_test,
                                            y_test=y_test,
                                            y_pred=y_pred,
                                            scoring=cv_scores,
                                            kfold=kwargs.get('kfold'))
        # Tolerate a nested list of cross-validation results so that every
        # element handled below is a single result dict.
        metric_reports = []
        for report in raw_reports:
            metric_reports.extend(report if isinstance(report, list) else [report])

        metric_scores = MetricScores.execute(model=estimator,
                                             features=X_test,
                                             y_pred=y_pred)

        metric_results[model_name] = metric_reports + metric_scores

        # Aggregate cross validation results
        for res in metric_reports:
            if 'CV' in res['name']:
                score_name = res['name'].split(' ')[-1]
                indexed_acc = [(model_name, idx, acc) for idx, acc in enumerate(res['result'])]
                crossVal_accuracies.setdefault(score_name, []).append(
                    pd.DataFrame(indexed_acc, columns=['model_name', 'fold_idx', score_name]))

        # Combine cross val results by scoring type.
        # NOTE(review): the buckets accumulate across models, so each model's
        # frame also contains the folds of the models processed before it.
        # Behaviour kept as-is; confirm whether per-model frames were intended.
        for score_type, fold_frames in crossVal_accuracies.items():
            if fold_frames:  # pd.concat raises on an empty list
                cv_dfs[model_name][score_type] = pd.concat(fold_frames)

    # Create a dict of Sentiment_val
    sentiment_id_df = tweets_df[['sentiment_val', 'sentiment']].drop_duplicates().sort_values('sentiment_val')
    sentiment_to_id = dict(sentiment_id_df.values)

    metric_results['sentiment_maps'] = {
        'sentiment_id': sentiment_id_df,
        'sentiment_to': sentiment_to_id
    }

    return metric_results, cv_dfs

### Define `load` function

In [9]:
def load(predictions, results, cv_dfs=[], io_options={}):
    """Placeholder for the load stage: accepts the pipeline outputs and does nothing yet."""
    return None

### Build `pipeline` function using above processes

In [10]:
def pipeline2(pipe1_args, extract_args={}, model_args={}, inspect_args={}, control_params={}):
    """Transform a dataset with Pipeline 1, then evaluate the saved models on it.

    Parameters
    ----------
    pipe1_args : dict
        If it has a ``'load_transform'`` entry, that CSV is read instead of
        running Pipeline 1; otherwise the dict is expanded into ``pipeline1``.
        The path of Pipeline 1's config is read from ``pipe1_args['config']``
        or, failing that, ``pipe1_args['control_params']['config']``.
    extract_args, model_args, inspect_args : dict
        Keyword arguments for ``extract``, ``model`` and ``inspect``. None of
        them is modified.
    control_params : dict
        Passed to ``load`` as ``io_options``; ``'config'`` is the path the
        Pipeline 2 config is written to (default ``'./config.json'``).

    Returns
    -------
    tuple
        ``(p1_sentiment_df, model_predictions, metric_results)``. The last two
        are empty dicts when the transformed frame is empty.

    Raises
    ------
    ValueError
        If ``extract`` yields nothing, or no Pipeline 1 config path is given.
    """
    import os  # local import: only needed to create the config output folder

    # Work on a copy so that neither the caller's dict nor the shared default
    # argument is mutated by the 'sentiment_map' default below.
    extract_args = dict(extract_args)
    if not extract_args.get('sentiment_map'):
        extract_args['sentiment_map'] = { -1: "negative", 0: "neutral", 1: "positive" }

    # Defined up front so the final `return` also works when Part 2 is skipped.
    model_predictions, metric_results = {}, {}

    print('--- Beginning Part 1: transform new dataset using Pipeline 1 ---\n')

    # NOTE(review): only `pipe1_args` is checked here; a 'load_transform' entry
    # placed in `extract_args` is merely forwarded to `extract`. Confirm which
    # dict is meant to carry it.
    if (existing_transform := pipe1_args.get('load_transform')):
        p1_sentiment_df = pd.read_csv(existing_transform)

    else:
        # Can reuse pipeline 1 without building models
        p1_sentiment_df, _, _ = pipeline1(**pipe1_args)

    if not p1_sentiment_df.empty:

        print('Completed Pipelin 1.\n\n--- Beginning Part 2: assess accuracy of Pipeline 1 models ---')
        print('Stage 1: Extracting...')
        config = defaultdict(dict)

        extracted = extract(tweets_df=p1_sentiment_df, **extract_args)
        if extracted is None:
            # `extract` gives up when it has no vectorizer to load.
            raise ValueError("extract() returned nothing: 'load_vectorizer' must be set in extract_args")
        models_dict, sentiment_df = extracted

        print('Completed Stage 1.\n\nStage 2: Transforming...')

        p1_config_path = (pipe1_args.get('config')
                          or (pipe1_args.get('control_params') or {}).get('config'))
        if not p1_config_path:
            raise ValueError("The path of Pipeline 1's config is required: set pipe1_args['config'] "
                             "or pipe1_args['control_params']['config']")

        # `model` names this parameter `config_path`; passing it as `config`
        # left the required argument unfilled.
        model_predictions, mapping_arr = model(models=models_dict,
                                                tweets_df=p1_sentiment_df,
                                                config_path=p1_config_path,
                                                **model_args)

        print('Completed Stage 2.\n\nStage 2.5: Analyzing...')

        metric_results, crossVal_dfs = inspect(models=models_dict,
                                                prediction_dfs=model_predictions,
                                                tweets_df=p1_sentiment_df,
                                                **inspect_args)

        print('Completed Stage 2.5.\n\nStage 3: Loading...')
        # `load` takes its options as one dict; expanding them with ** passed
        # an unexpected 'config' keyword.
        load(predictions=model_predictions, results=metric_results, cv_dfs=crossVal_dfs,
             io_options=control_params)

        # Need to append to Pipeline 1 config...
        config_out = control_params.get('config', './config.json')
        config_dir = os.path.dirname(config_out)
        if config_dir:
            # Don't lose a finished run because the output folder is missing.
            os.makedirs(config_dir, exist_ok=True)
        with open(config_out, 'w') as f:
            json.dump(config, f)

    print('\n<done>')
    return p1_sentiment_df, model_predictions, metric_results

## Execute `pipeline`

In [15]:
# Format parameters. `curr_time` from the first call is passed to every later
# call so that all of them are formatted with the same value.
data_import_params, curr_time = formatParams(DATA_IMPORT_PARAMS)

# Pipeline 1: the control/extract/transform settings are the sections of
# PIPE1_PARAMS. The names PIPE1_EXTRACT_ARGS and PIPE1_CONTROL_PARAMS that were
# referenced here are not defined anywhere in this notebook, so the cell failed
# with NameError after Restart & Run All.
p1_extract_args, _ = formatParams(PIPE1_PARAMS['extract'], curr_time)
# NOTE(review): PIPE1_PARAMS['transform'] was never forwarded before. It is now
# the base and any entry in PIPE1_TRANSFORM_ARGS overrides it -- confirm this
# is the intended source of the transform settings.
p1_transform_args, _ = formatParams({**PIPE1_PARAMS['transform'], **PIPE1_TRANSFORM_ARGS}, curr_time)
p1_model_args, _ = formatParams(PIPE1_MODEL_ARGS, curr_time)
p1_control_params, _ = formatParams(PIPE1_PARAMS['control'], curr_time)

pipe1_args = {
    'import_path': data_import_params['data_path'],
    'import_params': data_import_params,
    'extract_args': p1_extract_args,
    'transform_args': p1_transform_args,
    'model_args': p1_model_args,
    'control_params': p1_control_params
}

# Pipeline 2
p2_extract_args, _ = formatParams(PIPE2_EXTRACT_ARGS, curr_time)
p2_model_args, _ = formatParams(PIPE2_MODEL_ARGS, curr_time)
p2_inspect_args, _ = formatParams(PIPE2_INSPECT_ARGS, curr_time)
p2_control_params, _ = formatParams(PIPE2_CONTROL_PARAMS, curr_time)


ru_sentiment_df, ru_predictions, model_metrics = pipeline2(pipe1_args=pipe1_args,
                                                            extract_args=p2_extract_args,
                                                            model_args=p2_model_args,
                                                            inspect_args=p2_inspect_args,
                                                            control_params=p2_control_params)

--- Beginning Part 1: transform new dataset using Pipeline 1 ---

Stage 1: Extracting...
Downloading russia-vs-ukraine-tweets-datasetdaily-updated.zip to ../../data/russia-vs-ukraine-tweets-datasetdaily-updated


100%|██████████| 2.08M/2.08M [00:00<00:00, 10.3MB/s]





KeyError: "['user_name', 'retweets', 'text'] not in index"

In [None]:
# Show the metrics returned by `pipeline2` (third element of its result).
model_metrics