# Twitter Sentiment Analysis Pipeline #2
*Refer to `notebooks/README.md` for an explanation of the various pipelines*

## Import dependencies

In [1]:
# Built-in
import json
from collections import defaultdict
from operator import itemgetter
from functools import reduce 

# Data manipulation
import pandas as pd
import numpy as np
from joblib import load, dump

In [2]:
# ML
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [3]:
import sys

sys.path.append("../pipeline_1")
from modules.pipeline_1 import pipeline1
sys.path.remove("../pipeline_1")

sys.path.append("../utils")
from import_data import importData
from config_parser import buildConfig, mergeDicts
from control_signal import ControlSignal, CONTROL_ACTIONS, CONTROL_FLAGS, processSignals
from grapher import Grapher
sys.path.remove("../utils")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Build Pipeline 2

### Define `extract` function

In [4]:
# Types accepted for a model definition in `extract`: a string (handed to
# `importData` as the import location) or an already-instantiated estimator.
MODEL_DEF_TYPES = [str, LinearSVC, MultinomialNB]
# (attribute name, accessor) pairs; `extract` uses the first pair whose attribute
# exists on a model to read the number of features the model was fitted with.
MODEL_NUM_FEATURES = [
        ('n_features_in_', lambda x: x.n_features_in_)
    ]

def extract(sentiment_dataset=None, slava_vectorizer=None, slava_models=None, 
                x_col='', y_col='', slava_config=None, **kwargs):
    ''' Import and validate every input required by Pipeline 2.

    Parameters
    ----------
    sentiment_dataset : str or pd.DataFrame
        Import location of the sentiment dataset, or the dataframe itself.
    slava_vectorizer : str or TfidfVectorizer
        Import location of the fitted vectorizer produced by Pipeline 1, or the
        vectorizer itself.
    slava_models : dict, list or tuple
        Either `{name: definition}` or a sequence of `(name, definition)` pairs /
        dicts carrying a `name` key. Entries whose name contains 'protocol' are
        skipped (they describe import protocols, not models).
    x_col, y_col : str
        Feature (text) and label column names of the sentiment dataset.
    slava_config : str or dict
        Pipeline 1 configuration (JSON path or dict); must provide
        `[MODEL][sentiment_vals]`.
    **kwargs
        Optional `sentiment_dataset_protocol`, `slava_vectorizer_protocol`,
        `linear_svc` and `multi_nb` (joblib paths of extra models).

    Returns
    -------
    (signals, data) where `signals` is a list of ControlSignal and `data` is None
    on failure, otherwise
    `(sentiment_df, existing_models, model_params, slava_sentiment_vals)`.
    '''
    # Aliased, function-scope import: a later cell defines a module-level `load`
    # (the export stage) which shadows joblib's `load` by the time this runs.
    from joblib import load as joblib_load

    signals = []
    existing_models = {}

    def isMissing(value):
        # Plain truthiness raises for DataFrames/arrays, so test emptiness by length
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            return not value

    if isMissing(sentiment_dataset):
        signals.append(ControlSignal(CONTROL_ACTIONS.WARNING, CONTROL_FLAGS.INVALID_REQUIRED,
                            'Missing sentiment dataset definition.'))
    if isMissing(slava_vectorizer):
        signals.append(ControlSignal(CONTROL_ACTIONS.WARNING, CONTROL_FLAGS.INVALID_REQUIRED,
                            'Missing the slava vectorizer definition produced by Pipeline 1.'))
    if isMissing(slava_models):
        signals.append(ControlSignal(CONTROL_ACTIONS.WARNING, CONTROL_FLAGS.INVALID_REQUIRED,
                            'Missing the slava prediction models definition(s) produced by Pipeline 1.'))
    if signals:
        return signals, None
    if not x_col:
        signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.INVALID_REQUIRED,
                        'Missing feature column definition for the sentiment dataset.'))
    if not y_col:
        signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.INVALID_REQUIRED,
                        'Missing label column definition for the sentiment dataset.'))
    if signals:
        return signals, None


    INVALID, VALID = 0, 1

    def searchParams(keys, param):
        # Walk nested dicts key by key; resolves to None as soon as a level is not a dict
        return reduce(lambda d, k: d.get(k) if isinstance(d, dict) else None, keys, param)

    def validateObject(obj, allowed_types, expected_type=None, obj_protocol=None, required=False, err_signal=None, **kwargs):
        ''' Type-check `obj` and import it. Returns (VALID, object) or (INVALID, None). '''
        candidates = tuple(allowed_types) if isinstance(allowed_types, (list, tuple)) else (allowed_types,)
        if not isinstance(obj, candidates):
            if err_signal:
                signals.append(err_signal)
            return INVALID, None

        if expected_type is not None and not isinstance(obj, str) and isinstance(obj, expected_type):
            # Already the in-memory object (e.g. handed over by Pipeline 1): nothing to import
            extracted_obj = obj
        else:
            obj_signals, extracted_obj = importData(import_loc=obj, import_protocol=obj_protocol,
                                            signals=signals, expected_type=expected_type, required=required, kwargs=kwargs)
            # NOTE(review): `signals` is also passed in above; skip the extend if importData
            # returned that very list, otherwise every signal would be duplicated.
            if obj_signals is not signals:
                signals.extend(obj_signals)

        if extracted_obj is None or (hasattr(extracted_obj, 'size') and extracted_obj.size < 1):
            return INVALID, None

        return VALID, extracted_obj


    # Load sentiment df
    valid_obj, sentiment_df = validateObject(obj=sentiment_dataset, obj_protocol=searchParams(['sentiment_dataset_protocol'], kwargs), 
                                    allowed_types=(str, pd.DataFrame), expected_type=pd.DataFrame, 
                                    err_signal=ControlSignal(CONTROL_ACTIONS.WARNING, CONTROL_FLAGS.MISSING_REQUIRED,
                                        "Missing ground-truth sentiment dataframe which is required."),
                                    kwargs=kwargs)
    if valid_obj == INVALID:
        return signals, None

    missing_cols = [col for col in (x_col, y_col) if col not in sentiment_df.columns]
    if missing_cols:
        signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.INVALID_REQUIRED,
                        f'Sentiment dataset is missing required column(s): {missing_cols}'))
        return signals, None

    # Load vectorizer
    valid_obj, vectorizer = validateObject(obj=slava_vectorizer, obj_protocol=searchParams(['slava_vectorizer_protocol'], kwargs),
                                    allowed_types=(str, TfidfVectorizer), expected_type=TfidfVectorizer, 
                                    err_signal=ControlSignal(CONTROL_ACTIONS.WARNING, CONTROL_FLAGS.MISSING_NECCESSARY, 
                                        'Must provide an existing vectorizer model associated with the sentiment data.'),
                                    kwargs=kwargs)
    if valid_obj == INVALID:
        return signals, None

    num_vectorizer_features = len(vectorizer.idf_)


    # Load Pipeline 1 config
    slava_config_data = {}
    if isinstance(slava_config, str):
        try:
            with open(slava_config, 'r') as f:
                slava_config_data = json.load(f)
        except (OSError, ValueError): # unreadable file or malformed JSON (JSONDecodeError is a ValueError)
            signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.IMPORT_EXCEPTION,
                            f'Could not read Pipeline 1 configuration from {slava_config}'))
            return signals, None
    elif not isinstance(slava_config, dict):
        signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.MISSING_NECCESSARY,
                            (f'Invalid argument type for `slava_config`: {type(slava_config)} - {slava_config}\n' + 
                            'Expected str or dict')))
        return signals, None
    else:
        slava_config_data = slava_config

    if not (model_config := slava_config_data.get('MODEL')) or not model_config.get('sentiment_vals'):
        signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.MISSING_REQUIRED,
                        f"Provided `slava_config` is missing required configuration - [MODEL][sentiment_vals]"))
        return signals, None

    slava_sentiment_vals = model_config['sentiment_vals']


    def validateModel(model_name, model_obj):
        ''' True only if the model exposes a feature count equal to the vectorizer's. '''
        if not model_obj:
            return False
        features_accessor = next((x for x in MODEL_NUM_FEATURES if hasattr(model_obj, x[0])), None)
        if not features_accessor:
            return False
        num_model_features = features_accessor[1](model_obj)
        if num_model_features != num_vectorizer_features: # Model must be associated with the vectorizer!
            signals.append(ControlSignal(CONTROL_ACTIONS.WARNING, CONTROL_FLAGS.FILE_MANAGEMENT,
                                            (f'Mismatched number of features between model [{model_name}] ' +
                                                f'and the provided vectorizer.\n# Model Features: {num_model_features}\n' +
                                                f'# Vectorizer Features: {num_vectorizer_features}')))
            return False
        return True


    # Normalise both accepted shapes of `slava_models` into (name, definition) pairs
    if isinstance(slava_models, dict):
        model_defs = list(slava_models.items())
    elif isinstance(slava_models, (tuple, list)):
        model_defs = []
        for model_info in slava_models:
            if isinstance(model_info, (tuple, list)):
                model_defs.append((model_info[0], model_info[1]))
            else:
                model_defs.append((model_info['name'], { k: v for k, v in model_info.items() if k != 'name' }))
    else:
        model_defs = []

    for model_name, model_def in model_defs:
        if 'protocol' in model_name:
            continue
        # Protocol lookup only resolves for the dict form; sequences yield None
        valid_obj, model = validateObject(obj=model_def, obj_protocol=searchParams([f'{model_name}_protocol'], slava_models), 
                                allowed_types=MODEL_DEF_TYPES, 
                                err_signal=ControlSignal(CONTROL_ACTIONS.INFO, CONTROL_FLAGS.IMPORT_EXCEPTION,
                                    f'Could not import predictive model: {model_def}'))
        if valid_obj == INVALID:
            continue
        if validateModel(model_name, model):
            existing_models[model_name] = model

    if not existing_models:
        signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.MISSING_REQUIRED,
                        'Could not import Slava models which are required for inferences.'))
        return signals, None

    texts = sentiment_df[x_col].reset_index(drop=True)
    y = sentiment_df[y_col]


    # Transform text using the vectorizer exactly as Pipeline 1 fitted it
    X_test = vectorizer.transform(texts).toarray()

    # Collect features *before* refitting so the names line up with the columns of `X_test`
    feature_names = vectorizer.get_feature_names_out()

    # NOTE: `fit_transform` refits the vectorizer in place on this dataset
    X_test_fit = vectorizer.fit_transform(texts).toarray()

    # Optional extra models given as joblib paths (`linear_svc`, `multi_nb`).
    # Stored as bare estimators, like every other entry of `existing_models`.
    # NOTE(review): joblib files are pickles - only load from trusted locations.
    for extra_name in ('linear_svc', 'multi_nb'):
        if (extra_path := kwargs.get(extra_name)):
            existing_models[extra_name] = joblib_load(extra_path)

    model_params = {
        'x_test': X_test,
        'x_test_fit': X_test_fit,
        'y_test': y,
        'features': feature_names
    }

    return signals, (sentiment_df, existing_models, model_params, slava_sentiment_vals)

### Define `model` function

In [5]:
def model(sentiment_df, existing_models, model_params, sentiment_vals, **kwargs):
    ''' Run every existing model on the vectorized test data.

    Parameters
    ----------
    sentiment_df : pd.DataFrame
        Must contain a `clean_tweet` column aligned row-for-row with `model_params['x_test']`.
    existing_models : dict
        `{model_name: estimator}`; each estimator must expose `predict`.
    model_params : dict
        Must contain `x_test`, the feature matrix handed to `predict`.
    sentiment_vals : dict
        Must contain `value_mapping`, mapping integer-like sentiment codes to labels.

    Returns
    -------
    (signals, (model_predictions, mapping_arr)) where `model_predictions` maps each
    model name to a dataframe with columns `clean_tweet`, `pred_sentiment_val`
    (predicted code) and `pred_sentiment` (predicted label), and `mapping_arr[code - min_code]`
    is the label of `code`.
    '''
    signals = []

    # Map sentiment encodings
    p1_sentiment_encoding = sentiment_vals['value_mapping']

    sentiment_codes = np.array([int(x) for x in p1_sentiment_encoding.keys()])
    sentiment_labels = np.array(list(p1_sentiment_encoding.values()))

    # Shift codes so the smallest one lands on index 0; size the lookup table by the
    # code range so that sparse or all-positive codes cannot index out of bounds.
    map_offset = -int(sentiment_codes.min())
    mapping_arr = np.zeros(int(sentiment_codes.max()) + map_offset + 1, dtype=sentiment_labels.dtype)
    mapping_arr[sentiment_codes + map_offset] = sentiment_labels

    X_test = model_params['x_test']
    tweets = sentiment_df['clean_tweet'].to_numpy()

    model_predictions = {}
    for model_name, model_obj in existing_models.items():
        # Generate prediction
        inferences = np.asarray(model_obj.predict(X_test)).astype(int)
        # Apply the same offset used to build the table (a raw -1 would hit the last label)
        predictions_arr = mapping_arr[inferences + map_offset]

        # Build df from predictions: the code column holds the *predicted* codes
        model_predictions[model_name] = pd.DataFrame({
            'clean_tweet': tweets,
            'pred_sentiment_val': inferences,
            'pred_sentiment': predictions_arr
        })

    return signals, (model_predictions, mapping_arr)

### Define `inspect` function
Used to calculate each model's performance

#### **Cluster** model validation
1. Internal validation
    - Typically will combine cohesion (within each cluster) and separation (between different clusters)
    - Compute the validation score of each cluster and then uses weights in the aggregation to produce a final score for the entire model

2. External validation
    - Necessary to have *True* cluster labels
    - Measure the statistical similarity between the *True* cluster labels and the actual values

#### **Classification** metrics
1. Classification Accuracy:
    - The ratio of correct predictions to the total number of predictions
    - Popular but flawed (often misused/misinterpreted); there are two criteria to meet for this calculation:
        1. Equal number of observations in all classes
        2. All predictions and prediction errors are equally important
2. Log Loss
    - Evaluates the predictions of probabilities of membership to a given class
    - Can be seen as a measure of confidence for a prediction algorithm
3. Area Under ROC Curve
    - Designed for binary classification problems
4. Confusion Matrix
    - Provides the accuracy of a model which has two or more classes
    - Presents the predictions in relation to the accuracy outcome
5. Classification Report
    - `scikit-learn`'s function to summarize a classification model

#### Small evaluation functions to be used by `inspect`

In [6]:

class MetricReports:
    ''' Namespace of classification reports. Every function returns `{'name', 'result'}`
    dicts (or a list of them) and is meant to be called on the class itself.
    '''

    @staticmethod
    def getTests():
        ''' Map each report's short key to the function that produces it. '''
        return {
            'cross_val': MetricReports.crossValidation,
            'confusion': MetricReports.confusionMatrix,
            'classification': MetricReports.classificationReport
        }

    @staticmethod
    def execute(model, features, y_test, y_pred, **kwargs):
        ''' Run all reports. `kwargs` (`scoring`, `kfold`) are forwarded to `crossValidation`.

        Returns a 3-item list: the list of cross validation results, the confusion
        matrix result and the classification report result.
        '''
        # NOTE(review): cross validation is scored against `y_pred`, not `y_test` - confirm intended
        return [
            MetricReports.crossValidation(model, features, y_pred, **kwargs),
            MetricReports.confusionMatrix(y_test, y_pred),
            MetricReports.classificationReport(y_test, y_pred)
        ]

    ## Metric Functions ##

    @staticmethod
    def crossValidation(model, features, y_pred, scoring=None, kfold=5):
        ''' Cross validate `model` on `features` against the targets in `y_pred`.

        `scoring` may be a single scorer name or a list of them (default `['accuracy']`);
        `kfold=None` falls back to 5 folds. Returns one result dict per scorer.
        '''
        if not scoring:
            scoring = ['accuracy']
        elif isinstance(scoring, str):
            scoring = [scoring] # a bare string would otherwise be iterated character by character
        if kfold is None:
            kfold = 5

        return [
            {
                'name': f'CV Classification - {score}',
                'result': cross_val_score(estimator=model,
                                            X=features, 
                                            y=y_pred, 
                                            scoring=score, 
                                            cv=kfold)
            }
            for score in scoring
        ]

    @staticmethod
    def confusionMatrix(y_test, y_pred):
        ''' Confusion matrix of `y_pred` against `y_test`.
        FIXME: This might not be a valid metric... not sure if it can handle unlabeled data
        '''
        return {
            'name': 'Confusion Matrix',
            'result': metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
        }

    @staticmethod
    def classificationReport(y_test, y_pred):
        ''' scikit-learn classification report (as a dict) of `y_pred` against `y_test`.
        FIXME: This might not be a valid metric... not sure if it can handle unlabeled data
        '''
        return {
            'name': 'Classification Report',
            'result': metrics.classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
        }


## Best suited for unsupervised clustering algorithms ##

class MetricScores:
    ''' Namespace of scalar scores. Every function returns a `{'name', 'result'}` dict
    and is meant to be called on the class itself.
    '''

    @staticmethod
    def getTests():
        ''' Map each score's short key to the function that produces it. '''
        return {
            'silhouette': MetricScores.silhouetteScore,
            'calinski_harabaz': MetricScores.calinskiHarabaz,
            'dabies_bouldin': MetricScores.dabiesBouldin,
            'mean_acc': MetricScores.meanAccuracy
        }

    @staticmethod
    def execute(model, features, y_pred):
        ''' Run all scores, in the same order as `getTests`, and return them as a list. '''
        return [
            MetricScores.silhouetteScore(features, y_pred),
            MetricScores.calinskiHarabaz(features, y_pred),
            MetricScores.dabiesBouldin(features, y_pred),
            MetricScores.meanAccuracy(model, features, y_pred)
        ]

    ## Metric Functions ##

    @staticmethod
    def silhouetteScore(features, y_pred):
        ''' Attempts to describe how similar a datapoint is to other datapoints in its cluster, 
        relative to datapoints not in its cluster (aggregated over all datapoints to get the score for 
        an overall clustering). It evaluates how 'distinct' the clusters are in space.
        It's bounded between -1 and 1. Closer to -1 suggests incorrect clustering, while 
        closer to +1 shows that each cluster is very dense.
        '''
        return {
            'name': 'Silhouette Score',
            'result': metrics.silhouette_score(X=features, labels=y_pred)
        }

    @staticmethod
    def calinskiHarabaz(features, y_pred):
        ''' A ratio of the variance of a datapoint compared to points in other clusters, 
        against the variance compared to points within its cluster. This score is not bounded.
        '''
        return {
            'name': 'Calinski Harabaz Index',
            'result': metrics.calinski_harabasz_score(X=features, labels=y_pred)
        }

    @staticmethod
    def dabiesBouldin(features, y_pred):
        ''' The average similarity measure of each cluster with its most similar cluster, 
        where similarity is the ratio of within-cluster distances to between-cluster distances. 
        Thus, clusters which are farther apart and less dispersed will result in a better score.
        The minimum score is zero, with lower values indicating better clustering.
        '''
        return {
            'name': 'Davies-Bouldin Index',
            'result': metrics.davies_bouldin_score(X=features, labels=y_pred)
        }

    @staticmethod
    def meanAccuracy(model, features, y_pred):
        ''' Result of `model.score(features, y_pred)`.
        NOTE(review): when `y_pred` is this model's own prediction on `features` the
        score is trivially perfect - confirm which labels callers should pass.
        '''
        return {
            'name': 'Mean Accuracy',
            'result': model.score(X=features, y=y_pred)
        }

### Define `evaluate` function

In [7]:

def evaluate(sentiment_df, prediction_dfs, existing_models, model_params, **kwargs):
    ''' Score every model's predictions.

    Parameters
    ----------
    sentiment_df : pd.DataFrame
        Must contain `sentiment_val` and `sentiment` columns (used for the code/label maps).
    prediction_dfs : dict
        `{model_name: dataframe}`; each dataframe must contain `pred_sentiment_val`.
    existing_models : dict
        `{model_name: estimator}`.
    model_params : dict
        Must contain `y_test` and `x_test`.
    **kwargs
        Optional `cv_scores` (list of scorer names, default `['accuracy']`) and `kfold`.

    Returns
    -------
    (signals, (metric_results, cv_dfs, sentiment_maps)) where `metric_results[model]`
    is the list of report/score results and `cv_dfs[model][score]` is that model's
    per-fold cross validation dataframe.
    '''
    signals = []

    cv_scores = kwargs.get('cv_scores', ['accuracy'])
    y_test = model_params['y_test']
    X_test = model_params['x_test']

    metric_results = {}
    cv_dfs = {}

    ## Begin evaluations
    for model_name, model_obj in existing_models.items():
        y_pred = prediction_dfs[model_name]['pred_sentiment_val']

        metric_reports = MetricReports.execute(model=model_obj, 
                                                features=X_test, 
                                                y_test=y_test, 
                                                y_pred=y_pred, 
                                                scoring=cv_scores, 
                                                kfold=kwargs.get('kfold'))
        metric_scores = MetricScores.execute(model=model_obj, 
                                                features=X_test, 
                                                y_pred=y_pred)

        metric_results[model_name] = metric_reports + metric_scores

        # Aggregate cross validation results. The accumulator is created per model so
        # one model's frames never contain the folds of models evaluated before it.
        folds_by_score = defaultdict(list)
        for results in metric_reports:
            entries = results if isinstance(results, (list, tuple)) else [results]
            for res in entries:
                if 'CV' not in res['name']:
                    continue
                score_name = res['name'].split(' ')[-1]
                indexed_acc = [(model_name, idx, acc) for idx, acc in enumerate(res['result'])]
                folds_by_score[score_name].append(
                    pd.DataFrame(indexed_acc, columns=['model_name', 'fold_idx', score_name]))

        # Combine cross val results by scoring type
        cv_dfs[model_name] = { score_type: pd.concat(frames) for score_type, frames in folds_by_score.items() }

    # Unique (sentiment_val, sentiment) pairs, ordered by sentiment_val
    sentiment_id_df = sentiment_df[['sentiment_val', 'sentiment']].drop_duplicates().sort_values('sentiment_val')
    # Each row is [sentiment_val, sentiment], so this dict maps code -> label
    sentiment_to_id = dict(sentiment_id_df.values)

    sentiment_maps = {
        'sentiment_id': sentiment_id_df.to_json(),
        'sentiment_to': sentiment_to_id
    }

    return signals, (metric_results, cv_dfs, sentiment_maps)

### Define `load` function

In [8]:
def load(prediction_dfs, results=None, cv_dfs=None, destinations=None, exec_config=None):
    ''' Export the pipeline's artifacts to the configured destinations.

    NOTE: this name shadows joblib's `load` imported at the top of the notebook.

    Parameters
    ----------
    prediction_dfs : dict
        `{name: DataFrame}`, written as CSV to `destinations['predictions'][name]`.
    results : dict, optional
        `{name: object}`, dumped with joblib to `destinations['metrics'][name]`.
    cv_dfs : dict, optional
        `{name: {score: DataFrame}}`, concatenated per name (keyed by score) and written
        as CSV to `destinations['cross_validations'][name]`.
    destinations : dict, optional
        Export locations; may also hold `config` (JSON path for `exec_config`) and
        `notebook` (output name, without extension, for the exported script).
    exec_config : optional
        JSON-serialisable run configuration.

    Returns
    -------
    list of signals (always empty here).
    '''
    import subprocess # function-scope: only needed for the optional notebook export

    signals = []
    results = results or {}
    cv_dfs = cv_dfs or {}
    destinations = destinations or {}

    # Export the execution config
    if (config_loc := destinations.get('config')):
        with open(config_loc, 'w') as f:
            json.dump(exec_config, f)

    def loadGroup(group_name, group_data):
        ''' Write every object of `group_data` that has a destination in this group. '''
        if group_name not in destinations:
            return
        for name, path in destinations[group_name].items():
            export_obj = group_data.get(name)
            if export_obj is None:
                continue
            if isinstance(export_obj, pd.DataFrame): # Expected origin: `prediction_dfs`
                export_obj.to_csv(path)
            elif isinstance(export_obj, dict): # Expected origin: `cv_dfs`
                if not export_obj:
                    continue # nothing to concatenate
                aggregated_obj = pd.concat(list(export_obj.values()), keys=list(export_obj.keys()))
                aggregated_obj.to_csv(path)
            else:
                dump(export_obj, path) # Expected origin: `results` or `built_models`

    for name, data in zip(['predictions', 'metrics', 'cross_validations'], 
                            [prediction_dfs, results, cv_dfs]):
        if data:
            loadGroup(name, data)

    # Save current notebook for import
    if (notebook_dest := destinations.get('notebook')):
        # Argument list instead of a shell line: no quoting issues with the destination path.
        # NOTE(review): this converts `pipeline_1.ipynb`; confirm that is the notebook
        # meant to be exported from the Pipeline 2 notebook.
        subprocess.run(['jupyter', 'nbconvert', '--output', notebook_dest,
                        '--to', 'script', 'pipeline_1.ipynb'], check=False)

        # Get rid of excess: cut everything from the "Execute `pipeline`" section on
        # and drop the `# In[..]` cell markers before it.
        with open(notebook_dest + '.py', 'r+') as fp:
            lines = fp.readlines()
            # Keep the whole script when the section marker is absent
            term_index = next((i for i, line in enumerate(lines) if '# Execute `pipeline`' in line),
                                len(lines))
            kept_lines = [line for line in lines[:term_index] if '# In[' not in line]
            fp.seek(0)
            fp.truncate()
            fp.writelines(kept_lines)
    return signals

### Build `pipeline` function using above processes

In [9]:
def pipeline2(default_configs, user_configs=None, extract_args={}, model_args={}, evaluate_args={}, load_args={}, log_level=None, **kwargs):
    ''' Run Pipeline 2 end to end: extract -> model -> evaluate -> load.

    Parameters
    ----------
    default_configs, user_configs : config or list/tuple of configs
        When a sequence is given, its first item is Pipeline 2's own configuration;
        the whole value is also parsed for the nested `PIPE_1` parameters.
    extract_args, model_args, evaluate_args, load_args : dict
        Per-stage overrides forwarded to `buildConfig`.
    log_level :
        Forwarded to every `processSignals` call.

    Returns
    -------
    (model_predictions, metric_results, cross_val_dfs)

    Raises
    ------
    RuntimeError
        If the required data cannot be extracted, even after falling back to Pipeline 1.
    '''
    def firstConfig(configs):
        # Only real sequences are indexed: a path string also has `__iter__`, and
        # indexing it would silently yield its first character.
        if isinstance(configs, (list, tuple)):
            return configs[0] if configs else None
        return configs

    parsing_signals, valid_p1_params = buildConfig(dflt_configs=default_configs, usr_configs=user_configs,
                                                    extract_config=extract_args, evaluate_config=evaluate_args,
                                                    model_config=model_args, load_config=load_args,
                                                    nested_params='PIPE_1')
    processSignals(signals=parsing_signals, log_level=log_level) # Process error/info signals

    p2_user_config = firstConfig(user_configs)
    p2_dflt_config = firstConfig(default_configs)
    parsing_signals, valid_p2_params = buildConfig(dflt_configs=p2_dflt_config, usr_configs=p2_user_config, 
                                                    extract_config=extract_args, evaluate_config=evaluate_args,
                                                    model_config=model_args, load_config=load_args,
                                                    excluded_params='PIPE_1')
    # Use the caller's log level, not a notebook-global that may not exist
    processSignals(signals=parsing_signals, log_level=log_level)

    pipeline_stages = ['EXTRACT', 'MODEL', 'EVALUATE', 'LOAD']
    p2_extract_params, p2_model_params, p2_evaluate_params, p2_load_params = itemgetter(*pipeline_stages)(valid_p2_params)

    # Store run-specific information
    execution_config = defaultdict(dict)
    print('\n--- Executing Pipeline 2 ---\n')

    ## Extract (import)
    print('[Pipeline 2] Stage 1: Extracting...')
    extract_signals, extracted_data = extract(**p2_extract_params)
    processSignals(signals=extract_signals, generated_files=p2_load_params, log_level=log_level) # Process error/info signals

    if not extracted_data:
        print('\n*** Aborting Pipeline 2 ***')
        print('Will attempt to use Pipeline 1 to produce required data.\n')

        # Transform dataset using Pipeline 1 to produce a sentiment distribution of the data
        sentiment_df, word_vecs, models = pipeline1(default_configs=valid_p1_params)

        # Keys supplied explicitly below must not be forwarded again through `**`
        overridden = ('sentiment_dataset', 'slava_vectorizer', 'slava_models', 'vectorizer')
        extract_signals, extracted_data = extract(sentiment_dataset=sentiment_df, slava_vectorizer=models.get('vectorizer'),
                                                    slava_models={ k: v for k, v in models.items() if k != 'vectorizer' },
                                                    **{ k: v for k, v in p2_extract_params.items() if k not in overridden })
        processSignals(signals=extract_signals, generated_files=p2_load_params, log_level=log_level) # Process error/info signals

        if not extracted_data:
            raise RuntimeError('Pipeline 2 could not extract the required data, even after running Pipeline 1.')

    sentiment_df, existing_models, model_params, slava_sentiment_vals = extracted_data
    print('[Pipeline 2] Completed Stage 1.', end='\n\n')

    ## Model
    print('[Pipeline 2] Stage 2: Modeling...')
    model_signals, model_data = model(sentiment_df=sentiment_df, existing_models=existing_models,
                                        model_params=model_params, sentiment_vals=slava_sentiment_vals,
                                        **p2_model_params)
    processSignals(signals=model_signals, generated_files=p2_load_params, log_level=log_level) # Process error/info signals
    model_predictions, mapping_arr = model_data
    execution_config['mapping_array'] = mapping_arr.tolist()
    print('[Pipeline 2] Completed Stage 2.', end='\n\n')

    ## Evaluation
    print('[Pipeline 2] Stage 3: Evaluating...')
    evaluation_signals, evaluation_data = evaluate(sentiment_df=sentiment_df, prediction_dfs=model_predictions,
                                                    existing_models=existing_models, model_params=model_params,
                                                    **p2_evaluate_params)
    processSignals(signals=evaluation_signals, generated_files=p2_load_params, log_level=log_level) # Process error/info signals
    metric_results, cross_val_dfs, sentiment_maps = evaluation_data
    execution_config['sentiment_maps'] = sentiment_maps
    print('[Pipeline 2] Completed Stage 3.', end='\n\n')

    # Record the full (Pipeline 1 + Pipeline 2) configuration alongside the run artifacts
    valid_p2_params['PIPE_1'] = valid_p1_params
    mergeDicts(execution_config, valid_p2_params)

    ## Loading (export)
    print('[Pipeline 2] Stage 4: Loading...')
    load_signals = load(prediction_dfs=model_predictions, results=metric_results, 
                        cv_dfs=cross_val_dfs, destinations=p2_load_params, exec_config=execution_config)
    processSignals(signals=load_signals, generated_files=p2_load_params, log_level=log_level) # Process error/info signals
    print('[Pipeline 2] Completed Stage 4.', end='\n\n')
    print('[Pipeline 2] <done>')

    return model_predictions, metric_results, cross_val_dfs
    

## Execute `pipeline`

In [13]:
# User-supplied configuration files for each pipeline (paths relative to this notebook)
PIPE1_USER_CONFIG = '../pipeline_1/config/user_config.json'
PIPE2_USER_CONFIG = './config/user_config.json'

# Default configuration files for each pipeline
PIPE1_DFLT_CONFIG = '../pipeline_1/config/default_config.json'
PIPE2_DFLT_CONFIG= './config/default_config.json'
# Log level handed to `pipeline2` (and from there to `processSignals`)
LOG_LEVEL = CONTROL_ACTIONS.WARNING

# Pipeline 2's config comes first in each list: `pipeline2` takes item [0] as its own
# configuration and parses the full list for the nested Pipeline 1 parameters.
model_predictions, metric_results, cross_val_dfs = pipeline2(default_configs=[PIPE2_DFLT_CONFIG, PIPE1_DFLT_CONFIG],
                                                            user_configs=[PIPE2_USER_CONFIG, PIPE1_USER_CONFIG],
                                                            log_level=LOG_LEVEL)


--- Executing Pipeline 2 ---

[Pipeline 2] Stage 1: Extracting...

*** Aborting Pipeline 2 ***
Will attempt to use Pipeline 1 to produce required data.


--- Executing Pipeline 1. ---

Stage 1: Extracting...
Importing data from "../../data/russia_vs_ukraine_tweets.csv"...
Completed Stage 1.

Stage 2: Transforming...
** Top 25 Similar Word Vectors By Cluster **

Unique Terms from Clusters


Unnamed: 0,Cluster 0,Cluster 1,Cluster 2
0,arm_force,remind,sport
1,year_ago,highly,australian
2,appear,link,acknowledge
3,,,addition
4,,,align
5,,,alternative
6,,,transform
7,,,behalf



Duplicate Terms from Clusters


Unnamed: 0_level_0,Cluster 0 (baseline),Cluster 1 relative to Cluster 0,Cluster 2 relative to Cluster 0
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
argentina,0.0,-1.0,-1.0
former,0.999864,-2.384508e-07,2.3547e-06
equation,0.999864,-4.769017e-07,2.771988e-06
economic,0.999864,-1.192254e-07,0.9999961
fee,0.999865,-8.345779e-07,0.9999961
none,0.999865,1.0,2.950826e-06
tactic,0.999865,4.172889e-07,0.9999964
embarrass,0.999865,-5.365144e-07,4.143079e-06
surely,0.999865,1.192254e-07,0.9999968
significant,0.999866,2.324896e-06,0.9999977



Label each cluster: -1 = negative, 0 = neutral, 1 = positive ("r" for new samples, "q" to exit)


Generating next 25 samples...

Unique Terms from Clusters


Unnamed: 0,Cluster 0,Cluster 1,Cluster 2
0,align,impoverish,hello
1,allegedly,none,peace_talk
2,argentina,,insane
3,remind,,elon_musk
4,,,bunker
5,,,chain
6,,,see_happen
7,,,neocon
8,,,ball
9,,,independent_country



Duplicate Terms from Clusters


Unnamed: 0_level_0,Cluster 0 (baseline),Cluster 1 relative to Cluster 0,Cluster 2 relative to Cluster 0
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
throughout,0.0,-0.9999985,-0.9999998
arm_force,0.0,-0.9999999,-0.9999999
appear,0.0,-1.0,-1.0
year_ago,0.0,-0.9999999,-0.9999997
kremlin,0.999861,-2.682575e-07,9.538081e-07
reform,0.999862,-3.87483e-07,1.848003e-06
accord,0.999862,8.941915e-08,0.9999977
compliance,0.999862,-2.086447e-07,1.013421e-06
deserve,0.999862,-5.067085e-07,0.9999979
australian,0.999862,1.490319e-07,0.9999979


Current state: []
Setting cluster: 0
Set cluster 0 to 0 (neutral)
Set cluster 1 to -1 (negative)
Set cluster 2 to 1(positive)

Applying sentiment mapping...

Calculated Sentiment Distribution:


neutral     8177
positive    1188
negative     547
Name: sentiment, dtype: int64

[ERROR] Aborting execution - User-caused. Distribution was unsatisfactory.

Terminating Process...


JupyterExit: 

In [None]:
# Scratch cell: list scikit-learn's scorer names (candidate values for the
# `scoring` argument that `MetricReports.crossValidation` passes to `cross_val_score`).
from sklearn.metrics import get_scorer_names
get_scorer_names()