# Twitter Sentiment Analysis Pipeline #1
*Refer to `notebooks/README.md` for an explanation of the various pipelines*

## Import dependencies

In [1]:
# Built-in
import re
import sys
import os
import json
import multiprocessing
from collections import defaultdict

# Importing datasets
import opendatasets as od

# Data manipulation
import pandas as pd
import numpy as np
from joblib import dump, load

# Graphing/Visualizing
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from IPython.display import display

In [2]:
# ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [3]:
# NLP
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Prep nltk library
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# User-defined
sys.path.append("..") # Adds higher directory for ControlSignal
from controlSignal import ControlSignal, CONTROL_ACTIONS, CONTROL_FLAGS, processSignals
from configParser import validateConfig
sys.path.remove('..')

## Define Parameters

In [5]:

# Reference settings for pipeline 1, grouped by stage (EXTRACT / TRANSFORM / MODEL / LOAD).
# NOTE(review): no cell shown in this notebook reads PIPE1_PARAMS directly; the values used at
# run time come from `validateConfig(...)` below — confirm whether this dict is still needed.
# The `{version}` and `{timestamp}` placeholders are presumably filled in by the config
# tooling; that step is not visible here.
PIPE1_PARAMS = {
    'EXTRACT': {
        'data_import_path': 'https://www.kaggle.com/datasets/gpreda/slava-ukraini-tweets/versions/{version}',
        'data_import_version': 15, 
        'data_import_dest': '../../data/slava_ukraini_tweets{version}.csv',
        'new_dataset': False,
    },

    'TRANSFORM': {
        # Regexes are applied in list order, each match being replaced by a space,
        # so the catch-all non-letter pattern must stay last.
        'clean_tweet': [
            r'https?:\/\/[a-zA-Z0-9@:%._\/+~#=?&;-]*', # match URLs
            r'\$[a-zA-Z0-9]*', # match tickers/stock symbols 
            r'\@[a-zA-Z0-9]*', # match usernames/tags
            r'[^a-zA-Z\']' # match any non-letter (except apostrophe)
        ],
        'clean_hashtag': [
            r'\$[a-zA-Z0-9]*', # match tickers/stock symbols 
            r'[^a-zA-Z\']' # match any non-letter (except apostrophe)
        ],
        # Source column name -> name used by the rest of the pipeline
        'column_mappings': {
            'date': 'date',
            'user_name': 'username',
            'retweets': 'retweets',
            'text': 'tweet',
            'hashtags': 'hashtags'
        },
        # Only cleaned tweets containing at least one of these words are kept for sentiment scoring
        'filter_words': [
            'ukraine', 
            'russia', 
            'zelensky'
        ],
        # Integer sentiment value -> human readable label
        'sentiment_map': { 
            -1: "negative",
            0: "neutral", 
            1: "positive"
        },
        'display_terms': 25 # Number of terms to display when user is assigning cluster sentiment
    },

    'MODEL': {
        'build_models': True
    },

    # Output locations for the artifacts produced by a run
    'LOAD': {
        'config': './config/config_{timestamp}.json', # Output the settings used for the execution
        'nb_export': '../pipeline_2/pipeline_1', # Export this entire notebook to be used by another script
        'transform': './data/transformed/slava_ukraine_sentiment_{timestamp}.csv', # Store the transformed dataset including sentiment
        'word_vec': './models/slava_word_vec_{timestamp}.model', # Save the word2vec model
        'kmeans': './models/slava_kmeans_{timestamp}.joblib', # Save the kMeans model
        'embeddings': './data/embeddings/slava_words_{timestamp}.csv', # Save the generated embeddings
        'vectorizer': './models/slava_vectorizer_{timestamp}.joblib', # Save the vectorizer model
        'svc': './models/slava_linearSVC_{timestamp}.joblib', # Save the LinearSVC model
        'nb': './models/slava_multinomialNB_{timestamp}.joblib' # Save the MultinomialNB model
    }
}

## Validate & format config

In [6]:
# Locations of the default and the user-supplied configuration files (relative to this notebook)
DFLT_CONFIG_PATH = './config/default_config.json'
USER_CONFIG_PATH = './config/config.json'

# validateConfig returns the settings (`params`) together with a collection of control signals.
# NOTE(review): presumably the user config is layered over the defaults — confirm in configParser.
params, signals = validateConfig(DFLT_CONFIG_PATH, USER_CONFIG_PATH)
# Act on the collected signals; in the recorded run this aborted the notebook because the
# required `data_import_path` field was missing from the configuration.
processSignals(signals, params['LOAD'])

ERROR: Aborting execution.
Missing a required config value. Cound not find the following fields: data_import_path
<done>


JupyterExit: 

## Build `extract` function

In [None]:

def extract(import_path, local_dest, **kwargs):
    """Read the raw tweet CSV (optionally downloading it first) and load any saved artifacts.

    Parameters
    ----------
    import_path : str
        Dataset URL handed to `opendatasets.download` when `new_dataset` is truthy.
    local_dest : str
        Path of the CSV file to read; for a new download this is where the file is moved to.
    **kwargs
        new_dataset : bool
            Download a fresh copy before reading.
        word_vec, kmeans, embeddings, vectorizer, svc, nb : str
            Optional paths of previously saved artifacts to load.

    Returns
    -------
    (pandas.DataFrame, dict)
        The raw tweets and a dict of the artifacts that were requested and loaded.
    list
        A list of ControlSignal objects instead, when the download step had to abort.
    """
    signals = []

    def _joblib_load(path):
        # Imported lazily under an alias: this notebook later defines its own `load()`,
        # which shadows the `joblib.load` imported at the top of the file.
        # NOTE: joblib files are pickles - only load artifacts from trusted locations.
        from joblib import load as joblib_load
        return joblib_load(path)

    if kwargs.get('new_dataset'):
        # Requesting new data.
        # dirname keeps the root of absolute paths and copes with bare file names.
        dest_dir = os.path.dirname(local_dest) or '.'

        # Check for existing dataset
        if os.path.isfile(local_dest):
            print('Found existing file:', local_dest)
            user_input = input('Remove? (y/n)')
            if user_input != 'y':
                # Keep the old file: store the new download under a numbered sibling name
                existing_files = [f for f in os.listdir(dest_dir) if f.endswith('.csv')]
                file_name = os.path.basename(local_dest)
                ext_index = file_name.find('.')
                if ext_index == -1:
                    ext_index = len(file_name)
                numbered_name = f"{file_name[:ext_index]}_{len(existing_files)}{file_name[ext_index:]}"
                local_dest = os.path.join(dest_dir, numbered_name)
            else:
                try:
                    os.remove(local_dest)
                except OSError as e:
                    signals.append(ControlSignal(CONTROL_ACTIONS.WARNING, CONTROL_FLAGS.INVALID_LOCATION, f'Could not delete existing file. Received error {str(e)}'))

        # Download dataset
        od.download(import_path, data_dir=dest_dir)

        # Collect downloaded dataset (first .csv inside the download directory, if any)
        data_import_dir = os.path.join(dest_dir, import_path.split('/')[-1])
        imported_file = None
        if os.path.isdir(data_import_dir):
            imported_file = next((f for f in os.listdir(data_import_dir) if f.endswith('.csv')), None)

        if not imported_file:
            print('Error importing data. File was either not downloaded or moved')
            signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.INVALID_LOCATION, f'Failed importing data. File was either moved or not downloaded. Searched for .csv file in {data_import_dir}'))
            return signals

        # Move and rename file
        temp_import_loc = os.path.join(data_import_dir, imported_file)
        os.rename(temp_import_loc, local_dest)

        # Remove temporary directory created when downloaded; refuse to continue if it
        # still holds other files, so nothing unexpected is deleted.
        try:
            if os.listdir(data_import_dir):
                signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.INVALID_LOCATION, f'Import directory [{data_import_dir}] already exists! Unable to move new data'))
                return signals
            os.rmdir(data_import_dir)
        except OSError as e:
            print('Could not delete import directory, got' + str(e))
            signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.INVALID_LOCATION, f'Could not delete import directory [{data_import_dir}]. Exiting for safety.'))
            return signals

    # Import data
    raw_tweets_df = pd.read_csv(local_dest)

    # Look for existing models
    existing_models = {}
    if (word_vec_path := kwargs.get('word_vec')):
        existing_models['word_vec'] = Word2Vec.load(word_vec_path).wv
    if (kmeans_path := kwargs.get('kmeans')):
        existing_models['kmeans'] = _joblib_load(kmeans_path)
    if (embeddings_path := kwargs.get('embeddings')):
        existing_models['embeddings'] = pd.read_csv(embeddings_path)
    for artifact in ('vectorizer', 'svc', 'nb'):
        if (artifact_path := kwargs.get(artifact)):
            existing_models[artifact] = _joblib_load(artifact_path)

    return raw_tweets_df, existing_models

## Define helper function for `transform`
Handles user input for cluster sentiment assignment

In [None]:
## Need user input to determine each cluster's sentiment ##

def setClusterSentiment(vectors, model, mapping, display_terms=20):
    """Show sample words for every cluster and let the user label each cluster's sentiment.

    Parameters
    ----------
    vectors : word-vector object providing `similar_by_vector` (e.g. gensim KeyedVectors)
    model : fitted clustering model exposing `cluster_centers_`
    mapping : dict
        Sentiment value -> label; one cluster is expected per entry.
    display_terms : int, optional
        Number of similar words shown per cluster and per batch (falls back to 20 when None/0).

    Returns
    -------
    (signals, cluster_sentiment_defs)
        `signals` is a list of ControlSignal objects (an ABORT signal when the user quits);
        `cluster_sentiment_defs[i]` is the sentiment value chosen for cluster `i`
        (an empty list when the user quits).
    """
    signals = []
    if not display_terms:
        # Callers may forward a missing config value as None
        display_terms = 20
    n_clusters = len(mapping)
    print(f'** Top {display_terms} Similar Word Vectors By Cluster **\n')

    def collectSamples(multiplier=0):
        # For every cluster centre fetch the closest words as (word, similarity) pairs
        word_vec_list = [vectors.similar_by_vector(model.cluster_centers_[x],
                                                   topn=(display_terms * (multiplier+1)),
                                                   restrict_vocab=None) for x in range(n_clusters)]

        # Keep only the newest batch, then transpose so rows = rank and columns = cluster
        cluster_values = np.array(list(zip(*[x[(display_terms * multiplier):] for x in word_vec_list])))
        if cluster_values.size == 0:
            print('No further samples available.')
            return

        # Collect terms spanning multiple clusters for deciphering
        term_freq, counts = np.unique([x[0] for x in np.vstack(cluster_values)], axis=0, return_counts=True)
        unique_terms = term_freq[counts == 1]

        # Separate unique from duplicate terms
        uniq_cluster_vals = defaultdict(lambda : np.full(len(cluster_values), np.nan, dtype=object))
        shared_cluster_vals = defaultdict(lambda : [0] * n_clusters)
        # First two axes are (rank, cluster); the last axis holds the (word, similarity) pair
        for iy, ix in np.ndindex(cluster_values.shape[:2]):
            tmp = cluster_values[iy, ix]
            if tmp[0] in unique_terms:
                uniq_cluster_vals[ix][iy] = tuple(tmp)
            else:
                shared_cluster_vals[tmp[0]][ix] = tmp[1]

        # Visit clusters in index order so every column lines up with its "Cluster i" header,
        # including clusters that have no unique term in this batch.
        per_cluster = [uniq_cluster_vals[ix] for ix in range(n_clusters)]
        max_uniq_in_cluster = max((np.count_nonzero(~pd.isnull(vals)) for vals in per_cluster), default=0)
        formatted_unique = np.array([np.pad(vals[~pd.isnull(vals)],
                                        (0,max_uniq_in_cluster-np.count_nonzero(~pd.isnull(vals))), constant_values=None)
                                        for vals in per_cluster], dtype=object).T

        cols = [f'Cluster {x}' for x in range(n_clusters)]

        print('Unique Terms from Clusters')
        unique_terms_df = pd.DataFrame([[x[0] if x else '' for x in y] for y in formatted_unique], columns=cols)
        display(unique_terms_df)

        print('\nDuplicate Terms from Clusters')
        duplicate_terms_df = pd.DataFrame.from_dict(shared_cluster_vals, orient='index', columns=cols)
        display(duplicate_terms_df)

    collectSamples()

    ## Get input

    print('\nLabel each cluster: -1 = negative, 0 = neutral, 1 = positive ("r" for new samples, "q" to exit)')
    cluster_sentiment_defs = []
    resets = 0
    # The last cluster is inferred from whatever value is left, so only n-1 answers are needed
    while len(cluster_sentiment_defs) < n_clusters - 1:

        user_input = input(f'Cluster {len(cluster_sentiment_defs)} value:')
        if user_input == 'q':
            signals.append(ControlSignal(CONTROL_ACTIONS.ABORT, CONTROL_FLAGS.USER_INPUT))
            return signals, []
        if user_input == 'r':
            print(f'\n\nGenerating next {display_terms} samples...\n')
            resets += 1
            collectSamples(resets)
            print('Current state:', cluster_sentiment_defs)
            print('Setting cluster:', len(cluster_sentiment_defs))
            continue
        try:
            value = int(user_input)
        except ValueError:
            print('Need a number in range [-1, 0, 1]. Press q to exit')
            continue
        # Validate against the mapping itself rather than a hard-coded range
        if value in cluster_sentiment_defs or value not in mapping:
            print('Already used this sentiment or not in range (-1, 0, 1)')
            continue
        cluster_sentiment_defs.append(value)
        print(f'Set cluster {len(cluster_sentiment_defs)-1} to {value} ({mapping[value]})')

    cluster_sentiment_defs.append((set(mapping.keys()) - set(cluster_sentiment_defs)).pop())

    print(f'Set cluster {len(cluster_sentiment_defs)-1} to {cluster_sentiment_defs[-1]} ({mapping[cluster_sentiment_defs[-1]]})')
    return signals, cluster_sentiment_defs

## Build `transform` function

In [None]:
def cleanAndFilter(raw_tweets_df, column_mappings=None, filter_words=None, **kwargs):
    """Normalise the raw tweet frame and optionally keep only tweets mentioning target words.

    Parameters
    ----------
    raw_tweets_df : pandas.DataFrame
        Raw tweets; after `column_mappings` is applied it must provide the columns
        `tweet`, `hashtags` and `date`.
    column_mappings : dict, optional
        Source column -> new name; only the mapped columns are kept. When empty, all
        columns are kept under their original names.
    filter_words : iterable of str, optional
        Words joined with `|` into a regex; only rows whose cleaned tweet matches are kept
        in the second returned frame.
    **kwargs
        clean_tweet : list of regex strings removed from tweets (applied in order).
        clean_hashtag : list of regex strings removed from hashtags (applied in order).

    Returns
    -------
    (tweets_df, filtered_tweets_df)
        The cleaned frame and the keyword-filtered frame (None when no `filter_words`).
    """
    # Rename columns
    tweets_df = raw_tweets_df[list(column_mappings.keys())].rename(columns=column_mappings) \
                    if column_mappings else raw_tweets_df.copy()

    # Drop duplicate tweets
    tweets_df = tweets_df.drop_duplicates(subset='tweet', keep='first')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # A set makes the per-token stop-word check constant time
    stop_words = set(stopwords.words("english"))
    # Order matters: usernames must be stripped before the catch-all non-letter pattern
    # removes the '@' they are recognised by.
    tweet_regexs = kwargs.get('clean_tweet', [r'https?:\/\/[a-zA-Z0-9@:%._\/+~#=?&;-]*',
                                                r'\$[a-zA-Z0-9]*',
                                                r'\@[a-zA-Z0-9]*',
                                                r'[^a-zA-Z\']'])
    def cleanTweet(tweet):
        # Missing tweets arrive as NaN (float); treat them as having no words
        if not isinstance(tweet, str):
            return []
        tweet = tweet.lower()
        for pattern in tweet_regexs:
            tweet = re.sub(pattern, ' ', tweet)
        tweet = ' '.join([w for w in tweet.split() if len(w) > 1])

        # First pass lemmatizes with the default part of speech, second pass as verbs
        trimmed_lemma_words = [lemma.lemmatize(x) for x in nltk.wordpunct_tokenize(tweet)
                                    if x not in stop_words]
        clean_tweet = ' '.join(trimmed_lemma_words)

        return [lemma.lemmatize(x, nltk.corpus.reader.wordnet.VERB)
                    for x in nltk.wordpunct_tokenize(clean_tweet) if x not in stop_words]

    hashtag_regexs = kwargs.get('clean_hashtag', [r'\$[a-zA-Z0-9]*', r'[^a-zA-Z\']'])
    def cleanHashtags(hashtags):
        if hashtags:
            hashtags = hashtags.lower()
            for pattern in hashtag_regexs:
                hashtags = re.sub(pattern, ' ', hashtags)
            hashtags = hashtags.strip()
        return hashtags

    # Clean tweets
    tweets_df['clean_tweet_words'] = tweets_df['tweet'].apply(cleanTweet)
    tweets_df['clean_tweet'] = tweets_df['clean_tweet_words'].apply(' '.join)

    # Clean hashtags (missing values become the string 'nan' through astype(str))
    tweets_df["hashtags"] = tweets_df["hashtags"].astype(str)
    tweets_df["hashtags"] = tweets_df["hashtags"].apply(cleanHashtags)

    # Convert date to datetime and extract month/year
    tweets_df['date'] = pd.to_datetime(tweets_df['date'])
    tweets_df['month'] = tweets_df['date'].dt.month
    tweets_df['year'] = tweets_df['date'].dt.year

    if filter_words:
        # Remove all tweets which do not have the provided target words
        keywords_str = '|'.join(filter_words)
        filtered_tweets_df = tweets_df[tweets_df["clean_tweet"].str.contains(keywords_str)].copy()
        return tweets_df, filtered_tweets_df

    return tweets_df, None

In [None]:
def buildWordVectors(tweets_df, progress_per=50000, epohcs=30, **kwargs):
    """Train a Word2Vec model on the cleaned tweets and return its word vectors.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must contain `clean_tweet_words`, one list of tokens per tweet.
    progress_per : int, optional
        Logging interval passed to the phrase detector and to `build_vocab`.
    epohcs : int, optional
        Number of training epochs (historic spelling, kept so existing callers keep working).
    **kwargs
        epochs : int - correctly spelled alias; takes precedence over `epohcs` when given.
        word_vec_args : dict - keyword arguments for `Word2Vec(...)` replacing the defaults.

    Returns
    -------
    The trained model's `wv` attribute (word vectors).
    """
    if isinstance(progress_per, dict):
        # Tolerate callers that hand over their options dict as the second positional argument
        kwargs = {**progress_per, **kwargs}
        progress_per = 50000
    epochs = kwargs.get('epochs', epohcs)

    # Restructure the `clean_tweet_words` column into a list of token lists
    row_sentences = list(tweets_df["clean_tweet_words"])

    # Detect common phrases (bigrams) from a list of sentences
    phrases = Phrases(row_sentences, min_count=1, progress_per=progress_per)
    bigram = Phraser(phrases)
    sentences = bigram[row_sentences]

    # Initialize vector model
    word_vec_params = kwargs.get('word_vec_args') or dict(
        vector_size=300,
        window=5,
        min_count=4,
        # Leave one core free, but never ask for zero workers on a single-core machine
        workers=max(1, multiprocessing.cpu_count() - 1),
        negative=20,
        sample=1e-5,
        alpha=0.03,
        min_alpha=0.007,
        seed=42)
    word_vec_model = Word2Vec(**word_vec_params)

    # Establish dataset for the vector model
    word_vec_model.build_vocab(sentences, progress_per=progress_per)

    # Train the model
    word_vec_model.train(sentences, total_examples=word_vec_model.corpus_count,
                            epochs=epochs, report_delay=1)

    return word_vec_model.wv

def buildWordEmbeddings(word_vectors, model, sentiment_defs, sentiment_map):
    """Tabulate every vocabulary word with its vector, cluster and sentiment information.

    Columns produced, in order: `words`, `vectors`, `cluster`, `cluster_value`
    (`sentiment_defs[cluster]`), `closeness_score` (1 / distance to the nearest cluster
    centre as reported by `model.transform`), `sentiment_coeff`
    (closeness_score * cluster_value) and `sentiment` (`sentiment_map[cluster_value]`).
    """
    def lookup_vector(word):
        return word_vectors[f'{word}']

    def assign_cluster(vector):
        # predict() works on batches; wrap the single vector and unwrap the single answer
        return model.predict([np.array(vector)])[0]

    def nearness(vector):
        return 1 / (model.transform([vector]).min())

    # One row per vocabulary word
    words_df = pd.DataFrame(word_vectors.index_to_key)
    words_df.columns = ['words']
    words_df['vectors'] = words_df['words'].apply(lookup_vector)
    words_df['cluster'] = words_df['vectors'].apply(assign_cluster)

    # Translate the cluster index into the sentiment value the user assigned to it
    words_df['cluster_value'] = [sentiment_defs[cluster_id] for cluster_id in words_df['cluster']]

    # Weight the sentiment value by how close the word sits to its nearest centre
    words_df['closeness_score'] = words_df['vectors'].apply(nearness)
    words_df['sentiment_coeff'] = words_df['closeness_score'] * words_df['cluster_value']

    # Human readable sentiment label
    words_df['sentiment'] = words_df['cluster_value'].map(sentiment_map)

    return words_df

def peekSentimentDistrib(tweets_df):
    """Display the counts per `sentiment` value and ask the user to approve them.

    Returns True only when the user answers exactly 'y'; any other input yields False.
    """
    print('\nCalculated Sentiment Distribution:')
    sentiment_counts = tweets_df['sentiment'].value_counts()
    display(sentiment_counts)
    answer = input('Distribution okay? (y/n) ')
    return answer == 'y'


In [None]:

def transform(raw_tweets_df, sentiment_map, column_mappings=None,
                                filter_words=None, models=None, **kwargs):
    """Clean the tweets, cluster the vocabulary and label every tweet with a sentiment.

    Parameters
    ----------
    raw_tweets_df : pandas.DataFrame
        Raw tweets as returned by `extract`.
    sentiment_map : dict
        Sentiment value -> label, e.g. {-1: "negative", 0: "neutral", 1: "positive"}.
    column_mappings, filter_words
        Forwarded to `cleanAndFilter`.
    models : dict, optional
        Previously built artifacts to reuse: `word_vec`, `kmeans`, `embeddings`.
    **kwargs
        Forwarded to `cleanAndFilter` and `buildWordVectors`; also read here:
        kmeans_args (dict), display_terms (int), sentiment_threshold (float, default 0.15).

    Returns
    -------
    (tweets_df, words_df, cluster_sentiment_defs)
        Tweets with `sentiment_val`/`sentiment` columns, the word embedding table and the
        sentiment value chosen per cluster. When the user aborts (quits the labelling or
        rejects the distribution) the result is `(empty DataFrame, None, [])` so callers
        that unpack three values and test `.empty` keep working.
    """
    column_mappings = column_mappings or {}
    filter_words = filter_words or []
    models = models or {}
    aborted = (pd.DataFrame(), None, [])

    # Options must be expanded, otherwise the helpers never see e.g. `clean_tweet`
    clean_tweets, filtered_tweets = cleanAndFilter(raw_tweets_df=raw_tweets_df,
                                                    column_mappings=column_mappings,
                                                    filter_words=filter_words,
                                                    **kwargs)
    # Without filter words there is no filtered frame: score every cleaned tweet instead.
    # Work on a copy so adding columns never writes into a slice of another frame.
    target_tweets = (clean_tweets if filtered_tweets is None else filtered_tweets).copy()

    word_vectors = models.get('word_vec')
    if word_vectors is None:
        word_vectors = buildWordVectors(clean_tweets, **kwargs)

    cluster_model = models.get('kmeans')
    if cluster_model is None:
        # Build KMeans model to cluster words into positive, negative, and neutral clusters
        kmeans_params = kwargs.get('kmeans_args') or dict(n_clusters=3, max_iter=1000,
                                                          random_state=42, n_init=50)
        cluster_model = KMeans(**kmeans_params).fit(X=word_vectors.vectors.astype('double'))

    ############# Get Input ###############
    # setClusterSentiment returns (signals, definitions); an empty definition list means "quit"
    _, cluster_sentiment_defs = setClusterSentiment(vectors=word_vectors,
                                                    model=cluster_model,
                                                    mapping=sentiment_map,
                                                    display_terms=kwargs.get('display_terms') or 20)
    if not cluster_sentiment_defs:
        print('\nSentiment assignment aborted by user.')
        return aborted
    print('\nApplying sentiment mapping...')
    #######################################

    # Prefer embeddings supplied with the other models; `kwargs` is checked for older callers.
    # DataFrames have no truth value, hence the explicit type check.
    words_df = models.get('embeddings')
    if not isinstance(words_df, pd.DataFrame):
        words_df = kwargs.get('embeddings')
    if not isinstance(words_df, pd.DataFrame):
        words_df = buildWordEmbeddings(word_vectors=word_vectors,
                                        model=cluster_model,
                                        sentiment_defs=cluster_sentiment_defs,
                                        sentiment_map=sentiment_map)

    # Get the sentiment for the entire tweet
    threshold = kwargs.get('sentiment_threshold', 0.15)
    words_cluster_dict = dict(zip(words_df.words, words_df.cluster_value))
    def getSentiment(tokens):
        # A tweet with no remaining words carries no signal: neutral (and no division by zero)
        if len(tokens) == 0:
            return 0
        avg = sum(int(words_cluster_dict.get(t, 0)) for t in tokens) / len(tokens)
        return -1 if (avg < -threshold) else 1 if (avg > threshold) else 0

    # Add sentiment column (integer values)
    target_tweets["sentiment_val"] = target_tweets["clean_tweet_words"].apply(getSentiment)
    # Map integer sentiment to word value
    target_tweets["sentiment"] = target_tweets["sentiment_val"].map(sentiment_map)

    # Confirm sentiment spread with user
    if not peekSentimentDistrib(target_tweets):
        return aborted
    return target_tweets, words_df, cluster_sentiment_defs

In [None]:
def testModel(model, X_test, y_test):
    """Evaluate a fitted classifier on the held-out data.

    Returns a dict with:
        confusion / conf_mat : confusion matrix (same object under both keys)
        classification       : text report from `classification_report`
        acc_score            : `model.score(X_test, y_test)`
        model                : the evaluated estimator itself
    """
    # Predict
    y_pred = model.predict(X_test)

    # Use every label seen in either the truth or the predictions; taking them from the
    # predictions alone silently drops classes the model never predicts.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

    # Build confusion matrix to evaluate the model results
    confusion = confusion_matrix(y_test, y_pred, labels=labels)

    # Get classification report
    classification = classification_report(y_test, y_pred, labels=labels, zero_division=0)

    # Use score method to get accuracy of model
    acc_score = model.score(X_test, y_test)

    return {
        'confusion': confusion,
        'conf_mat': confusion,  # alias used by the visualization cells
        'classification': classification,
        'acc_score': acc_score,
        'model': model,
    }

## Build `model` function

In [None]:

def model(sentiment_df, test_size=0.2, models=None, **kwargs):
    """Vectorize the cleaned tweets and train/evaluate LinearSVC and MultinomialNB classifiers.

    Parameters
    ----------
    sentiment_df : pandas.DataFrame
        Needs the columns `clean_tweet`, `sentiment` and `sentiment_val`.
    test_size : float, optional
        Fraction held out by each of the two successive splits.
    models : dict, optional
        Previously fitted `vectorizer`, `svc` and/or `nb` to reuse instead of fitting new ones.
    **kwargs
        vectorizer_args : dict - keyword arguments for `TfidfVectorizer(...)`.

    Returns
    -------
    dict with the per-model evaluation (`LinearSVC`, `MultinomialNB`), the fitted objects
    (`vectorizer`, `svc`, `nb`), the feature names (`features`) and the label lookups
    (`sentiment_id`, `sentiment_to`).
    """
    models = models or {}

    # Convert each sentiment to df (no need to worry about memory crash, small dataset)
    pos_df = sentiment_df[sentiment_df["sentiment"]=="positive"]
    neg_df = sentiment_df[sentiment_df["sentiment"]=="negative"]
    neu_df = sentiment_df[sentiment_df["sentiment"]=="neutral"]

    # Combine all sentiments in one df
    agg_sentiment_df = pd.concat([pos_df, neg_df, neu_df])

    # First split: set aside `test_size` of the rows (this held-out part is not used here)
    train_test_df, _ = train_test_split(agg_sentiment_df, test_size=test_size, random_state=10)

    X = train_test_df['clean_tweet']
    y = train_test_df['sentiment_val']

    # Split the dataset set into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # A supplied vectorizer is only applied, never re-fitted: re-fitting would change the
    # vocabulary and invalidate any classifier that was trained with it.
    vectorizer = models.get('vectorizer')
    if vectorizer is not None:
        X_train_tf = vectorizer.transform(X_train)
    else:
        if (vectorizer_params := kwargs.get('vectorizer_args')):
            vectorizer = TfidfVectorizer(**vectorizer_params)
        else:
            vectorizer = TfidfVectorizer(min_df=3,
                                            sublinear_tf=True,
                                            ngram_range=(1,2),
                                            stop_words='english')
        X_train_tf = vectorizer.fit_transform(X_train)
    # The matrices stay sparse; both classifiers accept sparse input
    X_test_tf = vectorizer.transform(X_test)

    # Store features from the vectors
    feature_names = vectorizer.get_feature_names_out()

    # Create a dict of Sentiment_val: sentiments to use with the confusion matrix
    sentiment_id_df = agg_sentiment_df[['sentiment', 'sentiment_val']].drop_duplicates() \
                                                                        .sort_values('sentiment_val')
    sentiment_to_id = dict(sentiment_id_df.values)

    ## LinearSVC ##

    # (NOTE: an existing model must be used with the vectorizer it was fitted with)
    linearSVC = models.get('svc')
    if linearSVC is None:
        linearSVC = LinearSVC()
        linearSVC.fit(X_train_tf, y_train)

    svc_performance = testModel(model=linearSVC, X_test=X_test_tf, y_test=y_test)

    ## MultinomialNB ##

    # (NOTE: an existing model must be used with the vectorizer it was fitted with)
    multiNB = models.get('nb')
    if multiNB is None:
        multiNB = MultinomialNB()
        multiNB.fit(X_train_tf, y_train)

    nb_performance = testModel(model=multiNB, X_test=X_test_tf, y_test=y_test)

    return {
        'LinearSVC': svc_performance,
        'MultinomialNB': nb_performance,
        'features': feature_names,
        'vectorizer': vectorizer,
        'svc': linearSVC,
        'nb': multiNB,
        'sentiment_id': sentiment_id_df,
        'sentiment_to': sentiment_to_id
    }

## Build `load` function

In [None]:
def load(transform_df, transform_dest=None, models=None, model_dest=None, nb_dest=None):
    """Persist the outputs of a pipeline run.

    Parameters
    ----------
    transform_df : pandas.DataFrame
        Sentiment-labelled tweets; written as CSV when `transform_dest` is given.
    transform_dest : str, optional
        CSV path for `transform_df`.
    models : dict, optional
        Artifact name -> object. `word_vec` is stored with its own `.save`, `embeddings`
        with `.to_csv`, everything else with joblib `dump`.
    model_dest : dict, optional
        Artifact name -> output path. Only names present in both dicts (with a non-None
        object) are written.
    nb_dest : str, optional
        Output path (without extension) for the script export of `pipeline_1.ipynb`; the
        execution section and the `# In[..]` cell markers are stripped from the result.
    """
    models = models or {}
    model_dest = model_dest or {}

    # Export the sentiment dataframe
    if transform_dest:
        transform_df.to_csv(transform_dest)

    # Pair up models with their export destination.
    # `is not None` instead of truthiness: a DataFrame cannot be used in a boolean test.
    for name, export_path in model_dest.items():
        artifact = models.get(name)
        if artifact is None:
            continue
        if name == 'word_vec':
            artifact.save(export_path)
        elif name == 'embeddings':
            artifact.to_csv(export_path)
        else:
            dump(artifact, export_path)

    # Save current notebook for import by Pipeline 2
    if nb_dest:
        # Argument list instead of a shell line, so the path is never shell-interpreted
        import subprocess
        subprocess.run(['jupyter', 'nbconvert', '--output', nb_dest,
                        '--to', 'script', 'pipeline_1.ipynb'], check=True)

        script_path = nb_dest + '.py'
        with open(script_path, 'r') as fp:
            lines = fp.readlines()

        # Get rid of excess: everything from the execute header up to the visualization
        # header, plus the cell markers nbconvert inserts.
        cell_markers = []
        execute_start, execute_end = -1, -1
        for i, line in enumerate(lines):
            if '## Execute `pipeline`' in line:
                execute_start = i
            elif '## Visualizations' in line:
                execute_end = i
            elif '# In[' in line:
                cell_markers.append(i)

        excluded = set(range(execute_start, execute_end))
        excluded.update(cell_markers)

        # Rewrite only after the new content is fully computed, so a failure above
        # cannot leave an emptied file behind.
        with open(script_path, 'w') as fp:
            fp.writelines([l for i, l in enumerate(lines) if i not in excluded])

## Build `pipeline` function from above processes

In [None]:
def pipeline1(import_path, import_params=None, extract_args=None, transform_args=None,
                model_args=None, control_params=None):
    """Run extract -> transform -> (model) -> load and return the main results.

    Parameters
    ----------
    import_path : str
        Dataset URL, forwarded to `extract`.
    import_params : dict, optional
        `local_path`: CSV location (defaults to `./<last URL segment>.csv`).
    extract_args, transform_args, model_args : dict, optional
        Extra keyword arguments for the respective stage. The dicts are copied, so the
        caller's objects are never modified.
    control_params : dict, optional
        save_transform (CSV path), build_models (bool), nb_export (script path without
        extension), config (JSON path for the sentiment settings used).

    Returns
    -------
    (transform_tweets_df, word_vecs, model_dict)
        On an aborted run the frame is empty, `word_vecs` may be None and `model_dict` is {}.
    """
    # Copy every option dict: this function adds defaults to them and must not leak
    # those changes into the caller's (or a shared default) dict.
    import_params = dict(import_params or {})
    extract_args = dict(extract_args or {})
    transform_args = dict(transform_args or {})
    model_args = dict(model_args or {})
    control_params = dict(control_params or {})

    if not transform_args.get('sentiment_map'):
        transform_args['sentiment_map'] = { -1: "negative", 0: "neutral", 1: "positive" }

    print('Stage 1: Extracting...')

    config = {}

    extracted = extract(import_path=import_path,
                        local_dest=import_params.get('local_path', f"./{import_path.split('/')[-1]}.csv"),
                        **extract_args)
    if isinstance(extracted, list):
        # extract() hands back its list of control signals instead of data when it aborts
        print(f'Extraction aborted with {len(extracted)} control signal(s).')
        print('\n<done>')
        return pd.DataFrame(), None, {}
    raw_tweets_df, existing_models = extracted

    print('Completed Stage 1.\n\nStage 2: Transforming...')
    transform_args.setdefault('models', existing_models)
    transformed = transform(raw_tweets_df=raw_tweets_df, **transform_args)
    if transformed:
        transform_tweets_df, word_vecs, sentiment_defs = transformed
    else:
        # Nothing came back: treat it as an aborted transformation
        transform_tweets_df, word_vecs, sentiment_defs = pd.DataFrame(), None, []

    model_dict = {}
    if not transform_tweets_df.empty:
        if sentiment_defs:
            config['sentiment_vals'] = {
                'value_mapping': transform_args['sentiment_map'],
                'cluster_mapping': sentiment_defs
            }

        print('Completed Stage 2.')

        if (save_transform := control_params.get('save_transform')):
            transform_tweets_df.to_csv(save_transform)

        if control_params.get('build_models'):
            print('\nStage 3: Modeling...')
            model_args.setdefault('models', existing_models)
            model_dict = model(sentiment_df=transform_tweets_df, **model_args)
            print('Completed Stage 3.')

        if (nb_dest := control_params.get('nb_export')):
            # Only the notebook export is requested here; the frame is passed because
            # load() requires it as its first argument.
            load(transform_tweets_df, nb_dest=nb_dest)

        if (config_loc := control_params.get('config')):
            with open(config_loc, 'w') as f:
                json.dump(config, f)

    else:
        # Process aborted. Clean up...
        print('Cleaning...')
        save_params = ['save_word_vec', 'save_embeddings', 'save_kmeans']
        staged_paths = [transform_args[p] for p in save_params if transform_args.get(p)]
        for path in staged_paths:
            try:
                os.remove(path)
            except OSError as e:
                print('Could not delete file, got' + str(e))

    print('\n<done>')
    return transform_tweets_df, word_vecs, model_dict

## Execute `pipeline`

In [None]:

# Format parameters
# NOTE(review): `formatParams` and the constants DATA_IMPORT_PARAMS, EXTRACT_ARGS,
# TRANSFORM_ARGS, MODEL_ARGS and CONTROL_PARAMS are not defined in any cell shown in this
# notebook (the configuration above lives in `PIPE1_PARAMS` / `params`). Confirm where they
# come from, otherwise this cell fails with a NameError on a fresh kernel.
# The first call yields `curr_time`, which is passed to the remaining calls.
data_import_params, curr_time = formatParams(DATA_IMPORT_PARAMS)
extract_args, _ = formatParams(EXTRACT_ARGS, curr_time)
transform_args, _ = formatParams(TRANSFORM_ARGS, curr_time)
model_args, _ = formatParams(MODEL_ARGS, curr_time)
control_params, _ = formatParams(CONTROL_PARAMS, curr_time)


# Run the whole pipeline; the three results feed the visualization cells below
sentiment_df, word_vecs, model_dict = pipeline1(import_path=data_import_params['data_path'],
                                                import_params=data_import_params, 
                                                extract_args=extract_args,
                                                transform_args=transform_args, 
                                                model_args=model_args, 
                                                control_params=control_params)

## Visualizations

In [None]:
class GraphicProcessors:
    """Namespace of plotting helpers; every helper is a static method called on the class."""

    @staticmethod
    def generateWordcloud(text):
        """Display a word cloud built from an iterable of strings."""
        words = ' '.join(text)
        wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(words)
        plt.figure(figsize=(10, 7))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis('off')
        plt.show()

    # Shorthand functions for commonly used plots
    @staticmethod
    def graphWordDistribution(word_vecs):
        """Pie chart of the `sentiment` column of the word table."""
        return GraphicProcessors.graphDistribution(word_vecs, 'sentiment', 'Sentiment Distribution of Words')

    @staticmethod
    def graphTweetDistribution(tweet_df):
        """Pie chart of the `sentiment` column of the tweet table."""
        return GraphicProcessors.graphDistribution(tweet_df, 'sentiment', 'Sentiment Distribution of Tweets')

    @staticmethod
    def graphDistribution(df, plot_col, title='', fig_size=(7, 7)):
        """Make a pie chart from a dataframe's column distribution.

        Returns a frame with the columns `index` (the distinct values) and `plot_col`
        (their counts).
        """
        fig = plt.gcf()
        fig.set_size_inches(fig_size)
        colors = ["crimson", "steelblue", "mediumseagreen"]

        counts = df[plot_col].value_counts()
        # Name the columns explicitly: the layout of value_counts().reset_index()
        # differs between pandas 1.x and 2.x.
        pie_df = counts.rename_axis('index').reset_index(name=plot_col)

        plt.pie(pie_df[plot_col],
                labels=pie_df["index"],
                radius=2,
                colors=colors,
                autopct="%1.1f%%")

        plt.axis('equal')
        if title:
            plt.title(title, fontsize=20)
        plt.show()
        return pie_df

    @staticmethod
    def graphKeywordCounts(df, plot_col, count_col, keywords):
        """Bar chart of `count_col` for the rows whose `plot_col` matches any keyword (regex OR)."""
        pattern = '|'.join(keywords)
        keyword_sent_df = df[(df[plot_col].str.contains(pattern))]
        sns.countplot(x=keyword_sent_df[count_col])

    # Shorthand functions for commonly used plots
    @staticmethod
    def graphTop10Usernames(tweets_df):
        """Sentiment counts for the ten most frequent usernames."""
        top_users = tweets_df['username'].value_counts().iloc[:10].index
        GraphicProcessors.graphCounts(tweets_df, 'username', 'sentiment',
                                      'Top 10 Highest Tweeting usernames', top_users)

    @staticmethod
    def graphTop10Hashtags(tweets_df):
        """Sentiment counts for the most frequent hashtag strings, skipping the most frequent one.

        NOTE(review): the slice 1:10 yields nine entries; the skipped first entry is
        presumably the 'nan' placeholder of tweets without hashtags - confirm.
        """
        top_tags = tweets_df['hashtags'].value_counts().iloc[1:10].index
        GraphicProcessors.graphCounts(tweets_df, 'hashtags', 'sentiment',
                                      'Top 10 Hashtags', top_tags, (15,10))

    @staticmethod
    def graphCounts(df, x_col, hue_col=None, title='', order=None, plt_size=(10,8)):
        """Count plot of `x_col` (optionally split by `hue_col`) with rotated tick labels."""
        fig, ax = plt.subplots(figsize=plt_size)
        if title:
            ax.set_title(title, fontsize=20)
        chart = sns.countplot(x=x_col,
                                data=df,
                                palette="Set2",
                                hue=hue_col,
                                order=order,
                                ax=ax)

        plt.setp(chart.get_xticklabels(), rotation=30, horizontalalignment='right')

    @staticmethod
    def graphConfusionmatrix(conf_mat, sentiment_id_df):
        """Heatmap of a confusion matrix, labelled from the `sentiment_id_df` lookup table."""
        fig, ax = plt.subplots(figsize=(5,5))
        sns.heatmap(conf_mat,
                    annot=True,
                    fmt='d',
                    xticklabels=sentiment_id_df.sentiment.values,
                    yticklabels=sentiment_id_df.sentiment_val.values,
                    ax=ax)

        ax.set_ylabel('Actual')
        ax.set_xlabel('Predicted')

    @staticmethod
    def graphCoefficients(model, feature_names, top_features=20, fig_size=(15, 5), class_label=1):
        """Bar chart of the strongest negative and positive feature weights of a linear model.

        For multi-class models the coefficient row belonging to `class_label` is used
        (the last row when that label is unknown); binary models have a single row.
        The `top_features` most negative (red) and most positive (blue) weights are shown.
        """
        coef = np.atleast_2d(np.asarray(model.coef_))
        classes = list(getattr(model, 'classes_', []))
        if coef.shape[0] > 1 and class_label in classes:
            weights = coef[classes.index(class_label)]
        else:
            weights = coef[-1]

        feature_names = np.asarray(feature_names)
        weights = weights[:len(feature_names)]

        # Never request more bars than there are features, and keep both halves disjoint
        n_each = min(top_features, len(weights) // 2)
        if n_each == 0:
            print('Not enough features to plot coefficients.')
            return

        # Indices refer to the full weight vector, so bars and labels stay aligned
        ranked = np.argsort(weights)
        top_coefficients = np.hstack([ranked[:n_each], ranked[-n_each:]])

        # create plot
        plt.figure(figsize=fig_size)
        colors = ['red' if c < 0 else 'blue' for c in weights[top_coefficients]]
        positions = np.arange(2 * n_each)

        plt.bar(positions, weights[top_coefficients], color=colors)
        plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
        plt.title("Positive and Negative Labels")
        plt.show()

### Distributions

In [None]:
# Pie chart of the `sentiment` column of the word table returned by pipeline1
GraphicProcessors.graphWordDistribution(word_vecs)

In [None]:
# Pie chart of the `sentiment` column of the labelled tweets
GraphicProcessors.graphTweetDistribution(sentiment_df)

In [None]:
# Sentiment counts for the tweets whose cleaned text contains "russia"
GraphicProcessors.graphKeywordCounts(sentiment_df, 'clean_tweet', 'sentiment', ['russia'])

### Counts

In [None]:
# Sentiment counts for the ten most frequent usernames
GraphicProcessors.graphTop10Usernames(sentiment_df)

In [None]:
# Sentiment counts for the most frequent hashtag strings (the single most frequent one is skipped)
GraphicProcessors.graphTop10Hashtags(sentiment_df)

### Word Clouds

In [None]:
# Cloud for positive sentiment tweets (rows with sentiment_val == 1), built from their cleaned text
GraphicProcessors.generateWordcloud(sentiment_df[sentiment_df['sentiment_val']==1]['clean_tweet'].values)

In [None]:
# Cloud for negative sentiment tweets (rows with sentiment_val == -1), built from their cleaned text
GraphicProcessors.generateWordcloud(sentiment_df[sentiment_df['sentiment_val']==-1]['clean_tweet'].values)

### Confusion Matrices

In [None]:
# testModel() stores the confusion matrix under the 'confusion' key
GraphicProcessors.graphConfusionmatrix(model_dict['LinearSVC']['confusion'], model_dict['sentiment_id'])

In [None]:
# testModel() stores the confusion matrix under the 'confusion' key
GraphicProcessors.graphConfusionmatrix(model_dict['MultinomialNB']['confusion'], model_dict['sentiment_id'])

### Feature Coefficients

In [None]:
# Requires model_dict['LinearSVC'] to carry the fitted estimator under 'model' and
# model_dict['features'] to hold the vectorizer's feature names.
GraphicProcessors.graphCoefficients(model_dict['LinearSVC']['model'], model_dict['features'])