# ETL of Pipeline 1 Data for Visualization

## `slava_words.csv` embeddings transformation

*Eventually, this should be included in `pipeline_1`.*

In [1]:
# Import dependencies
import pandas as pd
import seaborn as sns

In [6]:
# Path constants: pipeline-1 artefacts are read from PIPE_1_DATA_LOC and the
# visualization inputs are written under VIZ_1_DATA_LOC. Both roots keep their
# trailing slash so file names can be appended directly.
PIPE_1_DATA_LOC = './data/'
# TODO: resolve this from the repository root (dev/) instead of climbing with '../'
VIZ_1_DATA_LOC = '../../../visualizations/part1/data/'

# Word vectors: one input file, two outputs (sentiment distribution + cleaned rows)
WORD_VECS_LOC = f'{PIPE_1_DATA_LOC}embeddings/slava_words.csv'
WORDS_DISTRIB_OUTPUT_LOC = f'{VIZ_1_DATA_LOC}words_sentiment_distrib.csv'
CLEAN_WORDS_OUTPUT_LOC = f'{VIZ_1_DATA_LOC}cleaned_words.csv'

# Transformed tweets: same input/outputs layout as the word vectors
TRANSFORMED_TWEETS_LOC = f'{PIPE_1_DATA_LOC}transformed/slava_sentiment_transform.csv'
TWEETS_DISTRIB_OUTPUT_LOC = f'{VIZ_1_DATA_LOC}slava_tweets_sentiment_distrib.csv'
CLEAN_TWEETS_OUTPUT_LOC = f'{VIZ_1_DATA_LOC}cleaned_slava_tweets.csv'

In [7]:
# Helper function

def formatPipelineOutput(df, drop_cols=None, col_mappings=None):
    """Split a pipeline dataframe into a cleaned frame and a sentiment summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must still contain a ``'sentiment'`` column after
        ``drop_cols`` have been removed.
    drop_cols : list of str, optional
        Columns to remove before output. Defaults to dropping nothing.
    col_mappings : dict, optional
        Rename mapping applied to the summary frame, whose columns start out
        as ``'index'`` (the sentiment label) and ``'sentiment'`` (its number
        of occurrences). When omitted or empty, the mapping
        ``{'index': 'sentiment', 'sentiment': 'count'}`` is used. The renamed
        frame must contain a ``'count'`` column.

    Returns
    -------
    (clean_df, summary_df) : tuple of pandas.DataFrame
        ``clean_df`` is ``df`` without ``drop_cols`` and with missing values
        replaced by ``''``. ``summary_df`` has one row per sentiment value
        with its count and a ``'percent'`` column (share of the total, 0-100).

    Raises
    ------
    KeyError
        If ``'sentiment'`` is missing from the cleaned frame, or if the
        renamed summary has no ``'count'`` column.
    """
    # None sentinels replace the former mutable default arguments ([] / {}),
    # which are shared between calls.
    if drop_cols is None:
        drop_cols = []
    if not col_mappings:
        col_mappings = {'index': 'sentiment', 'sentiment': 'count'}

    # Drop designated columns; missing values become '' (and are therefore
    # counted as their own sentiment value below).
    clean_df = df.drop(columns=drop_cols).fillna('')

    if 'sentiment' not in clean_df.columns:
        raise KeyError("'sentiment' column is required to build the sentiment distribution")

    # Collect sentiment value counts
    counts = clean_df['sentiment'].value_counts()

    # Build the summary with fixed starting column names instead of relying on
    # value_counts().reset_index(): its column names changed in pandas 2.0,
    # which would make a mapping keyed on 'index'/'sentiment' produce
    # duplicate 'count' columns.
    summary_df = pd.DataFrame({
        'index': counts.index.to_numpy(),
        'sentiment': counts.to_numpy(),
    }).rename(columns=col_mappings)

    if 'count' not in summary_df.columns:
        raise KeyError("col_mappings must yield a 'count' column in the summary frame")

    # Build column representing each sentiment value's "part of the whole"
    total_values = summary_df['count'].sum()
    summary_df['percent'] = summary_df['count'] / total_values * 100

    return clean_df, summary_df

In [8]:
# Load the word-vector table from WORD_VECS_LOC and preview the first rows
words_df = pd.read_csv(filepath_or_buffer=WORD_VECS_LOC)
words_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff,sentiment
0,0,slavaukraini,[-0.00593482 -0.02993274 -0.0728035 0.065911...,0,1,14.854185,14.854185,positive
1,1,ukraine,[-2.77568027e-03 -2.84301732e-02 -7.52523988e-...,0,1,16.292625,16.292625,positive
2,2,putin,[-1.96325756e-03 -3.00392564e-02 -6.95326552e-...,0,1,13.2207,13.2207,positive
3,3,ukrainian,[-0.00642006 -0.03554767 -0.07539388 0.072524...,0,1,13.666969,13.666969,positive
4,4,russia,[ 9.40235914e-05 -3.21617350e-02 -7.10022524e-...,1,-1,16.337045,-16.337045,negative


In [9]:
# Build the two frames consumed by the word visualizations:
# the cleaned rows and the per-sentiment distribution.
col_mappings = dict(index='sentiment', sentiment='count')  # summary column renames
drop_cols = [
    'Unnamed: 0',
    'vectors',
    'cluster',
]

clean_words_df, words_distrib_df = formatPipelineOutput(
    words_df,
    drop_cols=drop_cols,
    col_mappings=col_mappings,
)

In [10]:
# Write both word frames into the visualizations data directory.
# NOTE(review): to_csv is called with its defaults, so the dataframe index is
# written as an extra unnamed first column — confirm the visualizations expect it.
for frame, destination in (
    (words_distrib_df, WORDS_DISTRIB_OUTPUT_LOC),
    (clean_words_df, CLEAN_WORDS_OUTPUT_LOC),
):
    frame.to_csv(destination)

## `slava_sentiment_transform.csv` transformation

In [11]:
# Load the sentiment-transformed tweets from TRANSFORMED_TWEETS_LOC and preview the first rows
tweets_df = pd.read_csv(filepath_or_buffer=TRANSFORMED_TWEETS_LOC)
tweets_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,username,retweets,tweet,hashtags,clean_tweet_words,clean_tweet,day,month,sentiment_val,sentiment
0,0,2022-02-28 23:55:31,Rob Smith üá®üá¶ üá∫üá¶,0,@kiraincongress Photos of rally in support of ...,,"['photo', 'rally', 'support', 'ukraine', 'toro...",photo rally support ukraine toronto canada sun...,28,2,-1,negative
1,1,2022-02-28 23:55:21,Frags,0,#Russia propaganda on #Twitter attempting to d...,russia twitter ukraine,"['russia', 'propaganda', 'twitter', 'attempt',...",russia propaganda twitter attempt diminish eve...,28,2,-1,negative
2,4,2022-02-28 23:54:28,Sydfish üá∫üá¶üá∫üá¶üá∫üá¶üá∫üá¶,0,I‚Äôve done more than 100 in St. Petersburg. \n\...,,"['do', 'st', 'petersburg', 'dm', 'would', 'lik...",do st petersburg dm would like blurb russian t...,28,2,-1,negative
3,7,2022-02-28 23:54:06,Jinx Spidox ‚û°Ô∏è Gdakon,0,#SlavaUkraini good night #Ukraine keep up the ...,slavaukraini ukraine,"['slavaukraini', 'good', 'night', 'ukraine', '...",slavaukraini good night ukraine keep good figh...,28,2,-1,negative
4,8,2022-02-28 23:53:58,Jackie Blue üá∫üá∏ üá®üá¶ üá∫üá¶ üåª,0,@anagin40 @NATO It sadly took Ukraine being a ...,,"['sadly', 'take', 'ukraine', 'sacrificial', 'l...",sadly take ukraine sacrificial lamb world wake...,28,2,0,neutral


In [18]:
# Build the two frames consumed by the tweet visualizations:
# the cleaned rows and the per-sentiment distribution.
col_mappings = dict(index='sentiment', sentiment='count')  # summary column renames
drop_cols = [
    'Unnamed: 0',
    'date',
    'tweet',
    'retweets',
    'clean_tweet_words',
]

clean_tweets_df, tweets_distrib_df = formatPipelineOutput(
    tweets_df,
    drop_cols=drop_cols,
    col_mappings=col_mappings,
)

In [19]:
# Write both tweet frames into the visualizations data directory.
# NOTE(review): to_csv is called with its defaults, so the dataframe index is
# written as an extra unnamed first column — confirm the visualizations expect it.
for frame, destination in (
    (tweets_distrib_df, TWEETS_DISTRIB_OUTPUT_LOC),
    (clean_tweets_df, CLEAN_TWEETS_OUTPUT_LOC),
):
    frame.to_csv(destination)